diff --git a/Makefile.am b/Makefile.am
index f777f20c02953a750e90ac8e7767e82377d52cff..e46a9e3fbcd6bd0a4d675a31801856f0afde39e5 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -4,8 +4,8 @@ SUBDIRS =
 DIST_SUBDIRS =
 
 if BUILD_S_OMP
-SUBDIRS += sPuReMD
-DIST_SUBDIRS += sPuReMD
+SUBDIRS += PuReMD-GPU
+DIST_SUBDIRS += PuReMD-GPU
 endif
 if BUILD_MPI
 SUBDIRS += PuReMD
diff --git a/PuReMD-GPU/Makefile.am b/PuReMD-GPU/Makefile.am
index 3ab7bdba493aa8a5d56c64a100239eeff99d4563..f57bed636093bf0d5a822a4c77d5861524705e26 100644
--- a/PuReMD-GPU/Makefile.am
+++ b/PuReMD-GPU/Makefile.am
@@ -1,7 +1,9 @@
 ACLOCAL_AMFLAGS = -I ../m4
 
+if USE_CUDA
 SUFFIXES = .cu
 include ../cuda.am
+endif
 
 AM_CFLAGS = -Wall -O3 -funroll-loops -fstrict-aliasing -m64
 AM_CPPFLAGS =
@@ -18,21 +20,35 @@ NVCCFLAGS += --compiler-options "$(DEFS) -D__SM_35__ -O3 -funroll-loops -fstrict
 #NVCCFLAGS += -Xcompiler -fPIC -dc
 #NVCCFLAGS += --ptxas-options -v
 
-bin_PROGRAMS = bin/puremd-gpu
-bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c \
-	src/restart.c src/param.c src/pdb_tools.c src/box.c \
-	src/lin_alg.c src/QEq.c src/allocate.c src/bond_orders.c \
+bin_PROGRAMS = bin/spuremd
+bin_spuremd_SOURCES = src/analyze.c src/print_utils.c \
+	src/restart.c src/tool_box.c src/control.c src/ffield.c \
+	src/geo_tools.c src/box.c \
+	src/lin_alg.c src/qeq.c src/allocate.c src/bond_orders.c \
 	src/forces.c src/four_body_interactions.c \
 	src/grid.c src/init_md.c src/integrate.c src/list.c \
 	src/lookup.c src/neighbors.c \
 	src/reset_utils.c src/single_body_interactions.c \
 	src/system_props.c src/three_body_interactions.c \
 	src/traj.c src/two_body_interactions.c src/vector.c \
-	src/testmd.c \
-	src/cuda_utils.cu src/cuda_copy.cu src/cuda_init.cu src/cuda_reduction.cu \
-	src/cuda_center_mass.cu src/cuda_box.cu src/validation.cu \
+	src/testmd.c
+include_HEADERS = src/mytypes.h src/analyze.h src/print_utils.h \
+	src/restart.h src/tool_box.h src/control.h src/ffield.h \
+	src/geo_tools.h src/box.h \
+	src/lin_alg.h src/qeq.h src/allocate.h src/bond_orders.h \
+	src/forces.h src/four_body_interactions.h \
+	src/grid.h src/init_md.h src/integrate.h src/list.h \
+	src/lookup.h src/neighbors.h \
+	src/reset_utils.h src/single_body_interactions.h \
+	src/system_props.h src/three_body_interactions.h \
+	src/traj.h src/two_body_interactions.h src/vector.h
+
+if USE_CUDA
+bin_spuremd_SOURCES += src/cuda_utils.cu src/cuda_copy.cu \
+	src/cuda_init.cu src/cuda_reduction.cu \
+	src/cuda_center_mass.cu src/cuda_box.cu src/cuda_validation.cu \
         src/cuda_allocate.cu src/cuda_bond_orders.cu \
-	src/cuda_lin_alg.cu src/cuda_QEq.cu \
+	src/cuda_lin_alg.cu src/cuda_qeq.cu \
         src/cuda_forces.cu src/cuda_four_body_interactions.cu \
 	src/cuda_grid.cu src/cuda_init_md.cu src/cuda_integrate.cu src/cuda_list.cu \
 	src/cuda_lookup.cu src/cuda_neighbors.cu \
@@ -40,19 +56,11 @@ bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c \
         src/cuda_system_props.cu src/cuda_three_body_interactions.cu \
 	src/cuda_two_body_interactions.cu src/cuda_environment.cu \
 	src/cuda_post_evolve.cu
-include_HEADERS = src/mytypes.h src/analyze.h src/print_utils.h \
-        src/restart.h src/param.h src/pdb_tools.h src/box.h \
-	src/lin_alg.h src/QEq.h src/allocate.h src/bond_orders.h \
-	src/forces.h src/four_body_interactions.h \
-	src/grid.h src/init_md.h src/integrate.h src/list.h \
-	src/lookup.h src/neighbors.h \
-	src/reset_utils.h src/single_body_interactions.h \
-	src/system_props.h src/three_body_interactions.h \
-	src/traj.h src/two_body_interactions.h src/vector.h \
-	src/cuda_utils.h src/cuda_copy.h src/cuda_init.h src/cuda_reduction.h \
-	src/cuda_center_mass.h src/cuda_box.h src/validation.h \
+include_HEADERS += src/cuda_utils.h src/cuda_copy.h \
+	src/cuda_init.h src/cuda_reduction.h \
+	src/cuda_center_mass.h src/cuda_box.h src/cuda_validation.h \
         src/cuda_allocate.h src/cuda_bond_orders.h \
-	src/cuda_lin_alg.h src/cuda_QEq.h \
+	src/cuda_lin_alg.h src/cuda_qeq.h \
         src/cuda_forces.h src/cuda_four_body_interactions.h \
 	src/cuda_grid.h src/cuda_init_md.h src/cuda_integrate.h src/cuda_list.h \
 	src/cuda_lookup.h src/cuda_neighbors.h \
@@ -62,8 +70,16 @@ include_HEADERS = src/mytypes.h src/analyze.h src/print_utils.h \
 	src/cuda_post_evolve.h
 
 # dummy source to cause C linking
-nodist_EXTRA_bin_puremd_gpu_SOURCES = src/dummy.c
+nodist_EXTRA_bin_spuremd_SOURCES = src/dummy.c
+
+endif
+
+
+bin_spuremd_CFLAGS = $(AM_CFLAGS) $(CFLAGS)
+bin_spuremd_CPPFLAGS = $(AM_CPPFLAGS) $(CPPFLAGS)
+bin_spuremd_LDFLAGS = $(AM_LDFLAGS) $(LDFLAGS)
 
-bin_puremd_gpu_CFLAGS = $(AM_CFLAGS) $(CFLAGS) $(CUDA_CFLAGS)
-bin_puremd_gpu_CPPFLAGS = $(AM_CPPFLAGS) $(CPPFLAGS)
-bin_puremd_gpu_LDFLAGS = $(AM_LDFLAGS) $(LDFLAGS) $(CUDA_LIBS)
+if USE_CUDA
+bin_spuremd_CFLAGS += $(CUDA_CFLAGS)
+bin_spuremd_LDFLAGS += $(CUDA_LIBS)
+endif
diff --git a/PuReMD-GPU/aclocal.m4 b/PuReMD-GPU/aclocal.m4
index 2e1d098d2159d3a3069bc44cc5e0942cb9e86070..d6bf5baa543b24a4c3d8f9fc06ee2020a1d3f9bf 100644
--- a/PuReMD-GPU/aclocal.m4
+++ b/PuReMD-GPU/aclocal.m4
@@ -1150,4 +1150,5 @@ AC_SUBST([am__tar])
 AC_SUBST([am__untar])
 ]) # _AM_PROG_TAR
 
+m4_include([../m4/ax_compiler_vendor.m4])
 m4_include([../m4/ax_cuda.m4])
diff --git a/PuReMD-GPU/configure.ac b/PuReMD-GPU/configure.ac
index 38c7cf737e44056c612a3b48f55708486ab78af4..5d0bca878282d6b7181c47328048ad590cd1afbf 100644
--- a/PuReMD-GPU/configure.ac
+++ b/PuReMD-GPU/configure.ac
@@ -53,47 +53,137 @@ AC_CHECK_TYPES([gzFile], [],
 # Checks for library functions.
 AC_FUNC_MALLOC
 AC_FUNC_STRTOD
-AC_CHECK_FUNCS([memset pow sqrt])
+AC_CHECK_FUNCS([gettimeofday memset pow sqrt])
+
+# Check for compiler vendor
+AX_COMPILER_VENDOR
+if test "x$ax_cv_c_compiler_vendor" = "xgnu"; then
+	if test "x$BUILD_DEBUG" = "x"; then
+		CFLAGS="$CFLAGS -Wall -O3 -funroll-loops -fstrict-aliasing"
+	else
+		CFLAGS="$CFLAGS -Wall"
+	fi
+fi
+if test "x$ax_cv_c_compiler_vendor" = "xintel"; then
+	if test "x$BUILD_DEBUG" = "x"; then
+		CFLAGS="$CFLAGS -fast"
+	fi
+fi
 
-# Check for CUDA support.
-CONFIGURE_HEADLINE([ CUDA support ])
-AX_CUDA
-NVCCFLAGS=
-if test "BUILD_DEBUG" = "true"
+# Check for OpenMP support.
+if test "x$BUILD_OPENMP" = "xyes"; then
+	AC_OPENMP
+	if test "x${OPENMP_CFLAGS}" = "x"; then
+		AC_MSG_WARN([
+	  -----------------------------------------------
+	   Unable to find OpenMP support on this system.
+	   Building a single-threaded version.
+	  -----------------------------------------------])
+	else
+		# bug due to recent Intel compiler change (?)
+		if test "x$ax_cv_c_compiler_vendor" = "xintel"; then
+			OPENMP_CFLAGS="-qopenmp"
+		fi
+		AC_SUBST(AM_CFLAGS, "$OPENMP_CFLAGS")
+		AC_SUBST(AM_CPPFLAGS, "$OPENMP_CFLAGS")
+	fi
+fi
+
+if test "x$BUILD_SUPERLU_MT" != "x"
 then
-	NVCCFLAGS+=" -g -G"
+	CPPFLAGS="${CPPFLAGS} -I${BUILD_SUPERLU_MT}/include"
+	LDFLAGS="${LDFLAGS} -L${BUILD_SUPERLU_MT}/lib"
+	#TODO: implement better BLAS detection
+	LIBS="${LIBS} -lblas"
+#	BLAS_FOUND_LIBS="yes"
+#	AC_SEARCH_LIBS([dtrsv_], [blas blas_OPENMP],
+#		        [], [BLAS_FOUND_LIBS="no"], [])
+#	AS_IF([test "x${BLAS_FOUND_LIBS}" != "xyes"],
+#	      [AC_MSG_ERROR([Unable to find BLAS library.])])
+	AC_CHECK_HEADERS([slu_mt_ddefs.h], [SUPERLU_MT_FOUND_HEADERS="yes"])
+	AS_IF([test "x${SUPERLU_MT_FOUND_HEADERS}" != "xyes"],
+	      [AC_MSG_ERROR([Unable to find SuperLU MT headers.])])
+	SUPERLU_MT_FOUND_LIBS="yes"
+	#TODO: fix issue where multiple -l flags added, one for each call below
+	AC_SEARCH_LIBS([intMalloc], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([get_perm_c], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([pdgstrf_init], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp -lblas -lblas_OPENMP])
+	AC_SEARCH_LIBS([pdgstrf], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp -lblas -lblas_OPENMP])
+	AC_SEARCH_LIBS([pxgstrf_finalize], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp -lblas -lblas_OPENMP])
+	AC_SEARCH_LIBS([StatAlloc], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([StatInit], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([StatFree], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([Destroy_SuperNode_SCP], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([Destroy_CompCol_NCP], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AS_IF([test "x${SUPERLU_MT_FOUND_LIBS}" != "xyes"],
+	      [AC_MSG_ERROR([Unable to find SuperLU MT library.])])
+	AC_DEFINE([HAVE_SUPERLU_MT], [1], [Define to 1 if you have SuperLU_MT support enabled.])
 fi
-AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.])
-
-AC_CHECK_LIB([cuda], [cuGetErrorString])
-AC_CHECK_LIB([cudart], [cudaMalloc])
-AC_CHECK_LIB([cublas], [cublasDnrm2])
-AC_CHECK_LIB([cusparse], [cusparseCreateMatDescr])
-#AC_SEARCH_LIBS([cublasDaxpy], [cublas])
-#AC_SEARCH_LIBS([cublasDscal], [cublas])
-#AC_SEARCH_LIBS([cublasDdot], [cublas])
-#AC_SEARCH_LIBS([cudaThreadSynchronize], [cudart])
-#AC_SEARCH_LIBS([cudaGetLastError], [cudart])
-#AC_SEARCH_LIBS([cusparseCreateMatDescr], [cusparse])
-#AC_SEARCH_LIBS([cusparseSetMatType], [cusparse])
-#AC_SEARCH_LIBS([cusparseSetMatIndexBase], [cusparse])
+
+# Check for CUDA support.
+if test "x$BUILD_GPU" != "x"
+then
+	CONFIGURE_HEADLINE([ CUDA support ])
+	AX_CUDA
+	NVCCFLAGS=
+	if test "x$BUILD_DEBUG" != "x"  # same non-empty convention as the other BUILD_DEBUG tests in this file
+	then
+		NVCCFLAGS="$NVCCFLAGS -g -G"  # plain assignment, since += is not POSIX sh
+	fi
+	AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.])
+
+	AC_CHECK_LIB([cuda], [cuGetErrorString])
+	AC_CHECK_LIB([cudart], [cudaMalloc])
+	AC_CHECK_LIB([cublas], [cublasDnrm2])
+	AC_CHECK_LIB([cusparse], [cusparseCreateMatDescr])
+#	AC_SEARCH_LIBS([cublasDaxpy], [cublas])
+#	AC_SEARCH_LIBS([cublasDscal], [cublas])
+#	AC_SEARCH_LIBS([cublasDdot], [cublas])
+#	AC_SEARCH_LIBS([cudaThreadSynchronize], [cudart])
+#	AC_SEARCH_LIBS([cudaGetLastError], [cudart])
+#	AC_SEARCH_LIBS([cusparseCreateMatDescr], [cusparse])
+#	AC_SEARCH_LIBS([cusparseSetMatType], [cusparse])
+#	AC_SEARCH_LIBS([cusparseSetMatIndexBase], [cusparse])
 #
-#AC_SEARCH_LIBS([cublasDnrm2], [cublas],
-#	[CUBLAS_FOUND_LIBS="yes"], [CUBLAS_FOUND_LIBS="no"], [-lcublas])
-#AS_IF([test "x${CUBLAS_FOUND_LIBS}" != "xyes"],
-#	[AC_MSG_ERROR([Unable to find CUBLAS library.])])
+#	AC_SEARCH_LIBS([cublasDnrm2], [cublas],
+#		[CUBLAS_FOUND_LIBS="yes"], [CUBLAS_FOUND_LIBS="no"], [-lcublas])
+#	AS_IF([test "x${CUBLAS_FOUND_LIBS}" != "xyes"],
+#		[AC_MSG_ERROR([Unable to find CUBLAS library.])])
 #
-#AC_SEARCH_LIBS([cusparseSetMatType], [cusparse],
-#	[CUSPARSE_FOUND_LIBS="yes"], [CUSPARSE_FOUND_LIBS="no"], [-lcusparse])
-#AS_IF([test "x${CUSPARSE_FOUND_LIBS}" != "xyes"],
-#	[AC_MSG_ERROR([Unable to find CUSPARSE library.])])
+#	AC_SEARCH_LIBS([cusparseSetMatType], [cusparse],
+#		[CUSPARSE_FOUND_LIBS="yes"], [CUSPARSE_FOUND_LIBS="no"], [-lcusparse])
+#	AS_IF([test "x${CUSPARSE_FOUND_LIBS}" != "xyes"],
+#		[AC_MSG_ERROR([Unable to find CUSPARSE library.])])
 #
-#AC_CHECK_TYPES([cublasHandle_t], [], 
-#	       [AC_MSG_FAILURE([cublasHandle_t type not found in cublas.h], [1])], [#include<cublas_v2.h>])
-#AC_CHECK_TYPES([cusparseHandle_t], [], 
-#	       [AC_MSG_FAILURE([cusparseHandle_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
-#AC_CHECK_TYPES([cusparseMatDescr_t], [], 
-#	       [AC_MSG_FAILURE([cusparseMatDescr_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
+#	AC_CHECK_TYPES([cublasHandle_t], [], 
+#		       [AC_MSG_FAILURE([cublasHandle_t type not found in cublas.h], [1])], [#include<cublas_v2.h>])
+#	AC_CHECK_TYPES([cusparseHandle_t], [], 
+#		       [AC_MSG_FAILURE([cusparseHandle_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
+#	AC_CHECK_TYPES([cusparseMatDescr_t], [], 
+#		       [AC_MSG_FAILURE([cusparseMatDescr_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
+else
+	AM_CONDITIONAL(USE_CUDA, test "x" = "xyes")
+fi
+
+if test "x$BUILD_DEBUG" != "x"
+then
+	CFLAGS="${CFLAGS} ${DEBUG_FLAGS}"
+fi
+
+if test "x$BUILD_GPROF" != "x"
+then
+	CFLAGS="${CFLAGS} ${GPROF_FLAGS}"
+fi
 
 if test "BUILD_PROF" = "true"
 then
diff --git a/PuReMD-GPU/src/QEq.c b/PuReMD-GPU/src/QEq.c
deleted file mode 100644
index 8cc638ea90dcc25f86d33f275b162c8e531d82bb..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/QEq.c
+++ /dev/null
@@ -1,396 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
-
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "QEq.h"
-
-#include "allocate.h"
-#include "lin_alg.h"
-#include "list.h"
-#include "print_utils.h"
-#include "index_utils.h"
-#include "system_props.h"
-
-#include "sort.h"
-
-
-int compare_matrix_entry(const void *v1, const void *v2)
-{
-    return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
-}
-
-
-void Sort_Matrix_Rows( sparse_matrix *A )
-{
-    int i, si, ei;
-
-    for( i = 0; i < A->n; ++i ) {
-        si = A->start[i];
-        ei = A->start[i+1];
-        qsort( &(A->entries[si]), ei - si, 
-                sizeof(sparse_matrix_entry), compare_matrix_entry );
-    }
-}
-
-
-void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol )
-{
-    int i, j, k;
-    real val;
-
-    /* init droptol to 0 */
-    for( i = 0; i < A->n; ++i )
-        droptol[i] = 0;
-
-    /* calculate sqaure of the norm of each row */
-    for( i = 0; i < A->n; ++i ) {
-        for( k = A->start[i]; k < A->start[i+1]-1; ++k ) {
-            j = A->entries[k].j;
-            val = A->entries[k].val;
-
-            droptol[i] += val*val;
-            droptol[j] += val*val;
-        }
-
-        val = A->entries[k].val; // diagonal entry
-        droptol[i] += val*val;
-    }
-
-    /* calculate local droptol for each row */
-    //fprintf( stderr, "droptol: " );
-    for( i = 0; i < A->n; ++i ) {
-        //fprintf( stderr, "%f-->", droptol[i] );
-        droptol[i] = SQRT( droptol[i] ) * dtol;
-        //fprintf( stderr, "%f  ", droptol[i] );
-    }
-    //fprintf( stderr, "\n" );
-}
-
-
-int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
-{
-    int i, j, pj;
-    int fillin;
-    real val;
-
-    fillin = 0;
-
-    //fprintf( stderr, "n: %d\n", A->n );
-    for( i = 0; i < A->n; ++i )
-        for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){
-            j = A->entries[pj].j;
-            val = A->entries[pj].val;
-            //fprintf( stderr, "i: %d, j: %d", i, j );
-
-            if( fabs(val) > droptol[i] )
-                ++fillin;
-        }
-
-    return fillin + A->n;
-}
-
-
-void ICHOLT( sparse_matrix *A, real *droptol, 
-        sparse_matrix *L, sparse_matrix *U )
-{
-    sparse_matrix_entry tmp[1000];
-    int i, j, pj, k1, k2, tmptop, Ltop;
-    real val;
-    int *Utop;
-
-    Utop = (int*) malloc((A->n+1) * sizeof(int));
-
-    // clear variables
-    Ltop = 0;
-    tmptop = 0;
-    for( i = 0; i <= A->n; ++i )
-        L->start[i] = U->start[i] = 0;
-
-    for( i = 0; i < A->n; ++i )
-        Utop[i] = 0;
-
-    //fprintf( stderr, "n: %d\n", A->n );
-    for( i = 0; i < A->n; ++i ){
-        L->start[i] = Ltop;
-        tmptop = 0;
-
-        for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){
-            j = A->entries[pj].j;
-            val = A->entries[pj].val;
-            //fprintf( stderr, "i: %d, j: %d", i, j );
-
-            if( fabs(val) > droptol[i] ){
-                k1 = 0;
-                k2 = L->start[j];
-                while( k1 < tmptop && k2 < L->start[j+1] ){
-                    if( tmp[k1].j < L->entries[k2].j )
-                        ++k1;
-                    else if( tmp[k1].j > L->entries[k2].j )
-                        ++k2;
-                    else
-                        val -= (tmp[k1++].val * L->entries[k2++].val);
-                }
-
-                // L matrix is lower triangular, 
-                // so right before the start of next row comes jth diagonal
-                val /= L->entries[L->start[j+1]-1].val;
-
-                tmp[tmptop].j = j;
-                tmp[tmptop].val = val;
-                ++tmptop;
-            }
-            //fprintf( stderr, " -- done\n" );
-        }
-
-        // compute the ith diagonal in L
-        // sanity check
-        if( A->entries[pj].j != i ) {
-            fprintf( stderr, "i=%d, badly built A matrix!\n", i );
-            exit(999);
-        }
-
-        val = A->entries[pj].val;
-        for( k1 = 0; k1 < tmptop; ++k1 )
-            val -= (tmp[k1].val * tmp[k1].val);
-
-        tmp[tmptop].j = i;
-        tmp[tmptop].val = SQRT(val);
-
-        // apply the dropping rule once again
-        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
-        //for( k1 = 0; k1<= tmptop; ++k1 )
-        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
-        //fprintf( stderr, "\n" );
-        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
-        for( k1 = 0; k1 < tmptop; ++k1 )
-            if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){
-                L->entries[Ltop].j = tmp[k1].j;
-                L->entries[Ltop].val = tmp[k1].val;
-                U->start[tmp[k1].j+1]++;
-                ++Ltop;
-                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
-            }
-        // keep the diagonal in any case
-        L->entries[Ltop].j = tmp[k1].j;
-        L->entries[Ltop].val = tmp[k1].val;
-        ++Ltop;
-        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
-    }
-
-    L->start[i] = Ltop;
-    //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
-
-    for( i = 1; i <= U->n; ++i )
-        Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
-
-    for( i = 0; i < L->n; ++i )
-        for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
-            j = L->entries[pj].j;
-            U->entries[Utop[j]].j = i;
-            U->entries[Utop[j]].val = L->entries[pj].val;
-            Utop[j]++;
-        }
-
-    //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
-}
-
-
-void Init_MatVec( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list *far_nbrs )
-{
-    int i, fillin;
-    real s_tmp, t_tmp;
-    //char fname[100];
-
-    if(control->refactor > 0 && 
-            ((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL))
-    {
-        //Print_Linear_System( system, control, workspace, data->step );
-        Sort_Matrix_Rows( &workspace->H );
-
-        //fprintf( stderr, "H matrix sorted\n" );
-
-        Calculate_Droptol( &workspace->H, workspace->droptol, control->droptol ); 
-        //fprintf( stderr, "drop tolerances calculated\n" );
-
-        if( workspace->L.entries == NULL )
-        {
-            fillin = Estimate_LU_Fill( &workspace->H, workspace->droptol );
-
-#ifdef __DEBUG_CUDA__
-            fprintf( stderr, "fillin = %d\n", fillin );
-#endif
-
-            if( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 ||
-                    Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 )
-            {
-                fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
-                exit(INSUFFICIENT_SPACE);
-            }
-
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "fillin = %d\n", fillin );
-            fprintf( stderr, "allocated memory: L = U = %ldMB\n",
-                    fillin * sizeof(sparse_matrix_entry) / (1024*1024) );
-#endif
-        }
-
-        ICHOLT( &workspace->H, workspace->droptol, &workspace->L, &workspace->U );
-
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "icholt-" );
-        //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
-        //Print_Sparse_Matrix2( workspace->L, fname );
-        //Print_Sparse_Matrix( U );
-#endif
-    }
-
-    /* extrapolation for s & t */
-    for( i = 0; i < system->N; ++i ) {
-        // no extrapolation
-        //s_tmp = workspace->s[0][i];
-        //t_tmp = workspace->t[0][i];
-
-        // linear
-        //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i];
-        //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i];
-
-        // quadratic
-        //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]);
-        t_tmp = workspace->t[index_wkspace_sys(2,i,system->N)] + 3*(workspace->t[index_wkspace_sys(0,i,system->N)]-workspace->t[index_wkspace_sys(1,i,system->N)]);
-
-        // cubic
-        s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,system->N)] + workspace->s[index_wkspace_sys(2,i,system->N)]) - 
-            (6 * workspace->s[index_wkspace_sys(1,i,system->N)] + workspace->s[index_wkspace_sys(3,i,system->N)] );
-        //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - 
-        //  (6 * workspace->t[1][i] + workspace->t[3][i] );
-
-        // 4th order
-        //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + 
-        //  10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i];
-        //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + 
-        //  10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i];
-
-        workspace->s[index_wkspace_sys(4,i,system->N)] = workspace->s[index_wkspace_sys(3,i,system->N)];
-        workspace->s[index_wkspace_sys(3,i,system->N)] = workspace->s[index_wkspace_sys(2,i,system->N)]; 
-        workspace->s[index_wkspace_sys(2,i,system->N)] = workspace->s[index_wkspace_sys(1,i,system->N)];
-        workspace->s[index_wkspace_sys(1,i,system->N)] = workspace->s[index_wkspace_sys(0,i,system->N)];
-        workspace->s[index_wkspace_sys(0,i,system->N)] = s_tmp;
-
-        workspace->t[index_wkspace_sys(4,i,system->N)] = workspace->t[index_wkspace_sys(3,i,system->N)];
-        workspace->t[index_wkspace_sys(3,i,system->N)] = workspace->t[index_wkspace_sys(2,i,system->N)]; 
-        workspace->t[index_wkspace_sys(2,i,system->N)] = workspace->t[index_wkspace_sys(1,i,system->N)];
-        workspace->t[index_wkspace_sys(1,i,system->N)] = workspace->t[index_wkspace_sys(0,i,system->N)];
-        workspace->t[index_wkspace_sys(0,i,system->N)] = t_tmp;
-    }
-}
-
-
-void Calculate_Charges( reax_system *system, static_storage *workspace )
-{
-    int i;
-    real u, s_sum, t_sum;
-
-    s_sum = t_sum = 0.;
-    for( i = 0; i < system->N; ++i ) {
-        s_sum += workspace->s[index_wkspace_sys(0,i,system->N)];
-        t_sum += workspace->t[index_wkspace_sys(0,i,system->N)];
-    }
-
-    u = s_sum / t_sum;
-
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, "Host --->s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u );
-#endif
-
-    for( i = 0; i < system->N; ++i )
-    {
-        system->atoms[i].q = workspace->s[index_wkspace_sys(0,i,system->N)] - u * workspace->t[index_wkspace_sys(0,i,system->N)];
-    }
-}
-
-
-void QEq( reax_system *system, control_params *control, simulation_data *data, 
-        static_storage *workspace, list *far_nbrs, 
-        output_controls *out_control )
-{
-    int matvecs;
-
-    //real t_start, t_elapsed;
-
-    //t_start = Get_Time ();
-    Init_MatVec( system, control, data, workspace, far_nbrs );
-    //t_elapsed = Get_Timing_Info ( t_start );
-
-    //fprintf (stderr, " CPU Init_MatVec timing ----> %f \n", t_elapsed );
-
-    //if( data->step % 10 == 0 )
-    //  Print_Linear_System( system, control, workspace, far_nbrs, data->step );
-
-    //t_start = Get_Time ( );
-    matvecs = GMRES( workspace, &workspace->H, 
-            workspace->b_s, control->q_err, &workspace->s[0], out_control->log, system );
-    matvecs += GMRES( workspace, &workspace->H, 
-            workspace->b_t, control->q_err, &workspace->t[0], out_control->log, system );
-    //t_elapsed = Get_Timing_Info ( t_start );
-
-    //fprintf (stderr, " CPU GMRES timing ---> %f \n", t_elapsed );
-
-    //matvecs = GMRES_HouseHolder( workspace, workspace->H, 
-    //    workspace->b_s, control->q_err, workspace->s[0], out_control->log );
-    //matvecs += GMRES_HouseHolder( workspace, workspace->H,  
-    //    workspace->b_t, control->q_err, workspace->t[0], out_control->log );
-
-    //matvecs = PGMRES( workspace, &workspace->H, workspace->b_s, control->q_err,
-    //  &workspace->L, &workspace->U, &workspace->s[index_wkspace_sys(0,0,system->N)], out_control->log, system );
-    //matvecs += PGMRES( workspace, &workspace->H, workspace->b_t, control->q_err,
-    //  &workspace->L, &workspace->U, &workspace->t[index_wkspace_sys(0,0,system->N)], out_control->log, system );
-
-    //matvecs=PCG( workspace, workspace->H, workspace->b_s, control->q_err, 
-    //      workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1;
-    ///matvecs+=PCG( workspace, workspace->H, workspace->b_t, control->q_err, 
-    //     workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1;
-
-    //matvecs = CG( workspace, workspace->H, 
-    // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
-    //matvecs += CG( workspace, workspace->H, 
-    // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
-
-    //matvecs = SDM( workspace, workspace->H, 
-    // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
-    //matvecs += SDM( workspace, workspace->H, 
-    // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
-
-    //fprintf (stderr, " GMRES done with iterations %d \n", matvecs );
-
-    data->timing.matvecs += matvecs;
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "linsolve-" );
-#endif
-
-    Calculate_Charges( system, workspace );
-    //fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n", 
-    //   data->step, 
-    //   workspace->s[0][0], workspace->t[0][0], 
-    //   workspace->s[0][1], workspace->t[0][1], 
-    //   workspace->s[0][2], workspace->t[0][2] );
-    // if( data->step == control->nsteps )
-    //Print_Charges( system, control, workspace, data->step );
-}
diff --git a/PuReMD-GPU/src/QEq.h b/PuReMD-GPU/src/QEq.h
deleted file mode 100644
index 31dfbf61ba05ec79d32313c3ab648eb259f183f2..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/QEq.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
-
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#ifndef __QEq_H_
-#define __QEq_H_
-
-#include "mytypes.h"
-
-
-void QEq( reax_system*, control_params*, simulation_data*, static_storage*,
-        list*, output_controls* );
-
-
-static inline HOST_DEVICE void swap(sparse_matrix_entry *array, int index1, int index2) 
-{
-    sparse_matrix_entry temp = array[index1];
-    array[index1] = array[index2];
-    array[index2] = temp;
-}
-
-
-static inline HOST_DEVICE void quick_sort(sparse_matrix_entry *array, int start, int end)
-{
-    int i = start;
-    int k = end; 
-
-    if (end - start >= 1)  
-    {  
-        int pivot = array[start].j;
-
-        while (k > i) 
-        {  
-            while ((array[i].j <= pivot) && (i <= end) && (k > i)) i++;
-            while ((array[k].j > pivot) && (k >= start) && (k >= i)) k--;
-            if (k > i) swap(array, i, k);
-        }  
-        swap(array, start, k);
-        quick_sort(array, start, k - 1);
-        quick_sort(array, k + 1, end);
-    }  
-}
-
-
-#endif
diff --git a/PuReMD-GPU/src/allocate.c b/PuReMD-GPU/src/allocate.c
index 65f0eb2a872673259d508f17fc0da43530a7426f..bbae7dce50b8fdf60e5a0d8c459e50ca36dd67cb 100644
--- a/PuReMD-GPU/src/allocate.c
+++ b/PuReMD-GPU/src/allocate.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -21,103 +22,148 @@
 #include "allocate.h"
 
 #include "list.h"
+#include "tool_box.h"
+
+/* Allocate the per-atom arrays sized by system->N: the atom array, the
+ * original-id map and, when control->restrict_bonds is set, the bond
+ * restriction bookkeeping arrays. Always returns SUCCESS. */
+int PreAllocate_Space( reax_system *system, control_params *control,
+        static_storage *workspace )
+{
+    system->atoms = (reax_atom*) scalloc( system->N,
+            sizeof(reax_atom), "atoms" );
+    workspace->orig_id = (int*) scalloc( system->N,
+            sizeof(int), "orig_id" );
+
+    /* space for keeping restriction info, if any */
+    if ( control->restrict_bonds )
+    {
+        workspace->restricted = (int*) scalloc( system->N,
+                sizeof(int), "restricted_atoms" );
+
+        /* restricted_list is a single flat block (presumably MAX_RESTRICT
+         * slots per atom -- confirm); allocate it exactly once so an earlier
+         * allocation is not overwritten and leaked */
+        workspace->restricted_list = (int*) scalloc( MAX_RESTRICT * system->N,
+                sizeof(int), "restricted_list[i]" );
+    }
+
+    return SUCCESS;
+}
 
 
 void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
 {
     Delete_List( far_nbrs );
-    if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs ))
+
+    if (!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs ))
     {
         fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-        exit( INIT_ERR );
+        exit( CANNOT_INITIALIZE );
     }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n",
-            num_intrs, far_nbrs->num_intrs );  
-    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
-            num_intrs * sizeof(far_neighbor_data) / (1024*1024) );
+             num_intrs, far_nbrs->num_intrs );
+    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n",
+             num_intrs * sizeof(far_neighbor_data) / (1024 * 1024) );
 #endif
 }
 
 
-HOST int Allocate_Matrix( sparse_matrix *H, int n, int m )
+int Allocate_Matrix( sparse_matrix *pH, int n, int m )
 {
+    sparse_matrix *H;
+    /* NOTE(review): pH is a by-value parameter; assigning malloc() to it below does not update the caller's pointer -- confirm callers can reach the new struct */
+    if ( (pH = (sparse_matrix*) malloc(sizeof(sparse_matrix))) == NULL )
+    {
+        return FAILURE;
+    }
+    /* fill the new struct: start gets n + 1 entries, j and val get m entries each */
+    H = pH;
     H->n = n;
     H->m = m;
-    if( (H->start = (int*) malloc(sizeof(int) * n+1)) == NULL )
-        return 0;
-
-    if( (H->end = (int*) malloc(sizeof(int) * n+1)) == NULL )
-        return 0;
 
-    if( (H->entries = 
-                (sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL )
-        return 0;
+    if ( (H->start = (unsigned int*) malloc(sizeof(int) * (n + 1))) == NULL
+            || (H->j = (unsigned int*) malloc(sizeof(int) * m)) == NULL
+            || (H->val = (real*) malloc(sizeof(real) * m)) == NULL )
+    {
+        return FAILURE; /* NOTE(review): the struct and any arrays allocated before the failing malloc are not freed here */
+    }
 
-    return 1;
+    return SUCCESS;
 }
 
 
 void Deallocate_Matrix( sparse_matrix *H )
 {
     free(H->start);
-    free(H->entries);
-    free(H->end);
+    free(H->j);
+    free(H->val);
+    free(H); /* releases the sparse_matrix struct itself; H must not be used afterwards */
 }
 
 
 int Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
 {
     Deallocate_Matrix( H );
-    if( !Allocate_Matrix( H, n, m ) ) {
+    /* NOTE(review): Deallocate_Matrix() above frees H itself and Allocate_Matrix() takes the pointer by value, so the caller's pointer is left dangling -- confirm */
+    if ( Allocate_Matrix( H, n, m ) == FAILURE )
+    {
         fprintf(stderr, "not enough space for %s matrix. terminating!\n", name);
-        exit( 1 );
+        exit( INSUFFICIENT_MEMORY );
     }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n",
-            name, n, m );
-    fprintf( stderr, "memory allocated: %s = %ldMB\n", 
-            name, m * sizeof(sparse_matrix_entry) / (1024*1024) );
+             name, n, m );
+    fprintf( stderr, "memory allocated: %s = %ldMB\n",
+             name, m * (sizeof(unsigned int) + sizeof(real)) / (1024 * 1024) );
 #endif
-    return 1;
+
+    return SUCCESS;
 }
 
 
-int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, 
-        list *hbonds )
+int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top,
+                         list *hbonds )
 {
     int i, num_hbonds;
 
     num_hbonds = 0;
     /* find starting indexes for each H and the total number of hbonds */
-    for( i = 1; i < n; ++i )
-        hb_top[i] += hb_top[i-1];
-    num_hbonds = hb_top[n-1];
+    for ( i = 1; i < n; ++i )
+    {
+        hb_top[i] += hb_top[i - 1];
+    }
+    num_hbonds = hb_top[n - 1];
 
-    if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) )
+    if ( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) )
     {
         fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
-        exit( INIT_ERR );
+        exit( CANNOT_INITIALIZE );
     }
 
-    for( i = 0; i < n; ++i )
-        if( h_index[i] == 0 ){
-            Set_Start_Index( 0, 0, hbonds ); 
-            Set_End_Index( 0, 0, hbonds ); 
+    for ( i = 0; i < n; ++i )
+    {
+        if ( h_index[i] == 0 )
+        {
+            Set_Start_Index( 0, 0, hbonds );
+            Set_End_Index( 0, 0, hbonds );
         }
-        else if( h_index[i] > 0 ){
-            Set_Start_Index( h_index[i], hb_top[i-1], hbonds ); 
-            Set_End_Index( h_index[i], hb_top[i-1], hbonds ); 
+        else if ( h_index[i] > 0 )
+        {
+            Set_Start_Index( h_index[i], hb_top[i - 1], hbonds );
+            Set_End_Index( h_index[i], hb_top[i - 1], hbonds );
         }
+    }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds );
-    fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
-            num_hbonds * sizeof(hbond_data) / (1024*1024) );
+    fprintf( stderr, "memory allocated: hbonds = %ldMB\n",
+             num_hbonds * sizeof(hbond_data) / (1024 * 1024) );
 #endif
-    return 1;
+    return SUCCESS;
 }
 
 
@@ -129,10 +175,14 @@ int Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "reallocating hbonds\n" );
 #endif
-    hb_top = (int *)calloc( n, sizeof(int) );
-    for( i = 0; i < n; ++i )
-        if( h_index[i] >= 0 )
-            hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS);
+    hb_top = calloc( n, sizeof(int) );
+    for ( i = 0; i < n; ++i )
+    {
+        if ( h_index[i] >= 0 )
+        {
+            hb_top[i] = MAX(Num_Entries(h_index[i], hbonds) * SAFE_HBONDS, MIN_HBONDS);
+        }
+    }
 
     Delete_List( hbonds );
 
@@ -140,7 +190,7 @@ int Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
 
     free( hb_top );
 
-    return 1;
+    return SUCCESS;
 }
 
 
@@ -150,29 +200,32 @@ int Allocate_Bond_List( int n, int *bond_top, list *bonds )
 
     num_bonds = 0;
     /* find starting indexes for each atom and the total number of bonds */
-    for( i = 1; i < n; ++i )
-        bond_top[i] += bond_top[i-1];
-    num_bonds = bond_top[n-1];
+    for ( i = 1; i < n; ++i )
+    {
+        bond_top[i] += bond_top[i - 1];
+    }
+    num_bonds = bond_top[n - 1];
 
-    if( !Make_List(n, num_bonds, TYP_BOND, bonds ) )
+    if ( !Make_List(n, num_bonds, TYP_BOND, bonds ) )
     {
         fprintf( stderr, "not enough space for bonds list. terminating!\n" );
-        exit( INIT_ERR );
+        exit( CANNOT_INITIALIZE );
     }
 
-    Set_Start_Index( 0, 0, bonds ); 
-    Set_End_Index( 0, 0, bonds ); 
-    for( i = 1; i < n; ++i ) {
-        Set_Start_Index( i, bond_top[i-1], bonds ); 
-        Set_End_Index( i, bond_top[i-1], bonds ); 
+    Set_Start_Index( 0, 0, bonds );
+    Set_End_Index( 0, 0, bonds );
+    for ( i = 1; i < n; ++i )
+    {
+        Set_Start_Index( i, bond_top[i - 1], bonds );
+        Set_End_Index( i, bond_top[i - 1], bonds );
     }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds );
-    fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
-            num_bonds * sizeof(bond_data) / (1024*1024) );
+    fprintf( stderr, "memory allocated: bonds = %ldMB\n",
+             num_bonds * sizeof(bond_data) / (1024 * 1024) );
 #endif
-    return 1;
+    return SUCCESS;
 }
 
 
@@ -184,9 +237,10 @@ int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body )
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "reallocating bonds\n" );
 #endif
-    bond_top = (int *)calloc( n, sizeof(int) );
+    bond_top = calloc( n, sizeof(int) );
     *est_3body = 0;
-    for( i = 0; i < n; ++i ){
+    for ( i = 0; i < n; ++i )
+    {
         *est_3body += SQR( Num_Entries( i, bonds ) );
         bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS );
     }
@@ -194,17 +248,18 @@ int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body )
     Delete_List( bonds );
 
     Allocate_Bond_List( n, bond_top, bonds );
-    *num_bonds = bond_top[n-1];
+    *num_bonds = bond_top[n - 1];
 
     free( bond_top );
 
-    return 1;
+    return SUCCESS;
 }
 
 
-void Reallocate( reax_system *system, static_storage *workspace, list **lists, 
-        int nbr_flag )
+void Reallocate( reax_system *system, static_storage *workspace, list **lists,
+                 int nbr_flag )
 {
+    int i, j, k;
     int num_bonds, est_3body;
     reallocate_data *realloc;
     grid *g;
@@ -212,70 +267,75 @@ void Reallocate( reax_system *system, static_storage *workspace, list **lists,
     realloc = &(workspace->realloc);
     g = &(system->g);
 
-    if( realloc->num_far > 0 && nbr_flag ) {
-        fprintf (stderr, " Reallocating neighbors \n");
-        Reallocate_Neighbor_List( (*lists)+FAR_NBRS, 
-                system->N, realloc->num_far * SAFE_ZONE );
+    if ( realloc->num_far > 0 && nbr_flag )
+    {
+        Reallocate_Neighbor_List( (*lists) + FAR_NBRS,
+                                  system->N, realloc->num_far * SAFE_ZONE );
         realloc->num_far = -1;
     }
 
-    if( realloc->Htop > 0 ){
-        fprintf (stderr, " Reallocating Matrix \n");
-        Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H");
+    if ( realloc->Htop > 0 )
+    {
+        Reallocate_Matrix(workspace->H, system->N, realloc->Htop * SAFE_ZONE, "H");
         realloc->Htop = -1;
 
-        Deallocate_Matrix( &workspace->L );
-        Deallocate_Matrix( &workspace->U );
+        Deallocate_Matrix( workspace->L );
+        Deallocate_Matrix( workspace->U );
+        workspace->L = NULL;
+        workspace->U = NULL;
     }
 
-    if( realloc->hbonds > 0 ){
-        fprintf (stderr, " Reallocating hbonds \n");
+    if ( realloc->hbonds > 0 )
+    {
         Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index,
-                (*lists)+HBONDS );
+                               (*lists) + HBONDS );
         realloc->hbonds = -1;
     }
 
     num_bonds = est_3body = -1;
-    if( realloc->bonds > 0 ){
-        fprintf (stderr, " Reallocating bonds \n");
-        Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body );
+    if ( realloc->bonds > 0 )
+    {
+        Reallocate_Bonds_List( system->N, (*lists) + BONDS, &num_bonds, &est_3body );
         realloc->bonds = -1;
         realloc->num_3body = MAX( realloc->num_3body, est_3body );
     }
 
-    if( realloc->num_3body > 0 ) {
-        fprintf (stderr, " Reallocating 3Body \n");
-        Delete_List( (*lists)+THREE_BODIES );
+    if ( realloc->num_3body > 0 )
+    {
+        Delete_List( (*lists) + THREE_BODIES );
 
-        if( num_bonds == -1 )
-            num_bonds = ((*lists)+BONDS)->num_intrs;
+        if ( num_bonds == -1 )
+            num_bonds = ((*lists) + BONDS)->num_intrs;
         realloc->num_3body *= SAFE_ZONE;
 
-        if( !Make_List( num_bonds, realloc->num_3body,
-                    TYP_THREE_BODY, (*lists)+THREE_BODIES ) )
+        if ( !Make_List( num_bonds, realloc->num_3body,
+                         TYP_THREE_BODY, (*lists) + THREE_BODIES ) )
         {
             fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-            exit( INIT_ERR );
+            exit( CANNOT_INITIALIZE );
         }
         realloc->num_3body = -1;
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "reallocating 3 bodies\n" );
         fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds );
         fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body );
-        fprintf( stderr, "reallocated 3body memory: %ldMB\n", 
-                realloc->num_3body*sizeof(three_body_interaction_data)/
-                (1024*1024) );
+        fprintf( stderr, "reallocated 3body memory: %ldMB\n",
+                 realloc->num_3body * sizeof(three_body_interaction_data) /
+                 (1024 * 1024) );
 #endif
     }
 
-    if( realloc->gcell_atoms > -1 ){
+    if ( realloc->gcell_atoms > -1 )
+    {
 #if defined(DEBUG_FOCUS)
         fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
 #endif
 
         free (g->atoms);
-        g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2],
-                sizeof (int) * workspace->realloc.gcell_atoms);
+        g->atoms = (int *) calloc( g->ncell[0]*g->ncell[1]*g->ncell[2],
+                sizeof(int) * workspace->realloc.gcell_atoms );
+
+        /* request serviced: -1 makes the "> -1" guard above false on the next call */
         realloc->gcell_atoms = -1;
     }
 }
diff --git a/PuReMD-GPU/src/allocate.h b/PuReMD-GPU/src/allocate.h
index b03ed80b34f153b9929ccaa80bc5c27fbf6ce540..72f724dac852c626e7c70018bba95c5f21dc5b51 100644
--- a/PuReMD-GPU/src/allocate.h
+++ b/PuReMD-GPU/src/allocate.h
@@ -28,9 +28,12 @@
 extern "C"  {
 #endif
 
+int PreAllocate_Space( reax_system*, control_params*, static_storage* );
+
 void Reallocate( reax_system*, static_storage*, list**, int );
 
 int Allocate_Matrix( sparse_matrix*, int, int );
+
 void Deallocate_Matrix( sparse_matrix *);
 
 int Allocate_HBond_List( int, int, int*, int*, list* );
diff --git a/PuReMD-GPU/src/analyze.c b/PuReMD-GPU/src/analyze.c
index 8eef938a372add29eb044846b12c77eddd47aad0..014eea8f46093e1a381e0a5e44241cfdc3d7b719 100644
--- a/PuReMD-GPU/src/analyze.c
+++ b/PuReMD-GPU/src/analyze.c
@@ -772,17 +772,16 @@ void Calculate_Drift( reax_system *system, control_params *control,
                                 &(system->box), driftvec );
 
             if ( fabs( driftvec[0] ) >= system->box.box_norms[0] / 2.0 - 2.0 ||
-                    fabs( driftvec[0] ) >= system->box.box_norms[0] / 2.0 - 2.0 ||
-                    fabs( driftvec[0] ) >= system->box.box_norms[0] / 2.0 - 2.0 )
+                    fabs( driftvec[1] ) >= system->box.box_norms[1] / 2.0 - 2.0 ||
+                    fabs( driftvec[2] ) >= system->box.box_norms[2] / 2.0 - 2.0 )
             {
                 /* the atom has moved almost half the box size.
                    exclude it from further drift computations as it might have an
                    improper contribution due to periodic boudnaries. */
+                workspace->x_old[i][0] = -999999999.0;
+                workspace->x_old[i][1] = -999999999.0;
+                workspace->x_old[i][2] = -999999999.0;
 
-                //TODO -- check this one. may be not initializing this properly
-                //workspace->x_old[i][0] = workspace->x_old[i][2] = workspace->x_old[i][2] = -999999999999.0;
-                workspace->x_old[i][0] = workspace->x_old[i][2] = workspace->x_old[i][2] = -999999999.0;
-                //TODO -- check this one. may be not initializing this properly
                 continue;
             }
 
diff --git a/PuReMD-GPU/src/box.c b/PuReMD-GPU/src/box.c
index e42395c5556042493c0f707879772d84e9d18658..a7911fda49b1dccbbfccf24ae2fa445c608e0d5e 100644
--- a/PuReMD-GPU/src/box.c
+++ b/PuReMD-GPU/src/box.c
@@ -1,58 +1,189 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPu - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
 #include "box.h"
+
+#include "tool_box.h"
 #include "vector.h"
 
 
-void Init_Box_From_CRYST(real a, real b, real c, 
-        real alpha, real beta, real gamma, 
+/* Recompute every quantity derived from box->box: the volume, the inverse
+ * matrix box_inv, the row norms box_norms, the normalized transpose trans
+ * and its inverse trans_inv, the metric tensor g and the side proportions
+ * side_prop. Callers in this file invoke it after changing box->box.
+ * Divides by the volume, the row norms and box[0][0], so these must be
+ * non-zero. */
+void Make_Consistent( simulation_box* box )
+{
+    real one_vol;
+
+    /* volume = det(box), cofactor expansion along row 0; the first minor
+     * is box[1][1] * box[2][2] - box[1][2] * box[2][1] */
+    box->volume =
+        box->box[0][0] * (box->box[1][1] * box->box[2][2] -
+                          box->box[1][2] * box->box[2][1]) +
+        box->box[0][1] * (box->box[2][0] * box->box[1][2] -
+                          box->box[1][0] * box->box[2][2]) +
+        box->box[0][2] * (box->box[1][0] * box->box[2][1] -
+                          box->box[2][0] * box->box[1][1]);
+
+    one_vol = 1.0 / box->volume;
+
+    /* box_inv = adj(box) / det(box) */
+    box->box_inv[0][0] = (box->box[1][1] * box->box[2][2] -
+                          box->box[1][2] * box->box[2][1]) * one_vol;
+    box->box_inv[0][1] = (box->box[0][2] * box->box[2][1] -
+                          box->box[0][1] * box->box[2][2]) * one_vol;
+    box->box_inv[0][2] = (box->box[0][1] * box->box[1][2] -
+                          box->box[0][2] * box->box[1][1]) * one_vol;
+
+    box->box_inv[1][0] = (box->box[1][2] * box->box[2][0] -
+                          box->box[1][0] * box->box[2][2]) * one_vol;
+    box->box_inv[1][1] = (box->box[0][0] * box->box[2][2] -
+                          box->box[0][2] * box->box[2][0]) * one_vol;
+    box->box_inv[1][2] = (box->box[0][2] * box->box[1][0] -
+                          box->box[0][0] * box->box[1][2]) * one_vol;
+
+    box->box_inv[2][0] = (box->box[1][0] * box->box[2][1] -
+                          box->box[1][1] * box->box[2][0]) * one_vol;
+    box->box_inv[2][1] = (box->box[0][1] * box->box[2][0] -
+                          box->box[0][0] * box->box[2][1]) * one_vol;
+    box->box_inv[2][2] = (box->box[0][0] * box->box[1][1] -
+                          box->box[0][1] * box->box[1][0]) * one_vol;
+
+    /* Euclidean length of each row of box */
+    box->box_norms[0] = SQRT( SQR(box->box[0][0]) +
+                              SQR(box->box[0][1]) +
+                              SQR(box->box[0][2]) );
+    box->box_norms[1] = SQRT( SQR(box->box[1][0]) +
+                              SQR(box->box[1][1]) +
+                              SQR(box->box[1][2]) );
+    box->box_norms[2] = SQRT( SQR(box->box[2][0]) +
+                              SQR(box->box[2][1]) +
+                              SQR(box->box[2][2]) );
+
+    /* trans[i][j] = box[j][i] / box_norms[i] */
+    box->trans[0][0] = box->box[0][0] / box->box_norms[0];
+    box->trans[0][1] = box->box[1][0] / box->box_norms[0];
+    box->trans[0][2] = box->box[2][0] / box->box_norms[0];
+
+    box->trans[1][0] = box->box[0][1] / box->box_norms[1];
+    box->trans[1][1] = box->box[1][1] / box->box_norms[1];
+    box->trans[1][2] = box->box[2][1] / box->box_norms[1];
+
+    box->trans[2][0] = box->box[0][2] / box->box_norms[2];
+    box->trans[2][1] = box->box[1][2] / box->box_norms[2];
+    box->trans[2][2] = box->box[2][2] / box->box_norms[2];
+
+    /* det(trans) = det(box) / (product of row norms); one_vol becomes 1 / det(trans) */
+    one_vol = box->box_norms[0] * box->box_norms[1] * box->box_norms[2] * one_vol;
+
+    /* trans_inv = adj(trans) / det(trans) */
+    box->trans_inv[0][0] = (box->trans[1][1] * box->trans[2][2] -
+                            box->trans[1][2] * box->trans[2][1]) * one_vol;
+    box->trans_inv[0][1] = (box->trans[0][2] * box->trans[2][1] -
+                            box->trans[0][1] * box->trans[2][2]) * one_vol;
+    box->trans_inv[0][2] = (box->trans[0][1] * box->trans[1][2] -
+                            box->trans[0][2] * box->trans[1][1]) * one_vol;
+
+    box->trans_inv[1][0] = (box->trans[1][2] * box->trans[2][0] -
+                            box->trans[1][0] * box->trans[2][2]) * one_vol;
+    box->trans_inv[1][1] = (box->trans[0][0] * box->trans[2][2] -
+                            box->trans[0][2] * box->trans[2][0]) * one_vol;
+    box->trans_inv[1][2] = (box->trans[0][2] * box->trans[1][0] -
+                            box->trans[0][0] * box->trans[1][2]) * one_vol;
+
+    box->trans_inv[2][0] = (box->trans[1][0] * box->trans[2][1] -
+                            box->trans[1][1] * box->trans[2][0]) * one_vol;
+    box->trans_inv[2][1] = (box->trans[0][1] * box->trans[2][0] -
+                            box->trans[0][0] * box->trans[2][1]) * one_vol;
+    box->trans_inv[2][2] = (box->trans[0][0] * box->trans[1][1] -
+                            box->trans[0][1] * box->trans[1][0]) * one_vol;
+
+    /* metric tensor: g[i][j] is the dot product of rows i and j of box */
+    box->g[0][0] = box->box[0][0] * box->box[0][0] +
+                   box->box[0][1] * box->box[0][1] +
+                   box->box[0][2] * box->box[0][2];
+    box->g[1][0] =
+        box->g[0][1] = box->box[0][0] * box->box[1][0] +
+                       box->box[0][1] * box->box[1][1] +
+                       box->box[0][2] * box->box[1][2];
+    box->g[2][0] =
+        box->g[0][2] = box->box[0][0] * box->box[2][0] +
+                       box->box[0][1] * box->box[2][1] +
+                       box->box[0][2] * box->box[2][2];
+
+    box->g[1][1] = box->box[1][0] * box->box[1][0] +
+                   box->box[1][1] * box->box[1][1] +
+                   box->box[1][2] * box->box[1][2];
+    box->g[1][2] =
+        box->g[2][1] = box->box[1][0] * box->box[2][0] +
+                       box->box[1][1] * box->box[2][1] +
+                       box->box[1][2] * box->box[2][2];
+
+    box->g[2][2] = box->box[2][0] * box->box[2][0] +
+                   box->box[2][1] * box->box[2][1] +
+                   box->box[2][2] * box->box[2][2];
+
+    // These proportions are only used for isotropic_NPT!
+    box->side_prop[0] = box->box[0][0] / box->box[0][0];
+    box->side_prop[1] = box->box[1][1] / box->box[0][0];
+    box->side_prop[2] = box->box[2][2] / box->box[0][0];
+}
+
+
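+/* Set up the simulation box from cell edge lengths a, b, c and cell angles alpha, beta, gamma (in degrees), then derive the dependent quantities with Make_Consistent() */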
+void Setup_Box( real a, real b, real c, real alpha, real beta, real gamma,
         simulation_box* box )
 {
     double c_alpha, c_beta, c_gamma, s_gamma, zi;
 
-    c_alpha = cos(DEG2RAD(alpha));
-    c_beta  = cos(DEG2RAD(beta));
-    c_gamma = cos(DEG2RAD(gamma));
-    s_gamma = sin(DEG2RAD(gamma));
+    if ( IS_NAN_REAL(a) || IS_NAN_REAL(b) || IS_NAN_REAL(c)
+            || IS_NAN_REAL(alpha) || IS_NAN_REAL(beta) || IS_NAN_REAL(gamma) )
+    {
+        fprintf( stderr, "Invalid simulation box boundaries for big box (NaN). Terminating...\n" );
+        exit( INVALID_INPUT );
+    }
 
-    zi = (c_alpha - c_beta * c_gamma)/s_gamma; 
+    c_alpha = COS(DEG2RAD(alpha));
+    c_beta  = COS(DEG2RAD(beta));
+    c_gamma = COS(DEG2RAD(gamma));
+    s_gamma = SIN(DEG2RAD(gamma));
+    zi = (c_alpha - c_beta * c_gamma) / s_gamma;
 
-    box->box[0][0] = a; 
-    box->box[0][1] = 0.0; 
+    box->box[0][0] = a;
+    box->box[0][1] = 0.0;
     box->box[0][2] = 0.0;
-
-    box->box[1][0] = b * c_gamma; 
-    box->box[1][1] = b * s_gamma; 
-    box->box[1][2] = 0.0; 
-
+    box->box[1][0] = b * c_gamma;
+    box->box[1][1] = b * s_gamma;
+    box->box[1][2] = 0.0;
     box->box[2][0] = c * c_beta;
     box->box[2][1] = c * zi;
     box->box[2][2] = c * SQRT(1.0 - SQR(c_beta) - SQR(zi));
+#if defined(DEBUG)
+    fprintf( stderr, "box is %8.2f x %8.2f x %8.2f\n",
+             box->box[0][0], box->box[1][1], box->box[2][2] );
+#endif
 
     Make_Consistent( box );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "box is %8.2f x %8.2f x %8.2f\n", 
-            box->box[0][0], box->box[1][1], box->box[2][2] );
-#endif
 }
 
 
@@ -60,8 +191,8 @@ void Update_Box( rtensor box_tensor, simulation_box* box )
 {
     int i, j;
 
-    for (i=0; i < 3; i++)
-        for (j=0; j < 3; j++)
+    for (i = 0; i < 3; i++)
+        for (j = 0; j < 3; j++)
             box->box[i][j] = box_tensor[i][j];
 
     Make_Consistent( box );
@@ -70,200 +201,37 @@ void Update_Box( rtensor box_tensor, simulation_box* box )
 
 void Update_Box_Isotropic( simulation_box *box, real mu )
 {
-    /*box->box[0][0] = 
+    /*box->box[0][0] =
       POW( V_new / ( box->side_prop[1] * box->side_prop[2] ), 1.0/3.0 );
-      box->box[1][1] = box->box[0][0] * box->side_prop[1];
-      box->box[2][2] = box->box[0][0] * box->side_prop[2]; 
-     */
+    box->box[1][1] = box->box[0][0] * box->side_prop[1];
+    box->box[2][2] = box->box[0][0] * box->side_prop[2];
+    */
     rtensor_Copy( box->old_box, box->box );
     box->box[0][0] *= mu;
     box->box[1][1] *= mu;
     box->box[2][2] *= mu;
 
-    box->volume = box->box[0][0]*box->box[1][1]*box->box[2][2];
+    box->volume = box->box[0][0] * box->box[1][1] * box->box[2][2];
     Make_Consistent(box/*, periodic*/);
 }
 
 
 void Update_Box_SemiIsotropic( simulation_box *box, rvec mu )
 {
-    /*box->box[0][0] = 
+    /*box->box[0][0] =
       POW( V_new / ( box->side_prop[1] * box->side_prop[2] ), 1.0/3.0 );
-      box->box[1][1] = box->box[0][0] * box->side_prop[1];
-      box->box[2][2] = box->box[0][0] * box->side_prop[2]; */
+    box->box[1][1] = box->box[0][0] * box->side_prop[1];
+    box->box[2][2] = box->box[0][0] * box->side_prop[2]; */
     rtensor_Copy( box->old_box, box->box );
     box->box[0][0] *= mu[0];
     box->box[1][1] *= mu[1];
     box->box[2][2] *= mu[2];
 
-    box->volume = box->box[0][0]*box->box[1][1]*box->box[2][2];
+    box->volume = box->box[0][0] * box->box[1][1] * box->box[2][2];
     Make_Consistent(box);
 }
 
 
-void Make_Consistent(simulation_box* box)
-{
-    real one_vol;
-
-    box->volume = 
-        box->box[0][0] * (box->box[1][1]*box->box[2][2] - 
-                box->box[2][1]*box->box[2][1]) +
-        box->box[0][1] * (box->box[2][0]*box->box[1][2] -
-                box->box[1][0]*box->box[2][2]) +
-        box->box[0][2] * (box->box[1][0]*box->box[2][1] -
-                box->box[2][0]*box->box[1][1]);
-
-    one_vol = 1.0/box->volume;
-
-    box->box_inv[0][0] = (box->box[1][1]*box->box[2][2] -
-            box->box[1][2]*box->box[2][1]) * one_vol;
-    box->box_inv[0][1] = (box->box[0][2]*box->box[2][1] -
-            box->box[0][1]*box->box[2][2]) * one_vol;
-    box->box_inv[0][2] = (box->box[0][1]*box->box[1][2] -
-            box->box[0][2]*box->box[1][1]) * one_vol;
-
-    box->box_inv[1][0] = (box->box[1][2]*box->box[2][0] -
-            box->box[1][0]*box->box[2][2]) * one_vol;
-    box->box_inv[1][1] = (box->box[0][0]*box->box[2][2] -
-            box->box[0][2]*box->box[2][0]) * one_vol;
-    box->box_inv[1][2] = (box->box[0][2]*box->box[1][0] -
-            box->box[0][0]*box->box[1][2]) * one_vol;
-
-    box->box_inv[2][0] = (box->box[1][0]*box->box[2][1] -
-            box->box[1][1]*box->box[2][0]) * one_vol;
-    box->box_inv[2][1] = (box->box[0][1]*box->box[2][0] -
-            box->box[0][0]*box->box[2][1]) * one_vol;
-    box->box_inv[2][2] = (box->box[0][0]*box->box[1][1] -
-            box->box[0][1]*box->box[1][0]) * one_vol;
-
-    box->box_norms[0] = SQRT( SQR(box->box[0][0]) +
-            SQR(box->box[0][1]) +
-            SQR(box->box[0][2]) );
-    box->box_norms[1] = SQRT( SQR(box->box[1][0]) +
-            SQR(box->box[1][1]) +
-            SQR(box->box[1][2]) );
-    box->box_norms[2] = SQRT( SQR(box->box[2][0]) +
-            SQR(box->box[2][1]) +
-            SQR(box->box[2][2]) );
-
-    box->trans[0][0] = box->box[0][0]/box->box_norms[0]; 
-    box->trans[0][1] = box->box[1][0]/box->box_norms[0];
-    box->trans[0][2] = box->box[2][0]/box->box_norms[0];
-
-    box->trans[1][0] = box->box[0][1]/box->box_norms[1]; 
-    box->trans[1][1] = box->box[1][1]/box->box_norms[1];
-    box->trans[1][2] = box->box[2][1]/box->box_norms[1];
-
-    box->trans[2][0] = box->box[0][2]/box->box_norms[2]; 
-    box->trans[2][1] = box->box[1][2]/box->box_norms[2];
-    box->trans[2][2] = box->box[2][2]/box->box_norms[2];
-
-    one_vol = box->box_norms[0]*box->box_norms[1]*box->box_norms[2]*one_vol;
-
-    box->trans_inv[0][0] = (box->trans[1][1]*box->trans[2][2] -
-            box->trans[1][2]*box->trans[2][1]) * one_vol;
-    box->trans_inv[0][1] = (box->trans[0][2]*box->trans[2][1] -
-            box->trans[0][1]*box->trans[2][2]) * one_vol;
-    box->trans_inv[0][2] = (box->trans[0][1]*box->trans[1][2] -
-            box->trans[0][2]*box->trans[1][1]) * one_vol;
-
-    box->trans_inv[1][0] = (box->trans[1][2]*box->trans[2][0] -
-            box->trans[1][0]*box->trans[2][2]) * one_vol;
-    box->trans_inv[1][1] = (box->trans[0][0]*box->trans[2][2] -
-            box->trans[0][2]*box->trans[2][0]) * one_vol;
-    box->trans_inv[1][2] = (box->trans[0][2]*box->trans[1][0] -
-            box->trans[0][0]*box->trans[1][2]) * one_vol;
-
-    box->trans_inv[2][0] = (box->trans[1][0]*box->trans[2][1] -
-            box->trans[1][1]*box->trans[2][0]) * one_vol;
-    box->trans_inv[2][1] = (box->trans[0][1]*box->trans[2][0] -
-            box->trans[0][0]*box->trans[2][1]) * one_vol;
-    box->trans_inv[2][2] = (box->trans[0][0]*box->trans[1][1] -
-            box->trans[0][1]*box->trans[1][0]) * one_vol;
-
-    //   for (i=0; i < 3; i++)
-    //     {
-    //       for (j=0; j < 3; j++)
-    //     fprintf(stderr,"%lf\t",box->trans[i][j]);
-    //       fprintf(stderr,"\n");
-    //     }
-    //   fprintf(stderr,"\n");
-    //   for (i=0; i < 3; i++)
-    //     {
-    //       for (j=0; j < 3; j++)
-    //     fprintf(stderr,"%lf\t",box->trans_inv[i][j]);
-    //       fprintf(stderr,"\n");
-    //     }
-
-
-    box->g[0][0] = box->box[0][0] * box->box[0][0] +
-        box->box[0][1] * box->box[0][1] +
-        box->box[0][2] * box->box[0][2];
-    box->g[1][0] = 
-        box->g[0][1] = box->box[0][0] * box->box[1][0] +
-        box->box[0][1] * box->box[1][1] +
-        box->box[0][2] * box->box[1][2];
-    box->g[2][0] =
-        box->g[0][2] = box->box[0][0] * box->box[2][0] +
-        box->box[0][1] * box->box[2][1] +
-        box->box[0][2] * box->box[2][2];
-
-    box->g[1][1] = box->box[1][0] * box->box[1][0] +
-        box->box[1][1] * box->box[1][1] +
-        box->box[1][2] * box->box[1][2];
-    box->g[1][2] =
-        box->g[2][1] = box->box[1][0] * box->box[2][0] +
-        box->box[1][1] * box->box[2][1] +
-        box->box[1][2] * box->box[2][2];
-
-    box->g[2][2] = box->box[2][0] * box->box[2][0] +
-        box->box[2][1] * box->box[2][1] +
-        box->box[2][2] * box->box[2][2];
-
-    // These proportions are only used for isotropic_NPT!
-    box->side_prop[0] = box->box[0][0] / box->box[0][0];
-    box->side_prop[1] = box->box[1][1] / box->box[0][0];
-    box->side_prop[2] = box->box[2][2] / box->box[0][0];
-}
-
-
-void Transform( rvec x1, simulation_box *box, char flag, rvec x2 )
-{
-    int i, j;
-    real tmp;
-
-    //  printf(">x1: (%lf, %lf, %lf)\n",x1[0],x1[1],x1[2]);
-
-    if (flag > 0) {
-        for (i=0; i < 3; i++) {
-            tmp = 0.0;
-            for (j=0; j < 3; j++)
-                tmp += box->trans[i][j]*x1[j]; 
-            x2[i] = tmp;
-        }
-    }
-    else {
-        for (i=0; i < 3; i++) {
-            tmp = 0.0;
-            for (j=0; j < 3; j++)
-                tmp += box->trans_inv[i][j]*x1[j]; 
-            x2[i] = tmp;
-        }
-    }
-    //  printf(">x2: (%lf, %lf, %lf)\n", x2[0], x2[1], x2[2]);  
-}
-
-
-void Transform_to_UnitBox( rvec x1, simulation_box *box, char flag, rvec x2 )
-{
-    Transform( x1, box, flag, x2 );
-
-    x2[0] /= box->box_norms[0];
-    x2[1] /= box->box_norms[1];
-    x2[2] /= box->box_norms[2];
-}
-
-
 void Distance_on_T3_Gen( rvec x1, rvec x2, simulation_box* box, rvec r )
 {
     rvec xa, xb, ra;
@@ -301,12 +269,12 @@ void Inc_on_T3_Gen( rvec x, rvec dx, simulation_box* box )
 real Metric_Product( rvec x1, rvec x2, simulation_box* box )
 {
     int i, j;
-    real dist=0.0, tmp;
+    real dist = 0.0, tmp;
 
-    for( i = 0; i < 3; i++ )
+    for ( i = 0; i < 3; i++ )
     {
         tmp = 0.0;
-        for( j = 0; j < 3; j++ )
+        for ( j = 0; j < 3; j++ )
             tmp += box->g[i][j] * x2[j];
         dist += x1[i] * tmp;
     }
@@ -315,12 +283,59 @@ real Metric_Product( rvec x1, rvec x2, simulation_box* box )
 }
 
 
-/* Determines if the distance between x1 and x2 is < vlist_cut. 
+int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
+                       real cutoff, far_neighbor_data *data )
+{
+    real norm_sqr, d, tmp;
+    int i;
+
+    /* data->dvec: x2 - x1, shifted by one box length on axes where needed */
+    norm_sqr = 0;
+    for ( i = 0; i < 3; i++ )
+    {
+        d = x2[i] - x1[i];
+        tmp = SQR(d);
+        /* |d| is at least half the box length: use the image one box over */
+        if ( tmp >= SQR( box->box_norms[i] / 2.0 ) )
+        {
+            if ( x2[i] > x1[i] )
+            {
+                d -= box->box_norms[i];
+                data->rel_box[i] = -1;
+            }
+            else
+            {
+                d += box->box_norms[i];
+                data->rel_box[i] = +1;
+            }
+
+            data->dvec[i] = d;
+            norm_sqr += SQR(d);
+        }
+        else
+        {
+            data->dvec[i] = d;
+            norm_sqr += tmp;
+            data->rel_box[i] = 0;
+        }
+    }
+    /* data->d is written only for pairs within the cutoff */
+    if ( norm_sqr <= SQR(cutoff) )
+    {
+        data->d = SQRT( norm_sqr );
+        return TRUE;
+    }
+
+    return FALSE;
+}
+
+
+/* Determines if the distance between x1 and x2 is <= vlist_cut.
    If so, this neighborhood is added to the list of far neighbors.
    Periodic boundary conditions do not apply. */
-void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, 
-        control_params *control, 
-        far_neighbor_data *new_nbrs, int *count )
+void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
+                                    control_params *control,
+                                    far_neighbor_data *new_nbrs, int *count )
 {
     real norm_sqr;
 
@@ -328,7 +343,8 @@ void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
 
     norm_sqr = rvec_Norm_Sqr( new_nbrs[0].dvec );
 
-    if( norm_sqr <= SQR( control->vlist_cut ) ) {
+    if ( norm_sqr <= SQR( control->vlist_cut ) )
+    {
         *count = 1;
         new_nbrs[0].d = SQRT( norm_sqr );
 
@@ -341,11 +357,11 @@ void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
 
 /* Finds periodic neighbors in a 'big_box'. Here 'big_box' means:
    the current simulation box has all dimensions > 2 *vlist_cut.
-   If the periodic distance between x1 and x2 is than vlist_cut, this 
+   If the periodic distance between x1 and x2 is within vlist_cut, this
    neighborhood is added to the list of far neighbors. */
-void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box, 
-        control_params *control, 
-        far_neighbor_data *periodic_nbrs, 
+void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box,
+        control_params *control,
+        far_neighbor_data *periodic_nbrs,
         int *count )
 {
     real norm_sqr, d, tmp;
@@ -353,19 +369,23 @@ void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box,
 
     norm_sqr = 0;
 
-    for( i = 0; i < 3; i++ ) {
+    for ( i = 0; i < 3; i++ )
+    {
         d = x2[i] - x1[i];
         tmp = SQR(d);
         // fprintf(out,"Inside Sq_Distance_on_T3, %d, %lf, %lf\n",
         // i,tmp,SQR(box->box_norms[i]/2.0));
 
-        if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) {    
-            if( x2[i] > x1[i] ) {
+        if ( tmp >= SQR( box->box_norms[i] / 2.0 ) )
+        {
+            if ( x2[i] > x1[i] )
+            {
                 d -= box->box_norms[i];
                 periodic_nbrs[0].rel_box[i] = -1;
                 // periodic_nbrs[0].ext_factor[i] = +1;
             }
-            else {
+            else
+            {
                 d += box->box_norms[i];
                 periodic_nbrs[0].rel_box[i] = +1;
                 // periodic_nbrs[0].ext_factor[i] = -1;
@@ -374,15 +394,17 @@ void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box,
             periodic_nbrs[0].dvec[i] = d;
             norm_sqr += SQR(d);
         }
-        else {
+        else
+        {
             periodic_nbrs[0].dvec[i] = d;
             norm_sqr += tmp;
             periodic_nbrs[0].rel_box[i]   = 0;
             // periodic_nbrs[0].ext_factor[i] = 0;
-        } 
+        }
     }
 
-    if( norm_sqr <= SQR( control->vlist_cut ) ) {
+    if ( norm_sqr <= SQR( control->vlist_cut ) )
+    {
         *count = 1;
         periodic_nbrs[0].d = SQRT( norm_sqr );
     }
@@ -390,16 +412,16 @@ void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box,
 }
 
 
-/* Finds all periodic far neighborhoods between x1 and x2 
+/* Finds all periodic far neighborhoods between x1 and x2
    ((dist(x1, x2') < vlist_cut, periodic images of x2 are also considered).
    Here the box is 'small' meaning that at least one dimension is < 2*vlist_cut.
-IMPORTANT: This part might need some improvement. In NPT, the simulation box 
-might get too small (such as <5 A!). In this case we have to consider the 
-periodic images of x2 that are two boxs away!!!
- */
+   IMPORTANT: This part might need some improvement. In NPT, the simulation box
+   might get too small (such as <5 A!). In this case we have to consider the
+   periodic images of x2 that are two boxes away!!!
+*/
 void Get_Periodic_Far_Neighbors_Small_Box( rvec x1, rvec x2, simulation_box *box,
-        control_params *control, 
-        far_neighbor_data *periodic_nbrs, 
+        control_params *control,
+        far_neighbor_data *periodic_nbrs,
         int *count )
 {
     int i, j, k;
@@ -418,14 +440,18 @@ void Get_Periodic_Far_Neighbors_Small_Box( rvec x1, rvec x2, simulation_box *box
       imax, jmax, kmax ); */
 
 
-    for( i = -imax; i <= imax; ++i )
-        if(fabs(d_i=((x2[0]+i*box->box_norms[0])-x1[0]))<=control->vlist_cut) {
-            for( j = -jmax; j <= jmax; ++j )
-                if(fabs(d_j=((x2[1]+j*box->box_norms[1])-x1[1]))<=control->vlist_cut) {
-                    for( k = -kmax; k <= kmax; ++k )
-                        if(fabs(d_k=((x2[2]+k*box->box_norms[2])-x1[2]))<=control->vlist_cut) {
+    for ( i = -imax; i <= imax; ++i )
+        if (fabs(d_i = ((x2[0] + i * box->box_norms[0]) - x1[0])) <= control->vlist_cut)
+        {
+            for ( j = -jmax; j <= jmax; ++j )
+                if (fabs(d_j = ((x2[1] + j * box->box_norms[1]) - x1[1])) <= control->vlist_cut)
+                {
+                    for ( k = -kmax; k <= kmax; ++k )
+                        if (fabs(d_k = ((x2[2] + k * box->box_norms[2]) - x1[2])) <= control->vlist_cut)
+                        {
                             sqr_norm = SQR(d_i) + SQR(d_j) + SQR(d_k);
-                            if( sqr_norm <= SQR(control->vlist_cut) ) {
+                            if ( sqr_norm <= SQR(control->vlist_cut) )
+                            {
                                 periodic_nbrs[ *count ].d = SQRT( sqr_norm );
 
                                 periodic_nbrs[ *count ].dvec[0] = d_i;
@@ -466,21 +492,21 @@ void Get_Periodic_Far_Neighbors_Small_Box( rvec x1, rvec x2, simulation_box *box
 
 /* Returns the mapping for the neighbor box pointed by (ix,iy,iz) */
 /*int Get_Nbr_Box( simulation_box *box, int ix, int iy, int iz )
-  {
-  return (9 * ix + 3 * iy + iz + 13);  
-// 13 is to handle negative indexes properly
+{
+  return (9 * ix + 3 * iy + iz + 13);
+  // 13 is to handle negative indexes properly
 }*/
 
 
 /* Returns total pressure vector for the neighbor box pointed by (ix,iy,iz) */
 /*rvec Get_Nbr_Box_Press( simulation_box *box, int ix, int iy, int iz )
-  {
+{
   int map;
 
-  map = 9 * ix + 3 * iy + iz + 13;  
-// 13 is to adjust -1,-1,-1 correspond to index 0
+  map = 9 * ix + 3 * iy + iz + 13;
+  // 13 is to adjust -1,-1,-1 correspond to index 0
 
-return box->nbr_box_press[map];
+  return box->nbr_box_press[map];
 }*/
 
 
@@ -489,53 +515,53 @@ return box->nbr_box_press[map];
   {
   int map;
 
-  map = 9 * ix + 3 * iy + iz + 13;  
-// 13 is to adjust -1,-1,-1 correspond to index 0
+  map = 9 * ix + 3 * iy + iz + 13;
+  // 13 is to adjust -1,-1,-1 correspond to index 0
 
-rvec_Add( box->nbr_box_press[map], v );
+  rvec_Add( box->nbr_box_press[map], v );
 }*/
 
 
 /* Increments the total pressure vector for the neighbor box mapped to 'map' */
 /*void Inc_Nbr_Box_Press( simulation_box *box, int map, rvec v )
-  {
+{
   rvec_Add( box->nbr_box_press[map], v );
-  }*/
+}*/
 
 
-void Print_Box_Information( simulation_box* box, FILE *out )
+void Print_Box( simulation_box* box, FILE *out )
 {
     int i, j;
 
     fprintf( out, "box: {" );
-    for( i = 0; i < 3; ++i )
+    for ( i = 0; i < 3; ++i )
     {
         fprintf( out, "{" );
-        for( j = 0; j < 3; ++j )
+        for ( j = 0; j < 3; ++j )
             fprintf( out, "%8.3f ", box->box[i][j] );
         fprintf( out, "}" );
     }
     fprintf( out, "}\n" );
 
-    fprintf( out, "V: %8.3f\tdims: {%8.3f, %8.3f, %8.3f}\n", 
-            box->volume, 
-            box->box_norms[0], box->box_norms[1], box->box_norms[2] );
+    fprintf( out, "V: %8.3f\tdims: {%8.3f, %8.3f, %8.3f}\n",
+             box->volume,
+             box->box_norms[0], box->box_norms[1], box->box_norms[2] );
 
     fprintf( out, "box_trans: {" );
-    for( i = 0; i < 3; ++i )
+    for ( i = 0; i < 3; ++i )
     {
         fprintf( out, "{" );
-        for( j = 0; j < 3; ++j )
+        for ( j = 0; j < 3; ++j )
             fprintf( out, "%8.3f ", box->trans[i][j] );
         fprintf( out, "}" );
     }
     fprintf( out, "}\n" );
 
     fprintf( out, "box_trinv: {" );
-    for( i = 0; i < 3; ++i )
+    for ( i = 0; i < 3; ++i )
     {
         fprintf( out, "{" );
-        for( j = 0; j < 3; ++j )
+        for ( j = 0; j < 3; ++j )
             fprintf( out, "%8.3f ", box->trans_inv[i][j] );
         fprintf( out, "}" );
     }
diff --git a/PuReMD-GPU/src/box.h b/PuReMD-GPU/src/box.h
index 418aa6208a81fb05ee56ff09afc6ff76751f75c9..84f8371becafbc86180e75b7b8a189c299c82493 100644
--- a/PuReMD-GPU/src/box.h
+++ b/PuReMD-GPU/src/box.h
@@ -25,9 +25,7 @@
 #include "mytypes.h"
 
 
-/* Initializes box from CRYST1 line of PDB */
-void Init_Box_From_CRYST(real, real, real, real, real, real,
-        simulation_box*/*, int*/);
+void Setup_Box( real, real, real, real, real, real, simulation_box* );
 
 /* Initializes box from box rtensor */
 void Update_Box(rtensor, simulation_box* /*, int*/);
@@ -36,14 +34,9 @@ void Update_Box_SemiIsotropic( simulation_box*, rvec /*, int*/ );
 
 /* Computes all the transformations,
    metric and other quantities from box rtensor */
-void Make_Consistent(simulation_box*/*, int*/ );
-
-/* Applies transformation to and from
-   Cartesian to Triclinic coordinates based on flag */
-/* Use -1 flag for Cartesian -> Triclinic and +1 for otherway */
-void Transform( rvec, simulation_box*, char, rvec );
-void Transform_to_UnitBox( rvec, simulation_box*, char, rvec );
+void Make_Consistent( simulation_box* );
 
+int Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* );
 void Get_NonPeriodic_Far_Neighbors( rvec, rvec, simulation_box*,
         control_params*, far_neighbor_data*, int* );
 void Get_Periodic_Far_Neighbors_Big_Box( rvec, rvec, simulation_box*,
@@ -59,10 +52,6 @@ void Inc_Nbr_Box_Press( simulation_box*, int, int, int, rvec );*/
 
 /* These functions assume that the coordinates are in triclinic system */
 /* this function returns cartesian norm but triclinic distance vector */
-real Metric_Product( rvec, rvec, simulation_box* );
-
-void Print_Box_Information( simulation_box*, FILE* );
-
 static inline HOST_DEVICE real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box, rvec r)
 {
 
@@ -113,5 +102,9 @@ static inline HOST_DEVICE void Inc_on_T3( rvec x, rvec dx, simulation_box *box )
     }
 }
 
+real Metric_Product( rvec, rvec, simulation_box* );
+
+void Print_Box( simulation_box*, FILE* );
+
 
 #endif
diff --git a/PuReMD-GPU/src/control.c b/PuReMD-GPU/src/control.c
new file mode 100644
index 0000000000000000000000000000000000000000..41f744969f1615ba621d85db852d998e92719b86
--- /dev/null
+++ b/PuReMD-GPU/src/control.c
@@ -0,0 +1,610 @@
+/*----------------------------------------------------------------------
+  SerialReax - Reax Force Field Simulator
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include <ctype.h>
+
+#include "control.h"
+#include "traj.h"
+#include "tool_box.h"
+
+
+/* Terminates with INVALID_INPUT unless at least `needed` tokens (keyword
+ * included) were found on the control file line starting with `key`. */
+static void Check_Token_Count( int count, int needed, const char *key )
+{
+    if ( count < needed )
+    {
+        fprintf( stderr, "ERROR: parameter %s expects %d value(s)\n",
+                 key, needed - 1 );
+        exit( INVALID_INPUT );
+    }
+}
+
+
+/* Reads the simulation control file into control and out_control.
+ *
+ * Every parameter is first set to its default; each line of fp is then
+ * tokenized and the keyword in the first token selects which parameter
+ * the following token(s) overwrite. After the file is read, the cutoffs
+ * that depend on the force field (system->reaxprm.gp.l[11], [12], [29])
+ * and the grid cell size / spread in system->g are derived.
+ *
+ * Returns SUCCESS. Exits on an unknown keyword, on a keyword that lacks
+ * the values it needs, on an out-of-range "ignore" atom type, or on a
+ * read error. */
+char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
+        output_controls *out_control )
+{
+    char *s, **tmp;
+    int c, i;
+    real val;
+    int ival;
+
+    /* assign default values */
+    strcpy( control->sim_name, "default.sim" );
+
+    control->restart = 0;
+    out_control->restart_format = WRITE_BINARY;
+    out_control->restart_freq = 0;
+    strcpy( control->restart_from, "default.res" );
+    control->random_vel = 0;
+
+    control->reposition_atoms = 0;
+
+    control->ensemble = NVE;
+    control->nsteps = 0;
+    control->dt = 0.25;
+
+    control->geo_format = PDB;
+    control->restrict_bonds = 0;
+
+    control->periodic_boundaries = 1;
+    control->periodic_images[0] = 0;
+    control->periodic_images[1] = 0;
+    control->periodic_images[2] = 0;
+
+    control->reneighbor = 1;
+    control->vlist_cut = 0;
+    control->nbr_cut = 4.;
+    control->r_cut = 10.;
+    control->r_sp_cut = 10.;
+    control->max_far_nbrs = 1000;
+    control->bo_cut = 0.01;
+    control->thb_cut = 0.001;
+    control->hb_cut = 7.50;
+
+    control->tabulate = 0;
+
+    control->qeq_solver_type = GMRES_S;
+    control->qeq_solver_q_err = 0.000001;
+    control->qeq_domain_sparsify_enabled = FALSE;
+    control->qeq_domain_sparsity = 1.0;
+    control->pre_comp_type = ICHOLT_PC;
+    control->pre_comp_sweeps = 3;
+    control->pre_comp_refactor = 100;
+    control->pre_comp_droptol = 0.01;
+    control->pre_app_type = TRI_SOLVE_PA;
+    control->pre_app_jacobi_iters = 50;
+
+    control->T_init = 0.;
+    control->T_final = 300.;
+    control->Tau_T = 1.0;
+    control->T_mode = 0.;
+    control->T_rate = 1.;
+    control->T_freq = 1.;
+
+    control->P[0] = 0.000101325;
+    control->P[1] = 0.000101325;
+    control->P[2] = 0.000101325;
+    control->Tau_P[0]  = 500.0;
+    control->Tau_P[1]  = 500.0;
+    control->Tau_P[2]  = 500.0;
+    control->Tau_PT = 500.0;
+    control->compressibility = 1.0;
+    control->press_mode = 0;
+
+    control->remove_CoM_vel = 25;
+
+    out_control->debug_level = 0;
+    out_control->energy_update_freq = 10;
+
+    out_control->write_steps = 100;
+    out_control->traj_compress = 0;
+    out_control->write = fprintf;
+    out_control->traj_format = 0;
+    out_control->write_header =
+        (int (*)( reax_system*, control_params*,
+                  static_storage*, void* )) Write_Custom_Header;
+    out_control->append_traj_frame =
+        (int (*)( reax_system*, control_params*, simulation_data*,
+                  static_storage*, list **, void* )) Append_Custom_Frame;
+
+    strcpy( out_control->traj_title, "default_title" );
+    out_control->atom_format = 0;
+    out_control->bond_info = 0;
+    out_control->angle_info = 0;
+
+    control->molec_anal = NO_ANALYSIS;
+    control->freq_molec_anal = 0;
+    control->bg_cut = 0.3;
+    control->num_ignored = 0;
+    memset( control->ignore, 0, sizeof(int)*MAX_ATOM_TYPES );
+
+    control->dipole_anal = 0;
+    control->freq_dipole_anal = 0;
+
+    control->diffusion_coef = 0;
+    control->freq_diffusion_coef = 0;
+    control->restrict_type = 0;
+
+    /* memory allocations */
+    s = (char*) malloc(sizeof(char) * MAX_LINE);
+    tmp = (char**) malloc(sizeof(char*)*MAX_TOKENS);
+    for (i = 0; i < MAX_TOKENS; i++)
+        tmp[i] = (char*) malloc(sizeof(char) * MAX_LINE);
+
+    /* read control parameters file */
+    while (fgets(s, MAX_LINE, fp))
+    {
+        c = Tokenize(s, &tmp);
+
+        /* NOTE(review): c is assumed to be the number of tokens Tokenize
+         * stored in tmp -- confirm in tool_box. Slots from c onward still
+         * hold text from earlier lines (or nothing at all), so a line
+         * without tokens is skipped and every keyword must carry a value. */
+        if ( c < 1 )
+        {
+            continue;
+        }
+        Check_Token_Count( c, 2, tmp[0] );
+
+        if ( strcmp(tmp[0], "simulation_name") == 0 )
+        {
+            strcpy( control->sim_name, tmp[1] );
+        }
+        //else if( strcmp(tmp[0], "restart") == 0 ) {
+        //  ival = atoi(tmp[1]);
+        //  control->restart = ival;
+        //}
+        else if ( strcmp(tmp[0], "restart_format") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->restart_format = ival;
+        }
+        else if ( strcmp(tmp[0], "restart_freq") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->restart_freq = ival;
+        }
+        else if ( strcmp(tmp[0], "random_vel") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->random_vel = ival;
+        }
+        else if ( strcmp(tmp[0], "reposition_atoms") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->reposition_atoms = ival;
+        }
+        else if ( strcmp(tmp[0], "ensemble_type") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->ensemble = ival;
+        }
+        else if ( strcmp(tmp[0], "nsteps") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->nsteps = ival;
+        }
+        else if ( strcmp(tmp[0], "dt") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->dt = val * 1.e-3;  // convert dt from fs to ps!
+        }
+        else if ( strcmp(tmp[0], "periodic_boundaries") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->periodic_boundaries = ival;
+        }
+        else if ( strcmp(tmp[0], "periodic_images") == 0 )
+        {
+            Check_Token_Count( c, 4, tmp[0] );
+            ival = atoi(tmp[1]);
+            control->periodic_images[0] = ival;
+            ival = atoi(tmp[2]);
+            control->periodic_images[1] = ival;
+            ival = atoi(tmp[3]);
+            control->periodic_images[2] = ival;
+        }
+        else if ( strcmp(tmp[0], "geo_format") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->geo_format = ival;
+        }
+        else if ( strcmp(tmp[0], "restrict_bonds") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->restrict_bonds = ival;
+        }
+        else if ( strcmp(tmp[0], "tabulate_long_range") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->tabulate = ival;
+        }
+        else if ( strcmp(tmp[0], "reneighbor") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->reneighbor = ival;
+        }
+        else if ( strcmp(tmp[0], "vlist_buffer") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->vlist_cut = val;
+        }
+        else if ( strcmp(tmp[0], "nbrhood_cutoff") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->nbr_cut = val;
+        }
+        else if ( strcmp(tmp[0], "thb_cutoff") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->thb_cut = val;
+        }
+        else if ( strcmp(tmp[0], "hbond_cutoff") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->hb_cut = val;
+        }
+        else if ( strcmp(tmp[0], "qeq_solver_type") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->qeq_solver_type = ival;
+        }
+        else if ( strcmp(tmp[0], "qeq_solver_q_err") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->qeq_solver_q_err = val;
+        }
+        else if ( strcmp(tmp[0], "qeq_domain_sparsity") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->qeq_domain_sparsity = val;
+            control->qeq_domain_sparsify_enabled = TRUE;
+        }
+        else if ( strcmp(tmp[0], "pre_comp_type") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->pre_comp_type = ival;
+        }
+        else if ( strcmp(tmp[0], "pre_comp_refactor") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->pre_comp_refactor = ival;
+        }
+        else if ( strcmp(tmp[0], "pre_comp_droptol") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->pre_comp_droptol = val;
+        }
+        else if ( strcmp(tmp[0], "pre_comp_sweeps") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->pre_comp_sweeps = ival;
+        }
+        else if ( strcmp(tmp[0], "pre_app_type") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->pre_app_type = ival;
+        }
+        else if ( strcmp(tmp[0], "pre_app_jacobi_iters") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->pre_app_jacobi_iters = ival;
+        }
+        else if ( strcmp(tmp[0], "temp_init") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->T_init = val;
+
+            if ( control->T_init < 0.001 )
+                control->T_init = 0.001;
+        }
+        else if ( strcmp(tmp[0], "temp_final") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->T_final = val;
+
+            if ( control->T_final < 0.1 )
+                control->T_final = 0.1;
+        }
+        else if ( strcmp(tmp[0], "t_mass") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->Tau_T = val * 1.e-3;    // convert t_mass from fs to ps
+        }
+        else if ( strcmp(tmp[0], "t_mode") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->T_mode = ival;
+        }
+        else if ( strcmp(tmp[0], "t_rate") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->T_rate = val;
+        }
+        else if ( strcmp(tmp[0], "t_freq") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->T_freq = val;
+        }
+        else if ( strcmp(tmp[0], "pressure") == 0 )
+        {
+            if ( control->ensemble == iNPT )
+            {
+                val = atof(tmp[1]);
+                control->P[0] = control->P[1] = control->P[2] = val;
+            }
+            else if ( control->ensemble == sNPT )
+            {
+                Check_Token_Count( c, 4, tmp[0] );
+                val = atof(tmp[1]);
+                control->P[0] = val;
+
+                val = atof(tmp[2]);
+                control->P[1] = val;
+
+                val = atof(tmp[3]);
+                control->P[2] = val;
+            }
+        }
+        else if ( strcmp(tmp[0], "p_mass") == 0 )
+        {
+            if ( control->ensemble == iNPT )
+            {
+                val = atof(tmp[1]);
+                control->Tau_P[0] = val * 1.e-3;   // convert p_mass from fs to ps
+            }
+            else if ( control->ensemble == sNPT )
+            {
+                Check_Token_Count( c, 4, tmp[0] );
+                val = atof(tmp[1]);
+                control->Tau_P[0] = val * 1.e-3;   // convert p_mass from fs to ps
+
+                val = atof(tmp[2]);
+                control->Tau_P[1] = val * 1.e-3;   // convert p_mass from fs to ps
+
+                val = atof(tmp[3]);
+                control->Tau_P[2] = val * 1.e-3;   // convert p_mass from fs to ps
+            }
+        }
+        else if ( strcmp(tmp[0], "pt_mass") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->Tau_PT = val * 1.e-3;  // convert pt_mass from fs to ps
+        }
+        else if ( strcmp(tmp[0], "compress") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->compressibility = val;
+        }
+        else if ( strcmp(tmp[0], "press_mode") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->press_mode = ival;
+        }
+        else if ( strcmp(tmp[0], "remove_CoM_vel") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->remove_CoM_vel = ival;
+        }
+        else if ( strcmp(tmp[0], "debug_level") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->debug_level = ival;
+        }
+        else if ( strcmp(tmp[0], "energy_update_freq") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->energy_update_freq = ival;
+        }
+        else if ( strcmp(tmp[0], "write_freq") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->write_steps = ival;
+        }
+        else if ( strcmp(tmp[0], "traj_compress") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->traj_compress = ival;
+
+            if ( out_control->traj_compress )
+                out_control->write = (int (*)(FILE *, const char *, ...)) gzprintf;
+            else out_control->write = fprintf;
+        }
+        else if ( strcmp(tmp[0], "traj_format") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->traj_format = ival;
+
+            if ( out_control->traj_format == 0 )
+            {
+                out_control->write_header =
+                    (int (*)( reax_system*, control_params*,
+                              static_storage*, void* )) Write_Custom_Header;
+                out_control->append_traj_frame =
+                    (int (*)(reax_system*, control_params*, simulation_data*,
+                             static_storage*, list **, void*)) Append_Custom_Frame;
+            }
+            else if ( out_control->traj_format == 1 )
+            {
+                out_control->write_header =
+                    (int (*)( reax_system*, control_params*,
+                              static_storage*, void* )) Write_xyz_Header;
+                out_control->append_traj_frame =
+                    (int (*)( reax_system*,  control_params*, simulation_data*,
+                              static_storage*, list **, void* )) Append_xyz_Frame;
+            }
+        }
+        else if ( strcmp(tmp[0], "traj_title") == 0 )
+        {
+            strcpy( out_control->traj_title, tmp[1] );
+        }
+        else if ( strcmp(tmp[0], "atom_info") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->atom_format += ival * 4;
+        }
+        else if ( strcmp(tmp[0], "atom_velocities") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->atom_format += ival * 2;
+        }
+        else if ( strcmp(tmp[0], "atom_forces") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->atom_format += ival * 1;
+        }
+        else if ( strcmp(tmp[0], "bond_info") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->bond_info = ival;
+        }
+        else if ( strcmp(tmp[0], "angle_info") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->angle_info = ival;
+        }
+        else if ( strcmp(tmp[0], "test_forces") == 0 )
+        {
+            /* the value is parsed but not stored anywhere */
+            ival = atoi(tmp[1]);
+        }
+        else if ( strcmp(tmp[0], "molec_anal") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->molec_anal = ival;
+        }
+        else if ( strcmp(tmp[0], "freq_molec_anal") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->freq_molec_anal = ival;
+        }
+        else if ( strcmp(tmp[0], "bond_graph_cutoff") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->bg_cut = val;
+        }
+        else if ( strcmp(tmp[0], "ignore") == 0 )
+        {
+            control->num_ignored = atoi(tmp[1]);
+            /* the type indices follow the count: tmp[2] .. tmp[num_ignored + 1] */
+            Check_Token_Count( c, control->num_ignored + 2, tmp[0] );
+            for ( i = 0; i < control->num_ignored; ++i )
+            {
+                ival = atoi(tmp[i + 2]);
+                /* ignore[] is cleared above as MAX_ATOM_TYPES ints */
+                if ( ival < 0 || ival >= MAX_ATOM_TYPES )
+                {
+                    fprintf( stderr, "ERROR: ignored atom type %d is out of range\n",
+                             ival );
+                    exit( INVALID_INPUT );
+                }
+                control->ignore[ival] = 1;
+            }
+        }
+        else if ( strcmp(tmp[0], "dipole_anal") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->dipole_anal = ival;
+        }
+        else if ( strcmp(tmp[0], "freq_dipole_anal") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->freq_dipole_anal = ival;
+        }
+        else if ( strcmp(tmp[0], "diffusion_coef") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->diffusion_coef = ival;
+        }
+        else if ( strcmp(tmp[0], "freq_diffusion_coef") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->freq_diffusion_coef = ival;
+        }
+        else if ( strcmp(tmp[0], "restrict_type") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->restrict_type = ival;
+        }
+        else
+        {
+            fprintf( stderr, "WARNING: unknown parameter %s\n", tmp[0] );
+            exit( UNKNOWN_OPTION );
+        }
+    }
+
+    if (ferror(fp))
+    {
+        fprintf(stderr, "Error reading control file. Terminating.\n");
+        exit( INVALID_INPUT );
+    }
+
+    /* determine target T */
+    if ( control->T_mode == 0 )
+        control->T = control->T_final;
+    else control->T = control->T_init;
+
+
+    /* near neighbor and far neighbor cutoffs */
+    control->bo_cut = 0.01 * system->reaxprm.gp.l[29];
+    control->r_low  = system->reaxprm.gp.l[11];
+    control->r_cut  = system->reaxprm.gp.l[12];
+    control->r_sp_cut  = control->r_cut * control->qeq_domain_sparsity;
+    /* vlist_cut so far holds only the vlist_buffer value (default 0) */
+    control->vlist_cut += control->r_cut;
+
+    system->g.cell_size = control->vlist_cut / 2.;
+    for ( i = 0; i < 3; ++i )
+    {
+        system->g.spread[i] = 2;
+    }
+
+    /* free memory allocations at the top */
+    for ( i = 0; i < MAX_TOKENS; i++ )
+    {
+        free( tmp[i] );
+    }
+    free( tmp );
+    free( s );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr,
+             "en=%d steps=%d dt=%.5f opt=%d T=%.5f P=%.5f %.5f %.5f\n",
+             control->ensemble, control->nsteps, control->dt, control->tabulate,
+             control->T, control->P[0], control->P[1], control->P[2] );
+
+    fprintf(stderr, "control file read\n" );
+#endif
+
+    return SUCCESS;
+}
diff --git a/PuReMD-GPU/src/control.h b/PuReMD-GPU/src/control.h
new file mode 100644
index 0000000000000000000000000000000000000000..66d0dde7b4901d7a7b42512414328a8e6b256d83
--- /dev/null
+++ b/PuReMD-GPU/src/control.h
@@ -0,0 +1,29 @@
+/*----------------------------------------------------------------------
+  SerialReax - Reax Force Field Simulator
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CONTROL_H_
+#define __CONTROL_H_
+
+#include "mytypes.h"
+
+char Read_Control_File( FILE*, reax_system*, control_params*, output_controls* );
+
+#endif
diff --git a/PuReMD-GPU/src/cuda_forces.cu b/PuReMD-GPU/src/cuda_forces.cu
index bf277b391ce0df0c5336ea0a0653b6863ca14fec..754668c9d9be6601aa9e9a649eb920899e39833c 100644
--- a/PuReMD-GPU/src/cuda_forces.cu
+++ b/PuReMD-GPU/src/cuda_forces.cu
@@ -36,10 +36,10 @@
 #include "cuda_three_body_interactions.h"
 #include "cuda_four_body_interactions.h"
 #include "cuda_list.h"
-#include "cuda_QEq.h"
+#include "cuda_qeq.h"
 #include "cuda_reduction.h"
 #include "cuda_system_props.h"
-#include "validation.h"
+#include "cuda_validation.h"
 
 #include "cudaProfiler.h"
 
diff --git a/PuReMD-GPU/src/cuda_init_md.cu b/PuReMD-GPU/src/cuda_init_md.cu
index 1a205506e4c5ff767e02398a3859f838818c1e1a..f0252a2f564a035d7317ea0698712318ee96df5e 100644
--- a/PuReMD-GPU/src/cuda_init_md.cu
+++ b/PuReMD-GPU/src/cuda_init_md.cu
@@ -49,7 +49,7 @@
 #include "cuda_reduction.h"
 #include "cuda_reset_utils.h"
 #include "cuda_system_props.h"
-#include "validation.h"
+#include "cuda_validation.h"
 
 
 void Cuda_Init_System( reax_system *system, control_params *control, 
@@ -116,7 +116,9 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
     Reset_Simulation_Data( data );
 
     if( !control->restart )  
+    {
         data->step = data->prev_steps = 0;
+    }
 
     switch( control->ensemble ) {
         case NVE:
@@ -124,16 +126,18 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
             *Evolve = Cuda_Velocity_Verlet_NVE;
             break;
 
-
         case NVT:
             data->N_f = 3 * system->N + 1;
             //control->Tau_T = 100 * data->N_f * K_B * control->T_final;
-            if( !control->restart || (control->restart && control->random_vel) ) {
+
+            if( !control->restart || (control->restart && control->random_vel) )
+            {
                 data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
                         data->N_f * K_B * control->T );
                 data->therm.v_xi = data->therm.G_xi * control->dt;
                 data->therm.v_xi_old = 0;
                 data->therm.xi = 0;
+
 #if defined(DEBUG_FOCUS)
                 fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
                         data->therm.G_xi, control->Tau_T, data->E_Kin, 
@@ -144,12 +148,13 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
             *Evolve = Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein;
             break;
 
-
         case NPT: // Anisotropic NPT
             fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
             exit( UNKNOWN_OPTION );
             data->N_f = 3 * system->N + 9;
-            if( !control->restart ) {
+
+            if( !control->restart )
+            {
                 data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
                         data->N_f * K_B * control->T );
                 data->therm.v_xi = data->therm.G_xi * control->dt;
@@ -160,7 +165,6 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
             *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
             break;
 
-
         case sNPT: // Semi-Isotropic NPT
             fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
             exit( UNKNOWN_OPTION );
@@ -168,7 +172,6 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
             *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
             break;
 
-
         case iNPT: // Isotropic NPT
             fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
             exit( UNKNOWN_OPTION );
diff --git a/PuReMD-GPU/src/cuda_integrate.cu b/PuReMD-GPU/src/cuda_integrate.cu
index cba0b79c39b4f9b66e5b506d11dcffb81adc488d..ab4d203139e4a8235d9035ab566fbdca7ded5c82 100644
--- a/PuReMD-GPU/src/cuda_integrate.cu
+++ b/PuReMD-GPU/src/cuda_integrate.cu
@@ -36,10 +36,10 @@
 #include "cuda_forces.h"
 #include "cuda_grid.h"
 #include "cuda_neighbors.h"
-#include "cuda_QEq.h"
+#include "cuda_qeq.h"
 #include "cuda_reset_utils.h"
 #include "cuda_system_props.h"
-#include "validation.h"
+#include "cuda_validation.h"
 
 
 GLOBAL void Cuda_Velocity_Verlet_NVE_atoms1 (reax_atom *atoms, 
diff --git a/PuReMD-GPU/src/cuda_neighbors.cu b/PuReMD-GPU/src/cuda_neighbors.cu
index 876b6b9913e4d825e0cc8be5a2fc1d092c56d9f8..5cfe03dea5f5c314a7dfb60ef3d4df2e6c8f61ae 100644
--- a/PuReMD-GPU/src/cuda_neighbors.cu
+++ b/PuReMD-GPU/src/cuda_neighbors.cu
@@ -265,11 +265,10 @@ GLOBAL void k_Generate_Neighbor_Lists ( reax_atom *sys_atoms,
             nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
             max = top [index_grid_3d(x, y, z, &g)];
 
-            for (m = 0; m < max; m++) {
+            for (m = 0; m < max; m++)
+            {
                 atom2 = nbr_atoms[m];
 
-                //nbr_data = & ( far_nbrs.select.far_nbr_list[atom1 * g.max_cuda_nbrs + num_far] );
-
                 //CHANGE ORIGINAL
                 /*
                    if (atom1 > atom2) {
diff --git a/PuReMD-GPU/src/cuda_QEq.cu b/PuReMD-GPU/src/cuda_qeq.cu
similarity index 99%
rename from PuReMD-GPU/src/cuda_QEq.cu
rename to PuReMD-GPU/src/cuda_qeq.cu
index 033945338aa76aa4c02909f90d2ca3eb1dacad58..21c2e334be0d6ec46d0a42f4c4f8a44dfc0ebffc 100644
--- a/PuReMD-GPU/src/cuda_QEq.cu
+++ b/PuReMD-GPU/src/cuda_qeq.cu
@@ -18,14 +18,15 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "cuda_QEq.h"
+#include "cuda_qeq.h"
 
-#include "QEq.h"
+#include "qeq.h"
 #include "allocate.h"
 #include "lin_alg.h"
 #include "list.h"
 #include "print_utils.h"
 #include "index_utils.h"
+#include "sort.h"
 #include "system_props.h"
 
 #include "cuda_copy.h"
@@ -33,9 +34,7 @@
 #include "cuda_utils.h"
 #include "cuda_lin_alg.h"
 #include "cuda_reduction.h"
-
-#include "sort.h"
-#include "validation.h"
+#include "cuda_validation.h"
 
 
 GLOBAL void Cuda_Sort_Matrix_Rows( sparse_matrix A )
diff --git a/PuReMD-GPU/src/cuda_QEq.h b/PuReMD-GPU/src/cuda_qeq.h
similarity index 100%
rename from PuReMD-GPU/src/cuda_QEq.h
rename to PuReMD-GPU/src/cuda_qeq.h
diff --git a/PuReMD-GPU/src/cuda_utils.cu b/PuReMD-GPU/src/cuda_utils.cu
index 1efcf28aa432f563749e49c68c67cc6b132e711e..6867857a4d58771f571fbb6810efbf926677f80f 100644
--- a/PuReMD-GPU/src/cuda_utils.cu
+++ b/PuReMD-GPU/src/cuda_utils.cu
@@ -29,7 +29,7 @@ cusparseMatDescr_t matdescriptor;
 
 void cuda_malloc( void **ptr, int size, int memset, int err_code )
 {
-    cudaError_t retVal = cudaSuccess;
+    cudaError_t retVal;
 
     //fprintf (stderr, "&ptr --. %ld \n", &ptr);
     //fprintf (stderr, "ptr --> %ld \n", ptr );
@@ -45,7 +45,8 @@ void cuda_malloc( void **ptr, int size, int memset, int err_code )
     //fprintf (stderr, "&ptr --. %ld \n", &ptr);
     //fprintf (stderr, "ptr --> %ld \n", ptr );
 
-    if ( memset ) {
+    if ( memset )
+    {
         retVal = cudaMemset( *ptr, 0, size );
         if ( retVal != cudaSuccess )
         {
@@ -59,8 +60,12 @@ void cuda_malloc( void **ptr, int size, int memset, int err_code )
 
 void cuda_free( void *ptr, int err_code )
 {
-    cudaError_t retVal = cudaSuccess;
-    if (!ptr) return;
+    cudaError_t retVal;
+
+    if (!ptr)
+    {
+        return;
+    }
 
     retVal = cudaFree( ptr );
 
@@ -75,9 +80,10 @@ void cuda_free( void *ptr, int err_code )
 
 void cuda_memset( void *ptr, int data, size_t count, int err_code )
 {
-    cudaError_t retVal = cudaSuccess;
+    cudaError_t retVal;
 
     retVal = cudaMemset( ptr, data, count );
+
     if (retVal != cudaSuccess) {
         fprintf( stderr, "ptr passed is %ld, value: %ld \n", ptr, &ptr );
         fprintf( stderr, " size to memset: %d \n", count );
@@ -91,7 +97,7 @@ void cuda_memset( void *ptr, int data, size_t count, int err_code )
 
 void copy_host_device( void *host, void *dev, int size, enum cudaMemcpyKind dir, int resid )
 {
-    cudaError_t retVal = cudaErrorNotReady;
+    cudaError_t retVal;
 
     if ( dir == cudaMemcpyHostToDevice )
     {
@@ -112,9 +118,10 @@ void copy_host_device( void *host, void *dev, int size, enum cudaMemcpyKind dir,
 
 void copy_device( void *dest, void *src, int size, int resid )
 {
-    cudaError_t retVal = cudaErrorNotReady;
+    cudaError_t retVal;
 
     retVal = cudaMemcpy( dest, src, size, cudaMemcpyDeviceToDevice );
+
     if ( retVal != cudaSuccess )
     {
         fprintf( stderr, "could not copy resource %d from host to device: reason %d \n",
@@ -134,6 +141,7 @@ void compute_blocks( int *blocks, int *block_size, int count )
 void compute_nearest_pow_2( int blocks, int *result )
 {
     int power = 1;
+
     while (power < blocks)
     {
         power *= 2;
@@ -146,7 +154,9 @@ void compute_nearest_pow_2( int blocks, int *result )
 void print_device_mem_usage( )
 {
     size_t total, free;
+
     cudaMemGetInfo( &free, &total );
+
     if ( cudaGetLastError() != cudaSuccess )
     {
         fprintf( stderr, "Error on the memory call \n" );
diff --git a/PuReMD-GPU/src/validation.cu b/PuReMD-GPU/src/cuda_validation.cu
similarity index 99%
rename from PuReMD-GPU/src/validation.cu
rename to PuReMD-GPU/src/cuda_validation.cu
index 21cd2145e689621ee0b3827889b106ed7c05af7f..b5348eba0871017a7225fc311e98d95531136792 100644
--- a/PuReMD-GPU/src/validation.cu
+++ b/PuReMD-GPU/src/cuda_validation.cu
@@ -18,13 +18,13 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "validation.h"
+#include "cuda_validation.h"
 
-#include "cuda_utils.h"
+#include "index_utils.h"
 #include "list.h"
-
 #include "sort.h"
-#include "index_utils.h"
+
+#include "cuda_utils.h"
 
 
 int check_zero (real p1, real p2)
diff --git a/PuReMD-GPU/src/validation.h b/PuReMD-GPU/src/cuda_validation.h
similarity index 100%
rename from PuReMD-GPU/src/validation.h
rename to PuReMD-GPU/src/cuda_validation.h
diff --git a/PuReMD-GPU/src/param.c b/PuReMD-GPU/src/ffield.c
similarity index 58%
rename from PuReMD-GPU/src/param.c
rename to PuReMD-GPU/src/ffield.c
index 42e9ef612ec6c81d593f74acd796a7f564f96896..a5377e6f2b75dc6c6bec586d36517127bca40d50 100644
--- a/PuReMD-GPU/src/param.c
+++ b/PuReMD-GPU/src/ffield.c
@@ -1,9 +1,10 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
@@ -18,85 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "param.h"
-#include "traj.h"
-#include "ctype.h"
-
-
-int Get_Atom_Type( reax_interaction *reaxprm, char *s )
-{
-    int i;
-
-    for ( i = 0; i < reaxprm->num_atom_types; ++i )
-        if ( !strcmp( reaxprm->sbp[i].name, s ) )
-            return i;
-
-    fprintf( stderr, "Unknown atom type %s. Terminating...\n", s );
-    exit( UNKNOWN_ATOM_TYPE_ERR );
-}
-
-
-int Tokenize(char* s, char*** tok)
-{
-    char test[MAX_LINE];
-    char *sep = "\t \n!=";
-    char *word;
-    int count = 0;
-
-    strncpy( test, s, MAX_LINE );
-
-    // fprintf( stderr, "|%s|\n", test );
-
-    for ( word = strtok(test, sep); word; word = strtok(NULL, sep) )
-    {
-        strncpy( (*tok)[count], word, MAX_LINE );
-        count++;
-    }
-
-    return count;
-}
-
-
-/* Initialize Taper params */
-void Init_Taper( control_params *control )
-{
-    real d1, d7;
-    real swa, swa2, swa3;
-    real swb, swb2, swb3;
-
-    swa = control->r_low;
-    swb = control->r_cut;
-
-    if ( fabs( swa ) > 0.01 )
-        fprintf( stderr, "Warning: non-zero value for lower Taper-radius cutoff\n" );
-
-    if ( swb < 0 )
-    {
-        fprintf( stderr, "Negative value for upper Taper-radius cutoff\n" );
-        exit( INVALID_INPUT );
-    }
-    else if ( swb < 5 )
-        fprintf( stderr, "Warning: low value for upper Taper-radius cutoff:%f\n",
-                 swb );
-
-    d1 = swb - swa;
-    d7 = POW( d1, 7.0 );
-    swa2 = SQR( swa );
-    swa3 = CUBE( swa );
-    swb2 = SQR( swb );
-    swb3 = CUBE( swb );
-
-    control->Tap7 =  20.0 / d7;
-    control->Tap6 = -70.0 * (swa + swb) / d7;
-    control->Tap5 =  84.0 * (swa2 + 3.0 * swa * swb + swb2) / d7;
-    control->Tap4 = -35.0 * (swa3 + 9.0 * swa2 * swb + 9.0 * swa * swb2 + swb3 ) / d7;
-    control->Tap3 = 140.0 * (swa3 * swb + 3.0 * swa2 * swb2 + swa * swb3 ) / d7;
-    control->Tap2 = -210.0 * (swa3 * swb2 + swa2 * swb3) / d7;
-    control->Tap1 = 140.0 * swa3 * swb3 / d7;
-    control->Tap0 = (-35.0 * swa3 * swb2 * swb2 + 21.0 * swa2 * swb3 * swb2 +
-                     7.0 * swa * swb3 * swb3 + swb3 * swb3 * swb ) / d7;
-}
+#include <ctype.h>
 
+#include "ffield.h"
+#include "tool_box.h"
 
 
 char Read_Force_Field( FILE* fp, reax_interaction* reax )
@@ -106,20 +32,20 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
     char *tor_flag;
     int c, i, j, k, l, m, n, o, p, cnt;
     real val;
-
     int __N;
     int index1, index2;
 
     s = (char*) malloc(sizeof(char) * MAX_LINE);
     tmp = (char**) malloc(sizeof(char*)*MAX_TOKENS);
     for (i = 0; i < MAX_TOKENS; i++)
+    {
         tmp[i] = (char*) malloc(sizeof(char) * MAX_TOKEN_LEN);
+    }
 
 
     /* reading first header comment */
     fgets( s, MAX_LINE, fp );
 
-
     /* line 2 is number of global parameters */
     fgets( s, MAX_LINE, fp );
     c = Tokenize( s, &tmp );
@@ -129,7 +55,7 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
     if (n < 1)
     {
         fprintf( stderr, "WARNING: number of globals in ffield file is 0!\n" );
-        return 1;
+        exit( INVALID_INPUT );
     }
 
     reax->gp.n_global = n;
@@ -146,20 +72,17 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         reax->gp.l[i] = val;
     }
 
-
     /* next line is number of atom types and some comments */
     fgets( s, MAX_LINE, fp );
     c = Tokenize( s, &tmp );
     reax->num_atom_types = atoi(tmp[0]);
     __N = reax->num_atom_types;
 
-
     /* 3 lines of comments */
     fgets(s, MAX_LINE, fp);
     fgets(s, MAX_LINE, fp);
     fgets(s, MAX_LINE, fp);
 
-
     /* Allocating structures in reax_interaction */
     reax->sbp = (single_body_parameters*)
                 calloc( reax->num_atom_types, sizeof(single_body_parameters) );
@@ -194,7 +117,9 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         c = Tokenize( s, &tmp );
 
         for ( j = 0; j < strlen( tmp[0] ); ++j )
+        {
             reax->sbp[i].name[j] = toupper( tmp[0][j] );
+        }
 
         val = atof(tmp[1]);
         reax->sbp[i].r_s        = val;
@@ -281,6 +206,7 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
             if ( reax->sbp[i].gamma_w > 0.5 ) // Shielding vdWaals
             {
                 if ( reax->gp.vdw_type != 0 && reax->gp.vdw_type != 3 )
+                {
                     fprintf( stderr, "Warning: inconsistent vdWaals-parameters\n" \
                              "Force field parameters for element %s\n"        \
                              "indicate inner wall+shielding, but earlier\n"   \
@@ -288,9 +214,11 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
                              "This may cause division-by-zero errors.\n"      \
                              "Keeping vdWaals-setting for earlier atoms.\n",
                              reax->sbp[i].name );
+                }
                 else
                 {
                     reax->gp.vdw_type = 3;
+
 #if defined(DEBUG)
                     fprintf( stderr, "vdWaals type for element %s: Shielding+inner-wall",
                              reax->sbp[i].name );
@@ -300,6 +228,7 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
             else    // No shielding vdWaals parameters present
             {
                 if ( reax->gp.vdw_type != 0 && reax->gp.vdw_type != 2 )
+                {
                     fprintf( stderr, "Warning: inconsistent vdWaals-parameters\n" \
                              "Force field parameters for element %s\n"        \
                              "indicate inner wall without shielding, but earlier\n" \
@@ -307,9 +236,11 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
                              "This may cause division-by-zero errors.\n"      \
                              "Keeping vdWaals-setting for earlier atoms.\n",
                              reax->sbp[i].name );
+                }
                 else
                 {
                     reax->gp.vdw_type = 2;
+
 #if defined(DEBUG)
                     fprintf( stderr, "vdWaals type for element%s: No Shielding,inner-wall",
                              reax->sbp[i].name );
@@ -348,7 +279,6 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         }
     }
 
-
     /* next line is number of two body combination and some comments */
     fgets(s, MAX_LINE, fp);
     c = Tokenize(s, &tmp);
@@ -430,6 +360,7 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
     /* calculating combination rules and filling up remaining fields. */
 
     for (i = 0; i < reax->num_atom_types; i++)
+    {
         for (j = i; j < reax->num_atom_types; j++)
         {
             index1 = i * __N + j;
@@ -450,7 +381,6 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
             reax->tbp[index2].r_pp = 0.5 *
                                      (reax->sbp[j].r_pi_pi + reax->sbp[i].r_pi_pi);
 
-
             reax->tbp[index1].p_boc3 =
                 sqrt(reax->sbp[i].b_o_132 *
                      reax->sbp[j].b_o_132);
@@ -472,7 +402,6 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
                 sqrt(reax->sbp[j].b_o_133 *
                      reax->sbp[i].b_o_133);
 
-
             reax->tbp[index1].D =
                 sqrt(reax->sbp[i].epsilon *
                      reax->sbp[j].epsilon);
@@ -505,9 +434,8 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
             reax->tbp[index2].gamma =
                 POW(reax->sbp[j].gamma *
                     reax->sbp[i].gamma, -1.5);
-
         }
-
+    }
 
     /* next line is number of 2-body offdiagonal combinations and some comments */
     /* these are two body offdiagonal terms that are different from the
@@ -572,7 +500,6 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         }
     }
 
-
     /* 3-body parameters -
        supports multi-well potentials (upto MAX_3BODY_PARAM in mytypes.h) */
     /* clear entries first */
@@ -636,7 +563,6 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         }
     }
 
-
     /* 4-body parameters are entered in compact form. i.e. 0-X-Y-0
        correspond to any type of pair of atoms in 1 and 4
        position. However, explicit X-Y-Z-W takes precedence over the
@@ -647,13 +573,19 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
 
     /* clear all entries first */
     for ( i = 0; i < reax->num_atom_types; ++i )
+    {
         for ( j = 0; j < reax->num_atom_types; ++j )
+        {
             for ( k = 0; k < reax->num_atom_types; ++k )
+            {
                 for ( m = 0; m < reax->num_atom_types; ++m )
                 {
                     reax->fbp[i * __N * __N * __N + j * __N * __N + k * __N + m].cnt = 0;
                     tor_flag[i * __N * __N * __N + j * __N * __N + k * __N + m] = 0;
                 }
+            }
+        }
+    }
 
     /* next line is number of 4-body params and some comments */
     fgets( s, MAX_LINE, fp );
@@ -714,7 +646,9 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         else /* This means the entry is of the form 0-X-Y-0 */
         {
             if ( k < reax->num_atom_types && m < reax->num_atom_types )
+            {
                 for ( p = 0; p < reax->num_atom_types; p++ )
+                {
                     for ( o = 0; o < reax->num_atom_types; o++ )
                     {
                         index1 = p * __N * __N * __N + k * __N * __N + m * __N + o;
@@ -743,11 +677,12 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
                             reax->fbp[index2].prm[0].p_cot1 = atof(tmp[8]);
                         }
                     }
+                }
+            }
         }
     }
 
 
-
     /* next line is number of hydrogen bond params and some comments */
     fgets( s, MAX_LINE, fp );
     c = Tokenize( s, &tmp );
@@ -781,14 +716,14 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         }
     }
 
-
     /* deallocate helper storage */
     for ( i = 0; i < MAX_TOKENS; i++ )
+    {
         free( tmp[i] );
+    }
     free( tmp );
     free( s );
 
-
     /* deallocate tor_flag */
     free( tor_flag );
 
@@ -796,497 +731,5 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
     fprintf( stderr, "force field read\n" );
 #endif
 
-    return 0;
-}
-
-
-char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
-                        output_controls *out_control )
-{
-    char *s, **tmp;
-    int c, i;
-    real val;
-    int ival;
-
-    /* assign default values */
-    strcpy( control->sim_name, "default.sim" );
-
-    control->restart = 0;
-    out_control->restart_format = 1;
-    out_control->restart_freq = 0;
-    strcpy( control->restart_from, "default.res" );
-    out_control->restart_freq = 0;
-    control->random_vel = 0;
-
-    control->reposition_atoms = 0;
-
-    control->ensemble = 0;
-    control->nsteps = 0;
-    control->dt = 0.25;
-
-    control->geo_format = 1;
-    control->restrict_bonds = 0;
-
-    control->periodic_boundaries = 1;
-    control->periodic_images[0] = 0;
-    control->periodic_images[1] = 0;
-    control->periodic_images[2] = 0;
-
-    control->reneighbor = 1;
-    control->vlist_cut = 0;
-    control->nbr_cut = 4.;
-    control->r_cut = 10;
-    control->max_far_nbrs = 1000;
-    control->bo_cut = 0.01;
-    control->thb_cut = 0.001;
-    control->hb_cut = 7.50;
-
-    control->q_err = 0.000001;
-    control->tabulate = 0;
-    //TODO
-    control->refactor = 100;
-    //TODO -- change this to 5.
-
-    control->droptol = 0.01;
-
-    control->T_init = 0.;
-    control->T_final = 300.;
-    control->Tau_T = 1.0;
-    control->T_mode = 0.;
-    control->T_rate = 1.;
-    control->T_freq = 1.;
-
-    control->P[0] = 0.000101325;
-    control->P[1] = 0.000101325;
-    control->P[2] = 0.000101325;
-    control->Tau_P[0]  = 500.0;
-    control->Tau_P[1]  = 500.0;
-    control->Tau_P[2]  = 500.0;
-    control->Tau_PT = 500.0;
-    control->compressibility = 1.0;
-    control->press_mode = 0;
-
-    control->remove_CoM_vel = 25;
-
-    out_control->debug_level = 0;
-    out_control->energy_update_freq = 10;
-
-    out_control->write_steps = 100;
-    out_control->traj_compress = 0;
-    out_control->write = fprintf;
-    out_control->traj_format = 0;
-    out_control->write_header =
-        (int (*)( reax_system*, control_params*,
-                  static_storage*, void* )) Write_Custom_Header;
-    out_control->append_traj_frame =
-        (int (*)( reax_system*, control_params*, simulation_data*,
-                  static_storage*, list **, void* )) Append_Custom_Frame;
-
-    strcpy( out_control->traj_title, "default_title" );
-    out_control->atom_format = 0;
-    out_control->bond_info = 0;
-    out_control->angle_info = 0;
-
-    control->molec_anal = 0;
-    control->freq_molec_anal = 0;
-    control->bg_cut = 0.3;
-    control->num_ignored = 0;
-    memset( control->ignore, 0, sizeof(int)*MAX_ATOM_TYPES );
-
-    control->dipole_anal = 0;
-    control->freq_dipole_anal = 0;
-
-    control->diffusion_coef = 0;
-    control->freq_diffusion_coef = 0;
-    control->restrict_type = 0;
-
-    /* memory allocations */
-    s = (char*) malloc(sizeof(char) * MAX_LINE);
-    tmp = (char**) malloc(sizeof(char*)*MAX_TOKENS);
-    for (i = 0; i < MAX_TOKENS; i++)
-        tmp[i] = (char*) malloc(sizeof(char) * MAX_LINE);
-
-    /* read control parameters file */
-    while (!feof(fp))
-    {
-        fgets(s, MAX_LINE, fp);
-        c = Tokenize(s, &tmp);
-
-        if ( strcmp(tmp[0], "simulation_name") == 0 )
-        {
-            strcpy( control->sim_name, tmp[1] );
-        }
-        //else if( strcmp(tmp[0], "restart") == 0 ) {
-        //  ival = atoi(tmp[1]);
-        //  control->restart = ival;
-        //}
-        else if ( strcmp(tmp[0], "restart_format") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->restart_format = ival;
-        }
-        else if ( strcmp(tmp[0], "restart_freq") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->restart_freq = ival;
-        }
-        else if ( strcmp(tmp[0], "random_vel") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->random_vel = ival;
-        }
-        else if ( strcmp(tmp[0], "reposition_atoms") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->reposition_atoms = ival;
-        }
-        else if ( strcmp(tmp[0], "ensemble_type") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->ensemble = ival;
-        }
-        else if ( strcmp(tmp[0], "nsteps") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->nsteps = ival;
-        }
-        else if ( strcmp(tmp[0], "dt") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->dt = val * 1.e-3;  // convert dt from fs to ps!
-        }
-        else if ( strcmp(tmp[0], "periodic_boundaries") == 0 )
-        {
-            ival = atoi( tmp[1] );
-            control->periodic_boundaries = ival;
-        }
-        else if ( strcmp(tmp[0], "periodic_images") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->periodic_images[0] = ival;
-            ival = atoi(tmp[2]);
-            control->periodic_images[1] = ival;
-            ival = atoi(tmp[3]);
-            control->periodic_images[2] = ival;
-        }
-        else if ( strcmp(tmp[0], "geo_format") == 0 )
-        {
-            ival = atoi( tmp[1] );
-            control->geo_format = ival;
-        }
-        else if ( strcmp(tmp[0], "restrict_bonds") == 0 )
-        {
-            ival = atoi( tmp[1] );
-            control->restrict_bonds = ival;
-        }
-        else if ( strcmp(tmp[0], "tabulate_long_range") == 0 )
-        {
-            ival = atoi( tmp[1] );
-            control->tabulate = ival;
-        }
-        else if ( strcmp(tmp[0], "reneighbor") == 0 )
-        {
-            ival = atoi( tmp[1] );
-            control->reneighbor = ival;
-        }
-        else if ( strcmp(tmp[0], "vlist_buffer") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->vlist_cut = val;
-        }
-        else if ( strcmp(tmp[0], "nbrhood_cutoff") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->nbr_cut = val;
-        }
-        else if ( strcmp(tmp[0], "thb_cutoff") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->thb_cut = val;
-        }
-        else if ( strcmp(tmp[0], "hbond_cutoff") == 0 )
-        {
-            val = atof( tmp[1] );
-            control->hb_cut = val;
-        }
-        else if ( strcmp(tmp[0], "q_err") == 0 )
-        {
-            val = atof( tmp[1] );
-            control->q_err = val;
-        }
-        else if ( strcmp(tmp[0], "ilu_refactor") == 0 )
-        {
-            ival = atoi( tmp[1] );
-            control->refactor = ival;
-        }
-        else if ( strcmp(tmp[0], "ilu_droptol") == 0 )
-        {
-            val = atof( tmp[1] );
-            control->droptol = val;
-        }
-        else if ( strcmp(tmp[0], "temp_init") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->T_init = val;
-
-            if ( control->T_init < 0.001 )
-                control->T_init = 0.001;
-        }
-        else if ( strcmp(tmp[0], "temp_final") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->T_final = val;
-
-            if ( control->T_final < 0.1 )
-                control->T_final = 0.1;
-        }
-        else if ( strcmp(tmp[0], "t_mass") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->Tau_T = val * 1.e-3;    // convert t_mass from fs to ps
-        }
-        else if ( strcmp(tmp[0], "t_mode") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->T_mode = ival;
-        }
-        else if ( strcmp(tmp[0], "t_rate") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->T_rate = val;
-        }
-        else if ( strcmp(tmp[0], "t_freq") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->T_freq = val;
-        }
-        else if ( strcmp(tmp[0], "pressure") == 0 )
-        {
-            if ( control->ensemble == iNPT )
-            {
-                val = atof(tmp[1]);
-                control->P[0] = control->P[1] = control->P[2] = val;
-            }
-            else if ( control->ensemble == sNPT )
-            {
-                val = atof(tmp[1]);
-                control->P[0] = val;
-
-                val = atof(tmp[2]);
-                control->P[1] = val;
-
-                val = atof(tmp[3]);
-                control->P[2] = val;
-            }
-        }
-        else if ( strcmp(tmp[0], "p_mass") == 0 )
-        {
-            if ( control->ensemble == iNPT )
-            {
-                val = atof(tmp[1]);
-                control->Tau_P[0] = val * 1.e-3;   // convert p_mass from fs to ps
-            }
-            else if ( control->ensemble == sNPT )
-            {
-                val = atof(tmp[1]);
-                control->Tau_P[0] = val * 1.e-3;   // convert p_mass from fs to ps
-
-                val = atof(tmp[2]);
-                control->Tau_P[1] = val * 1.e-3;   // convert p_mass from fs to ps
-
-                val = atof(tmp[3]);
-                control->Tau_P[2] = val * 1.e-3;   // convert p_mass from fs to ps
-            }
-        }
-        else if ( strcmp(tmp[0], "pt_mass") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->Tau_PT = val * 1.e-3;  // convert pt_mass from fs to ps
-        }
-        else if ( strcmp(tmp[0], "compress") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->compressibility = val;
-        }
-        else if ( strcmp(tmp[0], "press_mode") == 0 )
-        {
-            val = atoi(tmp[1]);
-            control->press_mode = val;
-        }
-        else if ( strcmp(tmp[0], "remove_CoM_vel") == 0 )
-        {
-            val = atoi(tmp[1]);
-            control->remove_CoM_vel = val;
-        }
-        else if ( strcmp(tmp[0], "debug_level") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->debug_level = ival;
-        }
-        else if ( strcmp(tmp[0], "energy_update_freq") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->energy_update_freq = ival;
-        }
-        else if ( strcmp(tmp[0], "write_freq") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->write_steps = ival;
-        }
-        else if ( strcmp(tmp[0], "traj_compress") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->traj_compress = ival;
-
-            if ( out_control->traj_compress )
-                out_control->write = (int (*)(FILE *, const char *, ...)) gzprintf;
-            else out_control->write = fprintf;
-        }
-        else if ( strcmp(tmp[0], "traj_format") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->traj_format = ival;
-
-            if ( out_control->traj_format == 0 )
-            {
-                out_control->write_header =
-                    (int (*)( reax_system*, control_params*,
-                              static_storage*, void* )) Write_Custom_Header;
-                out_control->append_traj_frame =
-                    (int (*)(reax_system*, control_params*, simulation_data*,
-                             static_storage*, list **, void*)) Append_Custom_Frame;
-            }
-            else if ( out_control->traj_format == 1 )
-            {
-                out_control->write_header =
-                    (int (*)( reax_system*, control_params*,
-                              static_storage*, void* )) Write_xyz_Header;
-                out_control->append_traj_frame =
-                    (int (*)( reax_system*,  control_params*, simulation_data*,
-                              static_storage*, list **, void* )) Append_xyz_Frame;
-            }
-        }
-        else if ( strcmp(tmp[0], "traj_title") == 0 )
-        {
-            strcpy( out_control->traj_title, tmp[1] );
-        }
-        else if ( strcmp(tmp[0], "atom_info") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->atom_format += ival * 4;
-        }
-        else if ( strcmp(tmp[0], "atom_velocities") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->atom_format += ival * 2;
-        }
-        else if ( strcmp(tmp[0], "atom_forces") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->atom_format += ival * 1;
-        }
-        else if ( strcmp(tmp[0], "bond_info") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->bond_info = ival;
-        }
-        else if ( strcmp(tmp[0], "angle_info") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->angle_info = ival;
-        }
-        else if ( strcmp(tmp[0], "test_forces") == 0 )
-        {
-            ival = atoi(tmp[1]);
-        }
-        else if ( strcmp(tmp[0], "molec_anal") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->molec_anal = ival;
-        }
-        else if ( strcmp(tmp[0], "freq_molec_anal") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->freq_molec_anal = ival;
-        }
-        else if ( strcmp(tmp[0], "bond_graph_cutoff") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->bg_cut = val;
-        }
-        else if ( strcmp(tmp[0], "ignore") == 0 )
-        {
-            control->num_ignored = atoi(tmp[1]);
-            for ( i = 0; i < control->num_ignored; ++i )
-                control->ignore[atoi(tmp[i + 2])] = 1;
-        }
-        else if ( strcmp(tmp[0], "dipole_anal") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->dipole_anal = ival;
-        }
-        else if ( strcmp(tmp[0], "freq_dipole_anal") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->freq_dipole_anal = ival;
-        }
-        else if ( strcmp(tmp[0], "diffusion_coef") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->diffusion_coef = ival;
-        }
-        else if ( strcmp(tmp[0], "freq_diffusion_coef") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->freq_diffusion_coef = ival;
-        }
-        else if ( strcmp(tmp[0], "restrict_type") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->restrict_type = ival;
-        }
-        else
-        {
-            fprintf( stderr, "WARNING: unknown parameter %s\n", tmp[0] );
-            exit( 15 );
-        }
-    }
-
-
-    /* determine target T */
-    if ( control->T_mode == 0 )
-        control->T = control->T_final;
-    else control->T = control->T_init;
-
-
-    /* near neighbor and far neighbor cutoffs */
-    control->bo_cut = 0.01 * system->reaxprm.gp.l[29];
-    control->r_low  = system->reaxprm.gp.l[11];
-    control->r_cut  = system->reaxprm.gp.l[12];
-    control->vlist_cut += control->r_cut;
-
-    system->g.cell_size = control->vlist_cut / 2.;
-    for ( i = 0; i < 3; ++i )
-        system->g.spread[i] = 2;
-
-
-    /* Initialize Taper function */
-    Init_Taper( control );
-
-
-    /* free memory allocations at the top */
-    for ( i = 0; i < MAX_TOKENS; i++ )
-        free( tmp[i] );
-    free( tmp );
-    free( s );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr,
-             "en=%d steps=%d dt=%.5f opt=%d T=%.5f P=%.5f %.5f %.5f\n",
-             control->ensemble, control->nsteps, control->dt, control->tabulate,
-             control->T, control->P[0], control->P[1], control->P[2] );
-
-    fprintf(stderr, "control file read\n" );
-#endif
-    return 0;
+    return SUCCESS;
 }
diff --git a/PuReMD-GPU/src/param.h b/PuReMD-GPU/src/ffield.h
similarity index 67%
rename from PuReMD-GPU/src/param.h
rename to PuReMD-GPU/src/ffield.h
index 2b24b056983233840966a8de29ce902ca6beb981..4aaf32a644861b069e8cf87e2eec68aadf4d3c84 100644
--- a/PuReMD-GPU/src/param.h
+++ b/PuReMD-GPU/src/ffield.h
@@ -1,9 +1,10 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
@@ -18,24 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __PARAM_H_
-#define __PARAM_H_
+#ifndef __FFIELD_H_
+#define __FFIELD_H_
 
 #include "mytypes.h"
-
-#define MAX_LINE 1024
-#define MAX_TOKENS 20
-#define MAX_TOKEN_LEN 1024
-
-
-int Get_Atom_Type( reax_interaction*, char* );
-
-int Tokenize( char*, char*** );
-
 char Read_Force_Field( FILE*, reax_interaction* );
 
-char Read_Control_File( FILE*, reax_system*, control_params*,
-        output_controls* );
-
-
 #endif
diff --git a/PuReMD-GPU/src/forces.c b/PuReMD-GPU/src/forces.c
index c95d4896e32f60e954d79b0b623520afb042e9ea..debe6ac171f281a9f09a77c2c17ec1d315bc7f9b 100644
--- a/PuReMD-GPU/src/forces.c
+++ b/PuReMD-GPU/src/forces.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -26,30 +27,31 @@
 #include "two_body_interactions.h"
 #include "three_body_interactions.h"
 #include "four_body_interactions.h"
+#include "index_utils.h"
 #include "list.h"
 #include "print_utils.h"
+#include "qeq.h"
 #include "system_props.h"
-#include "QEq.h"
+#include "tool_box.h"
 #include "vector.h"
-#include "index_utils.h"
 
 
-void Dummy_Interaction( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+void Dummy_Interaction( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
 }
 
 
 void Init_Bonded_Force_Functions( control_params *control )
-{ 
+{
     Interaction_Functions[0] = Calculate_Bond_Orders;
     Interaction_Functions[1] = Bond_Energy;  //*/Dummy_Interaction;
     Interaction_Functions[2] = LonePair_OverUnder_Coordination_Energy;
     //*/Dummy_Interaction;
     Interaction_Functions[3] = Three_Body_Interactions; //*/Dummy_Interaction;
     Interaction_Functions[4] = Four_Body_Interactions;  //*/Dummy_Interaction;
-    if( control->hb_cut > 0 )
+    if ( control->hb_cut > 0 )
         Interaction_Functions[5] = Hydrogen_Bonds; //*/Dummy_Interaction;
     else Interaction_Functions[5] = Dummy_Interaction;
     Interaction_Functions[6] = Dummy_Interaction; //empty
@@ -59,127 +61,123 @@ void Init_Bonded_Force_Functions( control_params *control )
 }
 
 
-void Compute_Bonded_Forces( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists, output_controls *out_control )
+void Compute_Bonded_Forces( reax_system *system, control_params *control,
+                            simulation_data *data, static_storage *workspace,
+                            list **lists, output_controls *out_control )
 {
 
     int i;
-    real t_start, t_elapsed;
+    // real t_start, t_end, t_elapsed;
 
 #ifdef TEST_ENERGY
     /* Mark beginning of a new timestep in each energy file */
-    fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "bo", "ebond", "total" );
-    fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n", 
-            data->step, "atom", "nlp", "elp", "total" );
-    fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n", 
-            data->step, "atom", "eov", "total" );
-    fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n", 
-            data->step, "atom", "eun", "total" );
-    fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", 
-            "angle", "bo(12)", "bo(23)", "eval", "epen", "total" );
-    fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", 
-            "angle", "bo(12)", "bo(23)", "epen", "total" );
-    fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", 
-            "angle", "bo(12)", "bo(23)", "ecoa", "total" );
-    fprintf( out_control->ehb,  "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", 
-            "r(23)", "angle", "bo(12)", "ehb", "total" );
-    fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", "atom4", 
-            "phi", "bo(23)", "etor", "total" );
+    fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n",
+             data->step, "atom1", "atom2", "bo", "ebond", "total" );
+    fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n",
+             data->step, "atom", "nlp", "elp", "total" );
+    fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n",
+             data->step, "atom", "eov", "total" );
+    fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n",
+             data->step, "atom", "eun", "total" );
+    fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n",
+             data->step, "atom1", "atom2", "atom3",
+             "angle", "bo(12)", "bo(23)", "eval", "epen", "total" );
+    fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n",
+             data->step, "atom1", "atom2", "atom3",
+             "angle", "bo(12)", "bo(23)", "epen", "total" );
+    fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n",
+             data->step, "atom1", "atom2", "atom3",
+             "angle", "bo(12)", "bo(23)", "ecoa", "total" );
+    fprintf( out_control->ehb,  "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n",
+             data->step, "atom1", "atom2", "atom3",
+             "r(23)", "angle", "bo(12)", "ehb", "total" );
+    fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n",
+             data->step, "atom1", "atom2", "atom3", "atom4",
+             "phi", "bo(23)", "etor", "total" );
     fprintf( out_control->econ, "step:%d\n%6s%6s%6s%6s%12s%12s%12s%12s%12s%12s\n",
-            data->step, "atom1", "atom2", "atom3", "atom4", 
-            "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" );
-#endif 
-
-    /* Implement all the function calls as function pointers */
-    for( i = 0; i < NO_OF_INTERACTIONS; i++ ) {
-        //for( i = 0; i < 5; i++ ) {
-        t_start = Get_Time ();
-        (Interaction_Functions[i])(system, control, data, workspace, 
-                lists, out_control);
-        t_elapsed = Get_Timing_Info ( t_start );
-
-#ifdef __DEBUG_CUDA__
-        fprintf( stderr, "function %d tme %lf - \n", i, t_elapsed );
+             data->step, "atom1", "atom2", "atom3", "atom4",
+             "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" );
 #endif
 
+    /* Implement all the function calls as function pointers */
+    for ( i = 0; i < NO_OF_INTERACTIONS; i++ )
+    {
+        (Interaction_Functions[i])(system, control, data, workspace,
+                                   lists, out_control);
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "f%d-", i );
 #endif
 #ifdef TEST_FORCES
-        (Print_Interactions[i])(system, control, data, workspace, 
-                lists, out_control);
+        (Print_Interactions[i])(system, control, data, workspace,
+                                lists, out_control);
 #endif
     }
 }
 
 
-void Compute_NonBonded_Forces( reax_system *system, control_params *control, 
-        simulation_data *data,static_storage *workspace,
-        list** lists, output_controls *out_control )
+void Compute_NonBonded_Forces( reax_system *system, control_params *control,
+                               simulation_data *data, static_storage *workspace,
+                               list** lists, output_controls *out_control )
 {
     real t_start, t_elapsed;
 #ifdef TEST_ENERGY
     fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n",
-            data->step, "atom1", "atom2", "r12", "evdw", "total" );
+             data->step, "atom1", "atom2", "r12", "evdw", "total" );
     fprintf( out_control->ecou, "step: %d\n%6s%6s%12s%12s%12s%12s%12s\n",
-            data->step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" );
+             data->step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" );
 #endif
 
     t_start = Get_Time( );
     QEq( system, control, data, workspace, lists[FAR_NBRS], out_control );
     t_elapsed = Get_Timing_Info( t_start );
     data->timing.QEq += t_elapsed;
-
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "qeq - " );
 #endif
 
     if ( control->tabulate == 0)
+    {
         vdW_Coulomb_Energy( system, control, data, workspace, lists, out_control );
+    }
     else
-        Tabulated_vdW_Coulomb_Energy( system, control, data, workspace, 
-                lists, out_control );
-
+    {
+        Tabulated_vdW_Coulomb_Energy( system, control, data, workspace,
+                                      lists, out_control );
+    }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "nonb forces - " );
 #endif
 
 #ifdef TEST_FORCES
-    Print_vdW_Coulomb_Forces( system, control, data, workspace, 
-            lists, out_control );
+    Print_vdW_Coulomb_Forces( system, control, data, workspace,
+                              lists, out_control );
 #endif
 }
 
 
-/* This version of Compute_Total_Force computes forces from coefficients 
+/* This version of Compute_Total_Force computes forces from coefficients
    accumulated by all interaction functions. Saves enormous time & space! */
-void Compute_Total_Force( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists )
+void Compute_Total_Force( reax_system *system, control_params *control,
+                          simulation_data *data, static_storage *workspace,
+                          list **lists )
 {
     int i, pj;
     list *bonds = (*lists) + BONDS;
 
-    for( i = 0; i < system->N; ++i )
-        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
-            if( i < bonds->select.bond_list[pj].nbr ) {
-                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT)
+    for ( i = 0; i < system->N; ++i )
+        for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+            if ( i < bonds->select.bond_list[pj].nbr )
+            {
+                if ( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT)
                     Add_dBond_to_Forces( i, pj, system, data, workspace, lists );
-                else 
+                else
                     Add_dBond_to_Forces_NPT( i, pj, system, data, workspace, lists );
             }
 }
 
 
 void Validate_Lists( static_storage *workspace, list **lists, int step, int n,
-        int Hmax, int Htop, int num_bonds, int num_hbonds )
+                     int Hmax, int Htop, int num_bonds, int num_hbonds )
 {
     int i, flag;
     list *bonds, *hbonds;
@@ -188,92 +186,104 @@ void Validate_Lists( static_storage *workspace, list **lists, int step, int n,
     hbonds = *lists + HBONDS;
 
     /* far neighbors */
-    if( Htop > Hmax * DANGER_ZONE ) {
+    if ( Htop > Hmax * DANGER_ZONE )
+    {
         workspace->realloc.Htop = Htop;
-        if( Htop > Hmax ) {
-            fprintf( stderr, 
-                    "step%d - ran out of space on H matrix: Htop=%d, max = %d",
-                    step, Htop, Hmax );
-            exit(INSUFFICIENT_SPACE);
+        if ( Htop > Hmax )
+        {
+            fprintf( stderr, /* fatal: H matrix overflow; newline-terminated like the other checks */
+                     "step%d - ran out of space on H matrix: Htop=%d, max = %d\n",
+                     step, Htop, Hmax );
+            exit( INSUFFICIENT_MEMORY );
         }
     }
 
     /* bond list */
     flag = -1;
     workspace->realloc.num_bonds = num_bonds;
-    for( i = 0; i < n-1; ++i )
-        if( End_Index(i, bonds) >= Start_Index(i+1, bonds)-2 ) {
+    for ( i = 0; i < n - 1; ++i )
+        if ( End_Index(i, bonds) >= Start_Index(i + 1, bonds) - 2 )
+        {
             workspace->realloc.bonds = 1;
-            if( End_Index(i, bonds) > Start_Index(i+1, bonds) )
+            if ( End_Index(i, bonds) > Start_Index(i + 1, bonds) )
                 flag = i;
         }
 
-    if( flag > -1 ) {
+    if ( flag > -1 )
+    {
         fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
-                step, flag, End_Index(flag,bonds), Start_Index(flag+1,bonds) );
-        exit(INSUFFICIENT_SPACE);
-    }    
+                 step, flag, End_Index(flag, bonds), Start_Index(flag + 1, bonds) );
+        exit( INSUFFICIENT_MEMORY );
+    }
 
-    if( End_Index(i, bonds) >= bonds->num_intrs-2 ) {
+    if ( End_Index(i, bonds) >= bonds->num_intrs - 2 )
+    {
         workspace->realloc.bonds = 1;
 
-        if( End_Index(i, bonds) > bonds->num_intrs ) {
+        if ( End_Index(i, bonds) > bonds->num_intrs )
+        {
             fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n",
-                    step, flag, End_Index(i,bonds), bonds->num_intrs );
-            exit(INSUFFICIENT_SPACE);
+                     step, i, End_Index(i, bonds), bonds->num_intrs ); /* report i: flag is always -1 here */
+            exit( INSUFFICIENT_MEMORY );
         }
     }
 
 
     /* hbonds list */
-    if( workspace->num_H > 0 ) {
+    if ( workspace->num_H > 0 )
+    {
         flag = -1;
         workspace->realloc.num_hbonds = num_hbonds;
-        for( i = 0; i < workspace->num_H-1; ++i )
-            if( Num_Entries(i, hbonds) >= 
-                    (Start_Index(i+1, hbonds) - Start_Index(i, hbonds)) * DANGER_ZONE ) {
+        for ( i = 0; i < workspace->num_H - 1; ++i )
+            if ( Num_Entries(i, hbonds) >=
+                    (Start_Index(i + 1, hbonds) - Start_Index(i, hbonds)) * DANGER_ZONE )
+            {
                 workspace->realloc.hbonds = 1;
-                if( End_Index(i, hbonds) > Start_Index(i+1, hbonds) )
+                if ( End_Index(i, hbonds) > Start_Index(i + 1, hbonds) )
                     flag = i;
             }
 
-        if( flag > -1 ) {
+        if ( flag > -1 )
+        {
             fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
-                    step, flag, End_Index(flag,hbonds), Start_Index(flag+1,hbonds) );
-            exit(INSUFFICIENT_SPACE);
+                     step, flag, End_Index(flag, hbonds), Start_Index(flag + 1, hbonds) );
+            exit( INSUFFICIENT_MEMORY );
         }
 
-        if( Num_Entries(i,hbonds) >= 
-                (hbonds->num_intrs - Start_Index(i,hbonds)) * DANGER_ZONE ) {
+        if ( Num_Entries(i, hbonds) >=
+                (hbonds->num_intrs - Start_Index(i, hbonds)) * DANGER_ZONE )
+        {
             workspace->realloc.hbonds = 1;
 
-            if( End_Index(i, hbonds) > hbonds->num_intrs ) {
+            if ( End_Index(i, hbonds) > hbonds->num_intrs )
+            {
                 fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n",
-                        step, flag, End_Index(i,hbonds), hbonds->num_intrs );
-                exit(INSUFFICIENT_SPACE);
+                         step, i, End_Index(i, hbonds), hbonds->num_intrs ); /* report i: flag is always -1 here */
+                exit( INSUFFICIENT_MEMORY );
             }
         }
     }
 }
 
 
-void Init_Forces( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists, output_controls *out_control ) {
+void Init_Forces( reax_system *system, control_params *control,
+                  simulation_data *data, static_storage *workspace,
+                  list **lists, output_controls *out_control )
+{
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
-    int Htop, btop_i, btop_j, num_bonds, num_hbonds;
+    int Htop, H_sp_top, btop_i, btop_j, num_bonds, num_hbonds;
     int ihb, jhb, ihb_top, jhb_top;
-    int flag;
+    int flag, flag_sp;
     real r_ij, r2, self_coef;
     real dr3gamij_1, dr3gamij_3, Tap;
     //real val, dif, base;
     real C12, C34, C56;
     real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
     real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2;   
-    sparse_matrix *H;
+    real p_boc1, p_boc2;
+    sparse_matrix *H, *H_sp;
     list *far_nbrs, *bonds, *hbonds;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
@@ -287,44 +297,67 @@ void Init_Forces( reax_system *system, control_params *control,
     bonds = *lists + BONDS;
     hbonds = *lists + HBONDS;
 
-    H = &workspace->H;
+    H = workspace->H;
+    H_sp = workspace->H_sp;
     Htop = 0;
+    H_sp_top = 0;
     num_bonds = 0;
     num_hbonds = 0;
     btop_i = btop_j = 0;
     p_boc1 = system->reaxprm.gp.l[0];
     p_boc2 = system->reaxprm.gp.l[1];
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         atom_i = &(system->atoms[i]);
         type_i  = atom_i->type;
         start_i = Start_Index(i, far_nbrs);
         end_i   = End_Index(i, far_nbrs);
         H->start[i] = Htop;
+        H_sp->start[i] = H_sp_top;
         btop_i = End_Index( i, bonds );
         sbp_i = &(system->reaxprm.sbp[type_i]);
         ihb = ihb_top = -1;
-        if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 )
+        if ( control->hb_cut > 0 && (ihb = sbp_i->p_hbond) == 1 )
             ihb_top = End_Index( workspace->hbond_index[i], hbonds );
 
-        for( pj = start_i; pj < end_i; ++pj ) {
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
             nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
             j = nbr_pj->nbr;
             atom_j = &(system->atoms[j]);
 
             flag = 0;
-            if((data->step-data->prev_steps) % control->reneighbor == 0) { 
-                if( nbr_pj->d <= control->r_cut)
+            flag_sp = 0;
+            if ((data->step - data->prev_steps) % control->reneighbor == 0)
+            {
+                if ( nbr_pj->d <= control->r_cut )
+                {
                     flag = 1;
-                else flag = 0;
+                    if ( nbr_pj->d <= control->r_sp_cut )
+                    {
+                        flag_sp = 1;
+                    }
+                }
+                else
+                {
+                    flag = 0;
+                    flag_sp = 0;
+                }
             }
-            else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box),
-                            nbr_pj->dvec))<=SQR(control->r_cut)){
-                nbr_pj->d = sqrt(nbr_pj->d);
+            else if ((nbr_pj->d = Sq_Distance_on_T3(atom_i->x, atom_j->x, &(system->box),
+                                                    nbr_pj->dvec)) <= SQR(control->r_cut))
+            {
+                if ( nbr_pj->d <= SQR(control->r_sp_cut))
+                {
+                    flag_sp = 1;
+                }
+                nbr_pj->d = SQRT( nbr_pj->d );
                 flag = 1;
             }
 
-            if( flag ){    
+            if ( flag )
+            {
                 type_j = system->atoms[j].type;
                 r_ij = nbr_pj->d;
                 sbp_j = &(system->reaxprm.sbp[type_j]);
@@ -338,63 +371,79 @@ void Init_Forces( reax_system *system, control_params *control,
                 Tap = Tap * r_ij + control->Tap3;
                 Tap = Tap * r_ij + control->Tap2;
                 Tap = Tap * r_ij + control->Tap1;
-                Tap = Tap * r_ij + control->Tap0;          
+                Tap = Tap * r_ij + control->Tap0;
 
                 dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
                 dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
 
-                H->entries[Htop].j = j;
-                H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3;
+                H->j[Htop] = j;
+                H->val[Htop] = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3;
                 ++Htop;
 
-                /* hydrogen bond lists */ 
-                if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
-                        nbr_pj->d <= control->hb_cut ) {
+                /* H_sp matrix entry: when flag_sp is set, copy the H value just stored (index Htop - 1) under column j */
+                if ( flag_sp )
+                {
+                    H_sp->j[H_sp_top] = j;
+                    H_sp->val[H_sp_top] = H->val[Htop - 1];
+                    ++H_sp_top;
+                }
+
+                /* hydrogen bond lists */
+                if ( control->hb_cut > 0 && (ihb == 1 || ihb == 2) &&
+                        nbr_pj->d <= control->hb_cut )
+                {
                     // fprintf( stderr, "%d %d\n", atom1, atom2 );
                     jhb = sbp_j->p_hbond;
-                    if( ihb == 1 && jhb == 2 ) {
+                    if ( ihb == 1 && jhb == 2 )
+                    {
                         hbonds->select.hbond_list[ihb_top].nbr = j;
                         hbonds->select.hbond_list[ihb_top].scl = 1;
                         hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
                         ++ihb_top;
                         ++num_hbonds;
                     }
-                    else if( ihb == 2 && jhb == 1 ) {
+                    else if ( ihb == 2 && jhb == 1 )
+                    {
                         jhb_top = End_Index( workspace->hbond_index[j], hbonds );
                         hbonds->select.hbond_list[jhb_top].nbr = i;
                         hbonds->select.hbond_list[jhb_top].scl = -1;
                         hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
-                        Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds );
+                        Set_End_Index( workspace->hbond_index[j], jhb_top + 1, hbonds );
                         ++num_hbonds;
                     }
                 }
 
                 /* uncorrected bond orders */
-                if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) {
+                if ( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut )
+                {
                     r2 = SQR(r_ij);
 
-                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
+                    {
                         C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
                         BO_s = (1.0 + control->bo_cut) * EXP( C12 );
                     }
                     else BO_s = C12 = 0.0;
 
-                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
+                    {
                         C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
                         BO_pi = EXP( C34 );
                     }
                     else BO_pi = C34 = 0.0;
 
-                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
-                        BO_pi2= EXP( C56 );
+                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
+                    {
+                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
+                        BO_pi2 = EXP( C56 );
                     }
                     else BO_pi2 = C56 = 0.0;
 
                     /* Initially BO values are the uncorrected ones, page 1 */
                     BO = BO_s + BO_pi + BO_pi2;
 
-                    if( BO >= control->bo_cut ) {
+                    if ( BO >= control->bo_cut )
+                    {
                         num_bonds += 2;
                         /****** bonds i-j and j-i ******/
                         ibond = &( bonds->select.bond_list[btop_i] );
@@ -414,7 +463,7 @@ void Init_Forces( reax_system *system, control_params *control,
                         ibond->sym_index = btop_j;
                         jbond->sym_index = btop_i;
                         ++btop_i;
-                        Set_End_Index( j, btop_j+1, bonds );
+                        Set_End_Index( j, btop_j + 1, bonds );
 
                         bo_ij = &( ibond->bo_data );
                         bo_ji = &( jbond->bo_data );
@@ -428,22 +477,22 @@ void Init_Forces( reax_system *system, control_params *control,
                         Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
                         Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
 
-                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that
                            dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-                        rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
-                        rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_s, -bo_ij->BO_s * Cln_BOp_s, ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi, -bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec);
                         rvec_Scale(bo_ij->dln_BOp_pi2,
-                                -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
+                                   -bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec);
                         rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
                         rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
                         rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
 
-                        /* Only dBOp wrt. dr_i is stored here, note that 
+                        /* Only dBOp wrt. dr_i is stored here, note that
                            dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-                        rvec_Scale( bo_ij->dBOp, 
-                                -(bo_ij->BO_s * Cln_BOp_s + 
-                                    bo_ij->BO_pi * Cln_BOp_pi + 
-                                    bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+                        rvec_Scale( bo_ij->dBOp,
+                                    -(bo_ij->BO_s * Cln_BOp_s +
+                                      bo_ij->BO_pi * Cln_BOp_pi +
+                                      bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
                         rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
 
                         rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
@@ -461,79 +510,91 @@ void Init_Forces( reax_system *system, control_params *control,
                         /*fprintf( stderr, "%d %d %g %g %g\n",
                           i+1, j+1, bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2 );*/
 
-                        /*fprintf( stderr, "Cln_BOp_s: %f, pbo2: %f, C12:%f\n", 
+                        /*fprintf( stderr, "Cln_BOp_s: %f, pbo2: %f, C12:%f\n",
                           Cln_BOp_s, twbp->p_bo2, C12 );
-                          fprintf( stderr, "Cln_BOp_pi: %f, pbo4: %f, C34:%f\n", 
+                          fprintf( stderr, "Cln_BOp_pi: %f, pbo4: %f, C34:%f\n",
                           Cln_BOp_pi, twbp->p_bo4, C34 );
                           fprintf( stderr, "Cln_BOp_pi2: %f, pbo6: %f, C56:%f\n",
                           Cln_BOp_pi2, twbp->p_bo6, C56 );*/
                         /*fprintf(stderr, "pbo1: %f, pbo2:%f\n", twbp->p_bo1, twbp->p_bo2);
                           fprintf(stderr, "pbo3: %f, pbo4:%f\n", twbp->p_bo3, twbp->p_bo4);
                           fprintf(stderr, "pbo5: %f, pbo6:%f\n", twbp->p_bo5, twbp->p_bo6);
-                          fprintf( stderr, "r_s: %f, r_p: %f, r_pp: %f\n", 
+                          fprintf( stderr, "r_s: %f, r_p: %f, r_pp: %f\n",
                           twbp->r_s, twbp->r_p, twbp->r_pp );
                           fprintf( stderr, "C12: %g, C34:%g, C56:%g\n", C12, C34, C56 );*/
 
                         /*fprintf( stderr, "\tfactors: %g %g %g\n",
-                          -(bo_ij->BO_s * Cln_BOp_s + bo_ij->BO_pi * Cln_BOp_pi + 
+                          -(bo_ij->BO_s * Cln_BOp_s + bo_ij->BO_pi * Cln_BOp_pi +
                           bo_ij->BO_pi2 * Cln_BOp_pp),
                           -bo_ij->BO_pi * Cln_BOp_pi, -bo_ij->BO_pi2 * Cln_BOp_pi2 );*/
-                        /*fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", 
+                        /*fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n",
                           bo_ij->dBOp[0], bo_ij->dBOp[1], bo_ij->dBOp[2] );
-                          fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", 
-                          bo_ij->dln_BOp_pi[0], bo_ij->dln_BOp_pi[1], 
+                          fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n",
+                          bo_ij->dln_BOp_pi[0], bo_ij->dln_BOp_pi[1],
                           bo_ij->dln_BOp_pi[2] );
                           fprintf( stderr, "dBOpi2:\t[%g, %g, %g]\n\n",
-                          bo_ij->dln_BOp_pi2[0], bo_ij->dln_BOp_pi2[1], 
+                          bo_ij->dln_BOp_pi2[0], bo_ij->dln_BOp_pi2[1],
                           bo_ij->dln_BOp_pi2[2] );*/
 
-                        Set_End_Index( j, btop_j+1, bonds );
+                        Set_End_Index( j, btop_j + 1, bonds );
                     }
                 }
             }
         }
 
-        H->entries[Htop].j = i;
-        H->entries[Htop].val = system->reaxprm.sbp[type_i].eta;
+        /* diagonal entry */
+        H->j[Htop] = i;
+        H->val[Htop] = system->reaxprm.sbp[type_i].eta;
         ++Htop;
 
+        /* diagonal entry of H_sp (same value as the H diagonal) */
+        H_sp->j[H_sp_top] = i;
+        H_sp->val[H_sp_top] = H->val[Htop - 1];
+        ++H_sp_top;
+
         Set_End_Index( i, btop_i, bonds );
-        if( ihb == 1 )
+        if ( ihb == 1 )
             Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
-        //fprintf( stderr, "%d bonds start: %d, end: %d\n", 
+        //fprintf( stderr, "%d bonds start: %d, end: %d\n",
         //     i, Start_Index( i, bonds ), End_Index( i, bonds ) );
     }
 
+//    printf("Htop = %d\n", Htop);
+//    printf("H_sp_top = %d\n", H_sp_top);
+
     // mark the end of j list
-    H->start[i] = Htop; 
+    H->start[i] = Htop;
+    H_sp->start[i] = H_sp_top;
     /* validate lists - decide if reallocation is required! */
-    Validate_Lists( workspace, lists, 
-            data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
+    Validate_Lists( workspace, lists,
+                    data->step, system->N, H->m, Htop, num_bonds, num_hbonds );
 
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", 
-            data->step, Htop, num_bonds, num_hbonds );
+    fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n",
+             data->step, Htop, num_bonds, num_hbonds );
+
 #endif
 }
 
 
-void Init_Forces_Tab( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists, output_controls *out_control ) {
+void Init_Forces_Tab( reax_system *system, control_params *control,
+                      simulation_data *data, static_storage *workspace,
+                      list **lists, output_controls *out_control )
+{
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
-    int Htop, btop_i, btop_j, num_bonds, num_hbonds;
+    int Htop, H_sp_top, btop_i, btop_j, num_bonds, num_hbonds;
     int tmin, tmax, r;
     int ihb, jhb, ihb_top, jhb_top;
-    int flag;
+    int flag, flag_sp;
     real r_ij, r2, self_coef;
     real val, dif, base;
     real C12, C34, C56;
     real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
     real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2;   
-    sparse_matrix *H;
+    real p_boc1, p_boc2;
+    sparse_matrix *H, *H_sp;
     list *far_nbrs, *bonds, *hbonds;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
@@ -547,44 +608,67 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
     bonds = *lists + BONDS;
     hbonds = *lists + HBONDS;
 
-    H = &workspace->H;
+    H = workspace->H;
+    H_sp = workspace->H_sp;
     Htop = 0;
+    H_sp_top = 0;
     num_bonds = 0;
     num_hbonds = 0;
     btop_i = btop_j = 0;
     p_boc1 = system->reaxprm.gp.l[0];
     p_boc2 = system->reaxprm.gp.l[1];
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         atom_i = &(system->atoms[i]);
         type_i  = atom_i->type;
         start_i = Start_Index(i, far_nbrs);
         end_i   = End_Index(i, far_nbrs);
         H->start[i] = Htop;
+        H_sp->start[i] = H_sp_top;
         btop_i = End_Index( i, bonds );
         sbp_i = &(system->reaxprm.sbp[type_i]);
         ihb = ihb_top = -1;
-        if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 )
+        if ( control->hb_cut > 0 && (ihb = sbp_i->p_hbond) == 1 )
             ihb_top = End_Index( workspace->hbond_index[i], hbonds );
 
-        for( pj = start_i; pj < end_i; ++pj ) {
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
             nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
             j = nbr_pj->nbr;
             atom_j = &(system->atoms[j]);
 
             flag = 0;
-            if((data->step-data->prev_steps) % control->reneighbor == 0) { 
-                if(nbr_pj->d <= control->r_cut)
+            flag_sp = 0;
+            if ((data->step - data->prev_steps) % control->reneighbor == 0)
+            {
+                if (nbr_pj->d <= control->r_cut)
+                {
                     flag = 1;
-                else flag = 0;
+                    if ( nbr_pj->d <= control->r_sp_cut )
+                    {
+                        flag_sp = 1;
+                    }
+                }
+                else
+                {
+                    flag = 0;
+                    flag_sp = 0;
+                }
             }
-            else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box),
-                            nbr_pj->dvec))<=SQR(control->r_cut)){
+            else if ((nbr_pj->d = Sq_Distance_on_T3(atom_i->x, atom_j->x, &(system->box),
+                                                    nbr_pj->dvec)) <= SQR(control->r_cut))
+            {
+                if ( nbr_pj->d <= SQR(control->r_sp_cut))
+                {
+                    flag_sp = 1;
+                }
                 nbr_pj->d = sqrt(nbr_pj->d);
                 flag = 1;
             }
 
-            if( flag ){    
+            if ( flag )
+            {
                 type_j = system->atoms[j].type;
                 r_ij = nbr_pj->d;
                 sbp_j = &(system->reaxprm.sbp[type_j]);
@@ -596,65 +680,81 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
 
                 /* cubic spline interpolation */
                 r = (int)(r_ij * t->inv_dx);
-                if( r == 0 )  ++r;
-                base = (real)(r+1) * t->dx;
+                if ( r == 0 )  ++r;
+                base = (real)(r + 1) * t->dx;
                 dif = r_ij - base;
-                val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-                    t->ele[r].a;
+                val = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) * dif +
+                      t->ele[r].a;
                 val *= EV_to_KCALpMOL / C_ele;
 
-                H->entries[Htop].j = j;
-                H->entries[Htop].val = self_coef * val;
+                H->j[Htop] = j;
+                H->val[Htop] = self_coef * val;
                 ++Htop;
 
-                /* hydrogen bond lists */ 
-                if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
-                        nbr_pj->d <= control->hb_cut ) {
+                /* H_sp matrix entry */
+                if ( flag_sp )
+                {
+                    H_sp->j[H_sp_top] = j;
+                    H_sp->val[H_sp_top] = H->val[Htop - 1];
+                    ++H_sp_top;
+                }
+
+                /* hydrogen bond lists */
+                if ( control->hb_cut > 0 && (ihb == 1 || ihb == 2) &&
+                        nbr_pj->d <= control->hb_cut )
+                {
                     // fprintf( stderr, "%d %d\n", atom1, atom2 );
                     jhb = sbp_j->p_hbond;
-                    if( ihb == 1 && jhb == 2 ) {
+                    if ( ihb == 1 && jhb == 2 )
+                    {
                         hbonds->select.hbond_list[ihb_top].nbr = j;
                         hbonds->select.hbond_list[ihb_top].scl = 1;
                         hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
                         ++ihb_top;
                         ++num_hbonds;
                     }
-                    else if( ihb == 2 && jhb == 1 ) {
+                    else if ( ihb == 2 && jhb == 1 )
+                    {
                         jhb_top = End_Index( workspace->hbond_index[j], hbonds );
                         hbonds->select.hbond_list[jhb_top].nbr = i;
                         hbonds->select.hbond_list[jhb_top].scl = -1;
                         hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
-                        Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds );
+                        Set_End_Index( workspace->hbond_index[j], jhb_top + 1, hbonds );
                         ++num_hbonds;
                     }
                 }
 
                 /* uncorrected bond orders */
-                if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) {
+                if ( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut )
+                {
                     r2 = SQR(r_ij);
 
-                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
+                    {
                         C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
                         BO_s = (1.0 + control->bo_cut) * EXP( C12 );
                     }
                     else BO_s = C12 = 0.0;
 
-                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
+                    {
                         C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
                         BO_pi = EXP( C34 );
                     }
                     else BO_pi = C34 = 0.0;
 
-                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
-                        BO_pi2= EXP( C56 );
+                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
+                    {
+                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
+                        BO_pi2 = EXP( C56 );
                     }
                     else BO_pi2 = C56 = 0.0;
 
                     /* Initially BO values are the uncorrected ones, page 1 */
                     BO = BO_s + BO_pi + BO_pi2;
 
-                    if( BO >= control->bo_cut ) {
+                    if ( BO >= control->bo_cut )
+                    {
                         num_bonds += 2;
                         /****** bonds i-j and j-i ******/
                         ibond = &( bonds->select.bond_list[btop_i] );
@@ -666,6 +766,7 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                         ibond->d = r_ij;
                         jbond->d = r_ij;
                         rvec_Copy( ibond->dvec, nbr_pj->dvec );
+                        //fprintf (stderr, " %f - %f - %f \n", nbr_pj->dvec[0], nbr_pj->dvec[1], nbr_pj->dvec[2]);
                         rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
                         ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
                         ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
@@ -674,7 +775,7 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                         ibond->sym_index = btop_j;
                         jbond->sym_index = btop_i;
                         ++btop_i;
-                        Set_End_Index( j, btop_j+1, bonds );
+                        Set_End_Index( j, btop_j + 1, bonds );
 
                         bo_ij = &( ibond->bo_data );
                         bo_ji = &( jbond->bo_data );
@@ -688,22 +789,22 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                         Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
                         Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
 
-                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that
                            dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-                        rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
-                        rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_s, -bo_ij->BO_s * Cln_BOp_s, ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi, -bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec);
                         rvec_Scale(bo_ij->dln_BOp_pi2,
-                                -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
+                                   -bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec);
                         rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
                         rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
                         rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
 
-                        /* Only dBOp wrt. dr_i is stored here, note that 
+                        /* Only dBOp wrt. dr_i is stored here, note that
                            dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-                        rvec_Scale( bo_ij->dBOp, 
-                                -(bo_ij->BO_s * Cln_BOp_s + 
-                                    bo_ij->BO_pi * Cln_BOp_pi + 
-                                    bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+                        rvec_Scale( bo_ij->dBOp,
+                                    -(bo_ij->BO_s * Cln_BOp_s +
+                                      bo_ij->BO_pi * Cln_BOp_pi +
+                                      bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
                         rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
 
                         rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
@@ -718,30 +819,37 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                         bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
                         bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
 
-                        Set_End_Index( j, btop_j+1, bonds );
+                        Set_End_Index( j, btop_j + 1, bonds );
                     }
                 }
             }
         }
 
-        H->entries[Htop].j = i;
-        H->entries[Htop].val = system->reaxprm.sbp[type_i].eta;
+        /* diagonal entry */
+        H->j[Htop] = i;
+        H->val[Htop] = system->reaxprm.sbp[type_i].eta;
         ++Htop;
 
+        /* diagonal entry of H_sp (same value as the H diagonal) */
+        H_sp->j[H_sp_top] = i;
+        H_sp->val[H_sp_top] = H->val[Htop - 1];
+        ++H_sp_top;
+
         Set_End_Index( i, btop_i, bonds );
-        if( ihb == 1 )
+        if ( ihb == 1 )
             Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
     }
 
     // mark the end of j list
-    H->start[i] = Htop; 
+    H->start[i] = Htop;
+    H_sp->start[i] = H_sp_top;
     /* validate lists - decide if reallocation is required! */
-    Validate_Lists( workspace, lists, 
-            data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
+    Validate_Lists( workspace, lists,
+                    data->step, system->N, H->m, Htop, num_bonds, num_hbonds );
 
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", 
-            data->step, Htop, num_bonds, num_hbonds );
+    fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n",
+             data->step, Htop, num_bonds, num_hbonds );
     //Print_Bonds( system, bonds, "sbonds.out" );
     //Print_Bond_List2( system, bonds, "sbonds.out" );
     //Print_Sparse_Matrix2( H, "H.out" );
@@ -749,9 +857,10 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
 }
 
 
-void Estimate_Storage_Sizes( reax_system *system, control_params *control, 
-        list **lists, int *Htop, int *hb_top, 
-        int *bond_top, int *num_3body ) {
+void Estimate_Storage_Sizes( reax_system *system, control_params *control,
+                             list **lists, int *Htop, int *hb_top,
+                             int *bond_top, int *num_3body )
+{
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
@@ -759,7 +868,7 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
     real r_ij, r2;
     real C12, C34, C56;
     real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2; 
+    real p_boc1, p_boc2;
     list *far_nbrs;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
@@ -770,7 +879,8 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
     p_boc1 = system->reaxprm.gp.l[0];
     p_boc2 = system->reaxprm.gp.l[1];
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         atom_i = &(system->atoms[i]);
         type_i  = atom_i->type;
         start_i = Start_Index(i, far_nbrs);
@@ -778,7 +888,8 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
         sbp_i = &(system->reaxprm.sbp[type_i]);
         ihb = sbp_i->p_hbond;
 
-        for( pj = start_i; pj < end_i; ++pj ) {
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
             nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
             j = nbr_pj->nbr;
             atom_j = &(system->atoms[j]);
@@ -786,46 +897,53 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
             sbp_j = &(system->reaxprm.sbp[type_j]);
             twbp = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]);
 
-            if( nbr_pj->d <= control->r_cut ) {
+            if ( nbr_pj->d <= control->r_cut )
+            {
                 ++(*Htop);
 
-                /* hydrogen bond lists */ 
-                if( control->hb_cut > 0.1 && (ihb==1 || ihb==2) && 
-                        nbr_pj->d <= control->hb_cut ) {
+                /* hydrogen bond lists */
+                if ( control->hb_cut > 0.1 && (ihb == 1 || ihb == 2) &&
+                        nbr_pj->d <= control->hb_cut )
+                {
                     jhb = sbp_j->p_hbond;
-                    if( ihb == 1 && jhb == 2 )
+                    if ( ihb == 1 && jhb == 2 )
                         ++hb_top[i];
-                    else if( ihb == 2 && jhb == 1 )
+                    else if ( ihb == 2 && jhb == 1 )
                         ++hb_top[j];
                 }
 
                 /* uncorrected bond orders */
-                if( nbr_pj->d <= control->nbr_cut ) {
+                if ( nbr_pj->d <= control->nbr_cut )
+                {
                     r_ij = nbr_pj->d;
                     r2 = SQR(r_ij);
 
-                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
+                    {
                         C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
                         BO_s = (1.0 + control->bo_cut) * EXP( C12 );
                     }
                     else BO_s = C12 = 0.0;
 
-                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
+                    {
                         C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
                         BO_pi = EXP( C34 );
                     }
                     else BO_pi = C34 = 0.0;
 
-                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
-                        BO_pi2= EXP( C56 );
+                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
+                    {
+                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
+                        BO_pi2 = EXP( C56 );
                     }
                     else BO_pi2 = C56 = 0.0;
 
                     /* Initially BO values are the uncorrected ones, page 1 */
                     BO = BO_s + BO_pi + BO_pi2;
 
-                    if( BO >= control->bo_cut ) {
+                    if ( BO >= control->bo_cut )
+                    {
                         ++bond_top[i];
                         ++bond_top[j];
                     }
@@ -836,8 +954,8 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
 
     *Htop += system->N;
     *Htop *= SAFE_ZONE;
-
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS );
         *num_3body += SQR(bond_top[i]);
         bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS );
@@ -846,49 +964,40 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
 }
 
 
-void Compute_Forces( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list** lists, output_controls *out_control )
+void Compute_Forces( reax_system *system, control_params *control,
+                     simulation_data *data, static_storage *workspace,
+                     list** lists, output_controls *out_control )
 {
     real t_start, t_elapsed;
 
     t_start = Get_Time( );
-    if( !control->tabulate )
+    if ( !control->tabulate )
+    {
         Init_Forces( system, control, data, workspace, lists, out_control );
-    else Init_Forces_Tab( system, control, data, workspace, lists, out_control );
+    }
+    else
+    {
+        Init_Forces_Tab( system, control, data, workspace, lists, out_control );
+    }
     t_elapsed = Get_Timing_Info( t_start );
     data->timing.init_forces += t_elapsed;
-
 #if defined(DEBUG_FOCUS)
-    print_sparse_matrix (system, workspace);
     fprintf( stderr, "init_forces - ");
 #endif
 
-
-    //analyze_hbonds (system, workspace, lists);
-
     t_start = Get_Time( );
     Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
     t_elapsed = Get_Timing_Info( t_start );
     data->timing.bonded += t_elapsed;
-
-    //print_bond_list (system, workspace, lists);
-    //exit (0);
-
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "bonded_forces - ");
 #endif
 
     t_start = Get_Time( );
-    Compute_NonBonded_Forces( system, control, data, workspace, 
-            lists, out_control );
+    Compute_NonBonded_Forces( system, control, data, workspace,
+                              lists, out_control );
     t_elapsed = Get_Timing_Info( t_start );
     data->timing.nonb += t_elapsed;
-
-#ifdef __DEBUG_CUDA__
-    fprintf( stderr, "non_bonded_forces - %lf \n", t_elapsed);
-#endif
-
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "nonbondeds - ");
 #endif
@@ -904,7 +1013,7 @@ void Compute_Forces( reax_system *system, control_params *control,
     Print_Total_Force( system, control, data, workspace, lists, out_control );
     Compare_Total_Forces( system, control, data, workspace, lists, out_control );
 #endif
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "forces - ");
 #endif
 }
diff --git a/PuReMD-GPU/src/forces.h b/PuReMD-GPU/src/forces.h
index 73323f0419baf383d6bf671158ef85584a710728..0ef8b117c78de5e7b3cdc2311b4a492b4615a859 100644
--- a/PuReMD-GPU/src/forces.h
+++ b/PuReMD-GPU/src/forces.h
@@ -23,12 +23,14 @@
 
 #include "mytypes.h"
 
+
 void Init_Bonded_Force_Functions( control_params* );
 
 void Compute_Forces( reax_system*, control_params*, simulation_data*,
-                     static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 
 void Estimate_Storage_Sizes( reax_system*, control_params*, list**,
-                             int*, int*, int*, int* );
+        int*, int*, int*, int* );
+
 
 #endif
diff --git a/PuReMD-GPU/src/four_body_interactions.c b/PuReMD-GPU/src/four_body_interactions.c
index c51601fa991203a77ec4840c10e74e15cfa42c87..25642871d23e389a96db943607f432aa68252a02 100644
--- a/PuReMD-GPU/src/four_body_interactions.c
+++ b/PuReMD-GPU/src/four_body_interactions.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -22,20 +23,21 @@
 
 #include "bond_orders.h"
 #include "box.h"
+#include "index_utils.h"
 #include "list.h"
 #include "lookup.h"
 #include "vector.h"
 #include "math.h"
-#include "index_utils.h"
 
+#define MIN_SINE 1e-10
 
 real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
-        rvec dvec_kl, real r_kl, rvec dvec_li, real r_li,
-        three_body_interaction_data *p_ijk, 
-        three_body_interaction_data *p_jkl, 
-        rvec dcos_omega_di, rvec dcos_omega_dj, 
-        rvec dcos_omega_dk, rvec dcos_omega_dl, 
-        output_controls *out_control )
+                      rvec dvec_kl, real r_kl, rvec dvec_li, real r_li,
+                      three_body_interaction_data *p_ijk,
+                      three_body_interaction_data *p_jkl,
+                      rvec dcos_omega_di, rvec dcos_omega_dj,
+                      rvec dcos_omega_dk, rvec dcos_omega_dl,
+                      output_controls *out_control )
 {
     real unnorm_cos_omega, unnorm_sin_omega, omega;
     real sin_ijk, cos_ijk, sin_jkl, cos_jkl;
@@ -49,11 +51,11 @@ real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
     cos_jkl = COS( p_jkl->theta );
 
     /* omega */
-    unnorm_cos_omega = -rvec_Dot( dvec_ij,dvec_jk )*rvec_Dot( dvec_jk,dvec_kl ) +
-        SQR( r_jk ) *  rvec_Dot( dvec_ij,dvec_kl );
+    unnorm_cos_omega = -rvec_Dot( dvec_ij, dvec_jk ) * rvec_Dot( dvec_jk, dvec_kl ) +
+                       SQR( r_jk ) *  rvec_Dot( dvec_ij, dvec_kl );
     rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl );
     unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl );
-    omega = atan2( unnorm_sin_omega, unnorm_cos_omega ); 
+    omega = atan2( unnorm_sin_omega, unnorm_cos_omega );
 
     /* derivatives */
     /* coef for adjusments to cos_theta's */
@@ -70,24 +72,25 @@ real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
     hnhd = r_ij * r_kl * cos_ijk * sin_jkl;
     hnhe = r_ij * r_kl * sin_ijk * cos_jkl;
 
+
     poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl;
-    if( poem < 1e-20 ) poem = 1e-20;
+    if ( poem < 1e-20 ) poem = 1e-20;
 
-    tel  = (SQR(r_ij) + SQR(r_jk) + SQR(r_kl) - SQR(r_li)) - 
-        2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl + 
-                r_jk * r_kl * cos_jkl );
+    tel  = (SQR(r_ij) + SQR(r_jk) + SQR(r_kl) - SQR(r_li)) -
+           2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl +
+                   r_jk * r_kl * cos_jkl );
 
     arg  = tel / poem;
-    if( arg >  1.0 )
+    if ( arg >  1.0 )
     {
         arg =  1.0;
     }
-    if( arg < -1.0 )
+    if ( arg < -1.0 )
     {
         arg = -1.0;
     }
 
-    /*fprintf( out_control->etor, 
+    /*fprintf( out_control->etor,
       "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
       htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe );
       fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
@@ -99,69 +102,72 @@ real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
       fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n",
       r_li, dvec_li[0], dvec_li[1], dvec_li[2] );
       fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n",
-      r_ij, r_jk, r_kl, r_li ); 
-      fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", 
-      cos_ijk, cos_jkl, sin_ijk, sin_jkl ); 
+      r_ij, r_jk, r_kl, r_li );
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n",
+      cos_ijk, cos_jkl, sin_ijk, sin_jkl );
       fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
       poem, tel, arg );*/
     /* fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
-       -p_ijk->dcos_dk[0]/sin_ijk, 
-       -p_ijk->dcos_dk[1]/sin_ijk, 
+       -p_ijk->dcos_dk[0]/sin_ijk,
+       -p_ijk->dcos_dk[1]/sin_ijk,
        -p_ijk->dcos_dk[2]/sin_ijk );
        fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
-       -p_jkl->dcos_dk[0]/sin_jkl, 
-       -p_jkl->dcos_dk[1]/sin_jkl, 
+       -p_jkl->dcos_dk[0]/sin_jkl,
+       -p_jkl->dcos_dk[1]/sin_jkl,
        -p_jkl->dcos_dk[2]/sin_jkl );*/
 
-    if( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
+    if ( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
     {
         sin_ijk = MIN_SINE;
     }
-    else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
+    else if ( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
     {
         sin_ijk = -MIN_SINE;
     }
-    if( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
+    if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
     {
         sin_jkl = MIN_SINE;
     }
-    else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
+    else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
     {
         sin_jkl = -MIN_SINE;
     }
 
     // dcos_omega_di
-    rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li );
-    rvec_ScaledAdd( dcos_omega_di,-(hthd - arg*hnhd)/sin_ijk, p_ijk->dcos_dk );
+    rvec_ScaledSum( dcos_omega_di, (htra - arg * hnra) / r_ij, dvec_ij, -1., dvec_li );
+    rvec_ScaledAdd( dcos_omega_di, -(hthd - arg * hnhd) / sin_ijk, p_ijk->dcos_dk );
     rvec_Scale( dcos_omega_di, 2.0 / poem, dcos_omega_di );
 
     // dcos_omega_dj
-    rvec_ScaledSum( dcos_omega_dj,-(htra-arg*hnra)/r_ij, dvec_ij, 
-            -htrb / r_jk, dvec_jk );
-    rvec_ScaledAdd( dcos_omega_dj,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_dj );
-    rvec_ScaledAdd( dcos_omega_dj,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_di );
+    rvec_ScaledSum( dcos_omega_dj, -(htra - arg * hnra) / r_ij, dvec_ij,
+                    -htrb / r_jk, dvec_jk );
+    rvec_ScaledAdd( dcos_omega_dj, -(hthd - arg * hnhd) / sin_ijk, p_ijk->dcos_dj );
+    rvec_ScaledAdd( dcos_omega_dj, -(hthe - arg * hnhe) / sin_jkl, p_jkl->dcos_di );
     rvec_Scale( dcos_omega_dj, 2.0 / poem, dcos_omega_dj );
 
     // dcos_omega_dk
-    rvec_ScaledSum( dcos_omega_dk,-(htrc-arg*hnrc) / r_kl, dvec_kl,  
-            htrb / r_jk, dvec_jk );
-    rvec_ScaledAdd( dcos_omega_dk,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_di );
-    rvec_ScaledAdd( dcos_omega_dk,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dj );
+    rvec_ScaledSum( dcos_omega_dk, -(htrc - arg * hnrc) / r_kl, dvec_kl,
+                    htrb / r_jk, dvec_jk );
+    rvec_ScaledAdd( dcos_omega_dk, -(hthd - arg * hnhd) / sin_ijk, p_ijk->dcos_di );
+    rvec_ScaledAdd( dcos_omega_dk, -(hthe - arg * hnhe) / sin_jkl, p_jkl->dcos_dj );
     rvec_Scale( dcos_omega_dk, 2.0 / poem, dcos_omega_dk );
 
     // dcos_omega_dl
-    rvec_ScaledSum( dcos_omega_dl, (htrc-arg*hnrc) / r_kl, dvec_kl, 1., dvec_li );
-    rvec_ScaledAdd( dcos_omega_dl,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dk );
+    rvec_ScaledSum( dcos_omega_dl, (htrc - arg * hnrc) / r_kl, dvec_kl, 1., dvec_li );
+    rvec_ScaledAdd( dcos_omega_dl, -(hthe - arg * hnhe) / sin_jkl, p_jkl->dcos_dk );
     rvec_Scale( dcos_omega_dl, 2.0 / poem, dcos_omega_dl );
 
-    return omega;  
+    return omega;
     //return arg;
 }
 
 
-void Four_Body_Interactions( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+
+
+
+void Four_Body_Interactions( reax_system *system, control_params *control,
+                             simulation_data *data, static_storage *workspace,
+                             list **lists, output_controls *out_control )
 {
     int i, j, k, l, pi, pj, pk, pl, pij, plk;
     int type_i, type_j, type_k, type_l;
@@ -212,31 +218,35 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
     list *thb_intrs = (*lists) + THREE_BODIES;
 
 
-    for( j = 0; j < system->N; ++j ) {
+    for ( j = 0; j < system->N; ++j )
+    {
         type_j = system->atoms[j].type;
         Delta_j = workspace->Delta_boc[j];
         start_j = Start_Index(j, bonds);
         end_j = End_Index(j, bonds);
 
 
-        for( pk = start_j; pk < end_j; ++pk ) {
+        for ( pk = start_j; pk < end_j; ++pk )
+        {
             pbond_jk = &( bonds->select.bond_list[pk] );
             k = pbond_jk->nbr;
             bo_jk = &( pbond_jk->bo_data );
             BOA_jk = bo_jk->BO - control->thb_cut;
 
             /* see if there are any 3-body interactions involving j&k
-               where j is the central atom. Otherwise there is no point in
-               trying to form a 4-body interaction out of this neighborhood */    
-            if( j < k && bo_jk->BO > control->thb_cut/*0*/ && 
-                    Num_Entries(pk, thb_intrs) ) {
+            where j is the central atom. Otherwise there is no point in
+             trying to form a 4-body interaction out of this neighborhood */
+            if ( j < k && bo_jk->BO > control->thb_cut/*0*/ &&
+                    Num_Entries(pk, thb_intrs) )
+            {
                 start_k = Start_Index(k, bonds);
-                end_k = End_Index(k, bonds);                   
+                end_k = End_Index(k, bonds);
                 pj = pbond_jk->sym_index; // pj points to j on k's list
 
-                /* do the same check as above: are there any 3-body interactions 
+                /* do the same check as above: are there any 3-body interactions
                    involving k&j where k is the central atom */
-                if( Num_Entries(pj, thb_intrs) ) {
+                if ( Num_Entries(pj, thb_intrs) )
+                {
                     type_k = system->atoms[k].type;
                     Delta_k = workspace->Delta_boc[k];
                     r_jk = pbond_jk->d;
@@ -244,7 +254,7 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                     start_pk = Start_Index(pk, thb_intrs );
                     end_pk = End_Index(pk, thb_intrs );
                     start_pj = Start_Index(pj, thb_intrs );
-                    end_pj = End_Index(pj, thb_intrs );        
+                    end_pj = End_Index(pj, thb_intrs );
 
                     exp_tor2_jk = EXP( -p_tor2 * BOA_jk );
                     exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) );
@@ -255,14 +265,16 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
 
 
                     /* pick i up from j-k interaction where j is the centre atom */
-                    for( pi = start_pk; pi < end_pk; ++pi ) {
+                    for ( pi = start_pk; pi < end_pk; ++pi )
+                    {
                         p_ijk = &( thb_intrs->select.three_body_list[pi] );
                         pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
                         pbond_ij = &( bonds->select.bond_list[pij] );
                         bo_ij = &( pbond_ij->bo_data );
 
 
-                        if( bo_ij->BO > control->thb_cut/*0*/ ) {
+                        if ( bo_ij->BO > control->thb_cut/*0*/ )
+                        {
                             i = p_ijk->thb;
                             type_i = system->atoms[i].type;
                             r_ij = pbond_ij->d;
@@ -272,17 +284,18 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                             sin_ijk = SIN( theta_ijk );
                             cos_ijk = COS( theta_ijk );
                             //tan_ijk_i = 1. / TAN( theta_ijk );
-                            if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) 
+                            if ( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
                                 tan_ijk_i = cos_ijk / MIN_SINE;
-                            else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) 
+                            else if ( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
                                 tan_ijk_i = cos_ijk / -MIN_SINE;
                             else tan_ijk_i = cos_ijk / sin_ijk;
 
                             exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
-                            exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) );
+                            exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij - 1.5) );
 
                             /* pick l up from j-k intr. where k is the centre */
-                            for( pl = start_pj; pl < end_pj; ++pl ) {
+                            for ( pl = start_pj; pl < end_pj; ++pl )
+                            {
                                 p_jkl = &( thb_intrs->select.three_body_list[pl] );
                                 l = p_jkl->thb;
                                 plk = p_jkl->pthb; //pointer to l on k's bond_list!
@@ -292,8 +305,9 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                 fbh = &(system->reaxprm.fbp[ index_fbp(type_i,type_j,type_k,type_l,system->reaxprm.num_atom_types ) ]);
                                 fbp = &(system->reaxprm.fbp[ index_fbp(type_i,type_j,type_k,type_l,system->reaxprm.num_atom_types )].prm[0]);
 
-                                if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ &&
-                                        bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){
+                                if ( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ &&
+                                        bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ )
+                                {
                                     ++num_frb_intrs;
                                     r_kl = pbond_kl->d;
                                     BOA_kl = bo_kl->BO - control->thb_cut;
@@ -302,77 +316,77 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                     sin_jkl = SIN( theta_jkl );
                                     cos_jkl = COS( theta_jkl );
                                     //tan_jkl_i = 1. / TAN( theta_jkl );
-                                    if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) 
+                                    if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
                                         tan_jkl_i = cos_jkl / MIN_SINE;
-                                    else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) 
+                                    else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
                                         tan_jkl_i = cos_jkl / -MIN_SINE;
-                                    else tan_jkl_i = cos_jkl /sin_jkl;
+                                    else tan_jkl_i = cos_jkl / sin_jkl;
 
-                                    Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x, 
-                                            &(system->box), dvec_li );
+                                    Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x,
+                                                       &(system->box), dvec_li );
                                     r_li = rvec_Norm( dvec_li );
 
 
                                     /* omega and its derivative */
-                                    //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, 
-                                    omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, 
-                                            r_jk, pbond_kl->dvec, r_kl,
-                                            dvec_li, r_li, p_ijk, p_jkl,
-                                            dcos_omega_di, dcos_omega_dj,
-                                            dcos_omega_dk, dcos_omega_dl,
-                                            out_control);
+                                    //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec,
+                                    omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec,
+                                                            r_jk, pbond_kl->dvec, r_kl,
+                                                            dvec_li, r_li, p_ijk, p_jkl,
+                                                            dcos_omega_di, dcos_omega_dj,
+                                                            dcos_omega_dk, dcos_omega_dl,
+                                                            out_control);
                                     cos_omega = COS( omega );
                                     cos2omega = COS( 2. * omega );
                                     cos3omega = COS( 3. * omega );
                                     /* end omega calculations */
 
                                     /* torsion energy */
-                                    exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk));
+                                    exp_tor1 = EXP(fbp->p_tor1 * SQR(2. - bo_jk->BO_pi - f11_DjDk));
                                     exp_tor2_kl = EXP( -p_tor2 * BOA_kl );
-                                    exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) );
-                                    fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * 
-                                        (1.0 - exp_tor2_kl);
-
-                                    CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + 
-                                            fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
-                                            fbp->V3 * (1.0 + cos3omega) );
-                                    //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + 
+                                    exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl - 1.5) );
+                                    fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) *
+                                           (1.0 - exp_tor2_kl);
+
+                                    CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) +
+                                                 fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
+                                                 fbp->V3 * (1.0 + cos3omega) );
+                                    //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) +
                                     //  fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) +
                                     //  fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega);
 
                                     data->E_Tor += e_tor = fn10 * sin_ijk * sin_jkl * CV;
 
                                     dfn11 = (-p_tor3 * exp_tor3_DjDk +
-                                            (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
-                                            (2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv;
+                                             (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
+                                             (2. + exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv;
 
                                     CEtors1 = sin_ijk * sin_jkl * CV;
 
-                                    CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * 
-                                        (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * 
-                                        sin_ijk * sin_jkl; 
+                                    CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 *
+                                              (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) *
+                                              sin_ijk * sin_jkl;
 
                                     CEtors3 = CEtors2 * dfn11;
 
-                                    CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * 
-                                        (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
+                                    CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij *
+                                              (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
 
-                                    CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * 
-                                        (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl);
+                                    CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk *
+                                              (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl);
 
                                     CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl *
-                                        (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk);
+                                              (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk);
 
                                     cmn = -fn10 * CV;
                                     CEtors7 = cmn * sin_jkl * tan_ijk_i;
                                     CEtors8 = cmn * sin_ijk * tan_jkl_i;
-                                    CEtors9 = fn10 * sin_ijk * sin_jkl * 
-                                        (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
-                                         1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega)));
+                                    CEtors9 = fn10 * sin_ijk * sin_jkl *
+                                              (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
+                                               1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega)));
                                     //cmn = -fn10 * CV;
                                     //CEtors7 = cmn * sin_jkl * cos_ijk;
                                     //CEtors8 = cmn * sin_ijk * cos_jkl;
-                                    //CEtors9 = fn10 * sin_ijk * sin_jkl * 
+                                    //CEtors9 = fn10 * sin_ijk * sin_jkl *
                                     //  (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
                                     //   fbp->V3 * (6*SQR(cos_omega) - 1.50));
                                     /* end  of torsion energy */
@@ -380,38 +394,38 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
 
                                     /* 4-body conjugation energy */
                                     fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
-                                    data->E_Con += e_con = fbp->p_cot1 * fn12 * 
-                                        (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
+                                    data->E_Con += e_con = fbp->p_cot1 * fn12 *
+                                                           (1. + (SQR(cos_omega) - 1.) * sin_ijk * sin_jkl);
 
-                                    Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * 
-                                        (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
+                                    Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 *
+                                            (1. + (SQR(cos_omega) - 1.) * sin_ijk * sin_jkl);
 
                                     CEconj1 = Cconj * (BOA_ij - 1.5e0);
                                     CEconj2 = Cconj * (BOA_jk - 1.5e0);
                                     CEconj3 = Cconj * (BOA_kl - 1.5e0);
 
-                                    CEconj4 = -fbp->p_cot1 * fn12 * 
-                                        (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
-                                    CEconj5 = -fbp->p_cot1 * fn12 * 
-                                        (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
-                                    //CEconj4 = -fbp->p_cot1 * fn12 * 
+                                    CEconj4 = -fbp->p_cot1 * fn12 *
+                                              (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
+                                    CEconj5 = -fbp->p_cot1 * fn12 *
+                                              (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
+                                    //CEconj4 = -fbp->p_cot1 * fn12 *
                                     //  (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk;
-                                    //CEconj5 = -fbp->p_cot1 * fn12 * 
+                                    //CEconj5 = -fbp->p_cot1 * fn12 *
                                     //  (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl;
-                                    CEconj6 = 2.0 * fbp->p_cot1 * fn12 * 
-                                        cos_omega * sin_ijk * sin_jkl;
+                                    CEconj6 = 2.0 * fbp->p_cot1 * fn12 *
+                                              cos_omega * sin_ijk * sin_jkl;
                                     /* end 4-body conjugation energy */
 
                                     //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ",
                                     //   workspace->orig_id[i], workspace->orig_id[j],
-                                    //       workspace->orig_id[k], workspace->orig_id[l], 
+                                    //       workspace->orig_id[k], workspace->orig_id[l],
                                     //    omega, cos_omega, cos2omega, cos3omega );
-                                    //fprintf(stdout, 
+                                    //fprintf(stdout,
                                     //    "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                    //    CEtors2, CEtors3, CEtors4, CEtors5, 
+                                    //    CEtors2, CEtors3, CEtors4, CEtors5,
                                     //    CEtors6, CEtors7, CEtors8, CEtors9 );
                                     //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                    //    theta_ijk, theta_jkl, sin_ijk, 
+                                    //    theta_ijk, theta_jkl, sin_ijk,
                                     //    sin_jkl, cos_jkl, tan_jkl_i );
 
                                     /* forces */
@@ -420,37 +434,38 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                     workspace->CdDelta[k] += CEtors3;
                                     bo_ij->Cdbo += (CEtors4 + CEconj1);
                                     bo_jk->Cdbo += (CEtors5 + CEconj2);
-
                                     bo_kl->Cdbo += (CEtors6 + CEconj3);
 
-                                    if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                                    if ( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT )
+                                    {
                                         /* dcos_theta_ijk */
-                                        rvec_ScaledAdd( system->atoms[i].f, 
-                                                CEtors7 + CEconj4, p_ijk->dcos_dk );
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors7 + CEconj4, p_ijk->dcos_dj );
-                                        rvec_ScaledAdd( system->atoms[k].f, 
-                                                CEtors7 + CEconj4, p_ijk->dcos_di );
+                                        rvec_ScaledAdd( system->atoms[i].f,
+                                                        CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                        rvec_ScaledAdd( system->atoms[j].f,
+                                                        CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                        rvec_ScaledAdd( system->atoms[k].f,
+                                                        CEtors7 + CEconj4, p_ijk->dcos_di );
 
                                         /* dcos_theta_jkl */
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors8 + CEconj5, p_jkl->dcos_di );
-                                        rvec_ScaledAdd( system->atoms[k].f, 
-                                                CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                        rvec_ScaledAdd( system->atoms[l].f, 
-                                                CEtors8 + CEconj5, p_jkl->dcos_dk );
+                                        rvec_ScaledAdd( system->atoms[j].f,
+                                                        CEtors8 + CEconj5, p_jkl->dcos_di );
+                                        rvec_ScaledAdd( system->atoms[k].f,
+                                                        CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                        rvec_ScaledAdd( system->atoms[l].f,
+                                                        CEtors8 + CEconj5, p_jkl->dcos_dk );
 
                                         /* dcos_omega */
-                                        rvec_ScaledAdd( system->atoms[i].f, 
-                                                CEtors9 + CEconj6, dcos_omega_di );
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors9 + CEconj6, dcos_omega_dj );
-                                        rvec_ScaledAdd( system->atoms[k].f, 
-                                                CEtors9 + CEconj6, dcos_omega_dk );
-                                        rvec_ScaledAdd( system->atoms[l].f, 
-                                                CEtors9 + CEconj6, dcos_omega_dl );
+                                        rvec_ScaledAdd( system->atoms[i].f,
+                                                        CEtors9 + CEconj6, dcos_omega_di );
+                                        rvec_ScaledAdd( system->atoms[j].f,
+                                                        CEtors9 + CEconj6, dcos_omega_dj );
+                                        rvec_ScaledAdd( system->atoms[k].f,
+                                                        CEtors9 + CEconj6, dcos_omega_dk );
+                                        rvec_ScaledAdd( system->atoms[l].f,
+                                                        CEtors9 + CEconj6, dcos_omega_dl );
                                     }
-                                    else {
+                                    else
+                                    {
                                         ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
 
                                         /* dcos_theta_ijk */
@@ -459,8 +474,8 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                         rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
                                         rvec_Add( data->ext_press, ext_press );
 
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                        rvec_ScaledAdd( system->atoms[j].f,
+                                                        CEtors7 + CEconj4, p_ijk->dcos_dj );
 
                                         rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
                                         rvec_Add( system->atoms[k].f, force );
@@ -469,8 +484,8 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
 
 
                                         /* dcos_theta_jkl */
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors8 + CEconj5, p_jkl->dcos_di );
+                                        rvec_ScaledAdd( system->atoms[j].f,
+                                                        CEtors8 + CEconj5, p_jkl->dcos_di );
 
                                         rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
                                         rvec_Add( system->atoms[k].f, force );
@@ -483,14 +498,14 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                         rvec_Add( data->ext_press, ext_press );
 
 
-                                        /* dcos_omega */                      
+                                        /* dcos_omega */
                                         rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
                                         rvec_Add( system->atoms[i].f, force );
                                         rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
                                         rvec_Add( data->ext_press, ext_press );
 
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors9 + CEconj6, dcos_omega_dj );
+                                        rvec_ScaledAdd( system->atoms[j].f,
+                                                        CEtors9 + CEconj6, dcos_omega_dj );
 
                                         rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
                                         rvec_Add( system->atoms[k].f, force );
@@ -504,39 +519,39 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
 
 
                                         /* This part is intended for a fully-flexible box */
-                                        /* rvec_ScaledSum( temp_rvec, 
-                                           CEtors7 + CEconj4, p_ijk->dcos_dk,      // i     
+                                        /* rvec_ScaledSum( temp_rvec,
+                                           CEtors7 + CEconj4, p_ijk->dcos_dk,      // i
                                            CEtors9 + CEconj6, dcos_omega_di );
-                                           rvec_OuterProduct( temp_rtensor, 
+                                           rvec_OuterProduct( temp_rtensor,
                                            temp_rvec, system->atoms[i].x );
                                            rtensor_Copy( total_rtensor, temp_rtensor );
 
-                                           rvec_ScaledSum( temp_rvec, 
+                                           rvec_ScaledSum( temp_rvec,
                                            CEtors7 + CEconj4, p_ijk->dcos_dj,      // j
                                            CEtors8 + CEconj5, p_jkl->dcos_di );
-                                           rvec_ScaledAdd( temp_rvec, 
+                                           rvec_ScaledAdd( temp_rvec,
                                            CEtors9 + CEconj6, dcos_omega_dj );
-                                           rvec_OuterProduct( temp_rtensor, 
+                                           rvec_OuterProduct( temp_rtensor,
                                            temp_rvec, system->atoms[j].x );
                                            rtensor_Add( total_rtensor, temp_rtensor );
 
-                                           rvec_ScaledSum( temp_rvec, 
+                                           rvec_ScaledSum( temp_rvec,
                                            CEtors7 + CEconj4, p_ijk->dcos_di,      // k
                                            CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                           rvec_ScaledAdd( temp_rvec, 
+                                           rvec_ScaledAdd( temp_rvec,
                                            CEtors9 + CEconj6, dcos_omega_dk );
-                                           rvec_OuterProduct( temp_rtensor, 
+                                           rvec_OuterProduct( temp_rtensor,
                                            temp_rvec, system->atoms[k].x );
                                            rtensor_Add( total_rtensor, temp_rtensor );
 
-                                           rvec_ScaledSum( temp_rvec, 
+                                           rvec_ScaledSum( temp_rvec,
                                            CEtors8 + CEconj5, p_jkl->dcos_dk,      // l
                                            CEtors9 + CEconj6, dcos_omega_dl );
-                                           rvec_OuterProduct( temp_rtensor, 
+                                           rvec_OuterProduct( temp_rtensor,
                                            temp_rvec, system->atoms[l].x );
                                            rtensor_Copy( total_rtensor, temp_rtensor );
 
-                                           if( pbond_ij->imaginary || pbond_jk->imaginary || 
+                                           if( pbond_ij->imaginary || pbond_jk->imaginary ||
                                            pbond_kl->imaginary )
                                            rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor );
                                            else
@@ -544,82 +559,82 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                     }
 
 #ifdef TEST_ENERGY
-                                    /*fprintf( out_control->etor, 
-                                    //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                    //r_ij, r_jk, r_kl, 
-                                    "%12.8f%12.8f%12.8f%12.8f\n",
-                                    cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/
+                                    /*fprintf( out_control->etor,
+                                       //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                       //r_ij, r_jk, r_kl,
+                                       "%12.8f%12.8f%12.8f%12.8f\n",
+                                       cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/
                                     // fprintf( out_control->etor, "%12.8f\n", dfn11 );
-                                    fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", 
-                                            fn10, cos_omega, CV );
+                                    fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n",
+                                             fn10, cos_omega, CV );
 
-                                    fprintf( out_control->etor, 
-                                            "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                            CEtors2, CEtors3, CEtors4, CEtors5, 
-                                            CEtors6, CEtors7, CEtors8, CEtors9 );
+                                    fprintf( out_control->etor,
+                                             "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                             CEtors2, CEtors3, CEtors4, CEtors5,
+                                             CEtors6, CEtors7, CEtors8, CEtors9 );
 
-                                    /* fprintf( out_control->etor, 
+                                    /* fprintf( out_control->etor,
                                        "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
                                        htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */
 
-                                    fprintf( out_control->etor, 
-                                            "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                            CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 );
+                                    fprintf( out_control->etor,
+                                             "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                             CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 );
                                     /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n",
                                        fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/
 
-                                    fprintf( out_control->etor, 
-                                            //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", 
-                                            "%6d%6d%6d%6d%12.8f%12.8f\n", 
-                                            workspace->orig_id[i], workspace->orig_id[j], 
-                                            workspace->orig_id[k], workspace->orig_id[l], 
-                                            e_tor, e_con );
+                                    fprintf( out_control->etor,
+                                             //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n",
+                                             "%6d%6d%6d%6d%12.8f%12.8f\n",
+                                             workspace->orig_id[i], workspace->orig_id[j],
+                                             workspace->orig_id[k], workspace->orig_id[l],
+                                             e_tor, e_con );
                                     //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor );
 
-                                    fprintf( out_control->econ, 
-                                            "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
-                                            workspace->orig_id[i], workspace->orig_id[j], 
-                                            workspace->orig_id[k], workspace->orig_id[l], 
-                                            RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, 
-                                            e_con,data->E_Con );
-
-                                    /* fprintf( out_control->etor, 
-                                       "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",       
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], 
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], 
+                                    fprintf( out_control->econ,
+                                             "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                             workspace->orig_id[i], workspace->orig_id[j],
+                                             workspace->orig_id[k], workspace->orig_id[l],
+                                             RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl,
+                                             e_con, data->E_Con );
+
+                                    /* fprintf( out_control->etor,
+                                       "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[0],
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[1],
                                        (CEtors7 + CEconj4)*p_ijk->dcos_dk[2],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], 
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[0],
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[1],
                                        (CEtors7 + CEconj4)*p_ijk->dcos_dj[2],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[0], 
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[1], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[0],
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[1],
                                        (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */
 
 
-                                    /* fprintf( out_control->etor, 
+                                    /* fprintf( out_control->etor,
                                        "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[0], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[1], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[2], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[0],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[1],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[2],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[0],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[1],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[2],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[0],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[1],
                                        (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */
 
-                                    fprintf( out_control->etor, 
-                                            "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-                                            dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2], 
-                                            dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], 
-                                            dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2],
-                                            dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] );
+                                    fprintf( out_control->etor,
+                                             "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                                             dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2],
+                                             dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2],
+                                             dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2],
+                                             dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] );
 #endif
 
 #ifdef TEST_FORCES
-                                    // Torsion Forces 
-                                    Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., 
-                                            workspace->f_tor, workspace->f_tor);
+                                    /* Torsion Forces */
+                                    Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0.,
+                                                  workspace->f_tor, workspace->f_tor);
                                     Add_dDelta( system, lists, j, CEtors3, workspace->f_tor );
                                     Add_dDelta( system, lists, k, CEtors3, workspace->f_tor );
                                     Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor );
@@ -639,7 +654,7 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                     rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk );
                                     rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl );
 
-                                    // Conjugation Forces 
+                                    /* Conjugation Forces */
                                     Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con );
                                     Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con );
                                     Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con );
@@ -666,12 +681,12 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
         } // pk loop ends
     } // j loop
 
-    /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", 
+    /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n",
        data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
 
 #ifdef TEST_FORCES
     fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs );
-    fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", 
-            data->E_Tor, data->E_Con );
+    fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n",
+             data->E_Tor, data->E_Con );
 #endif
 }
diff --git a/PuReMD-GPU/src/four_body_interactions.h b/PuReMD-GPU/src/four_body_interactions.h
index 8e8dd7c0991a747000e77b2d460711e433db52ef..65e315a94f95239ad8c7081a00c32cc0d3264cd3 100644
--- a/PuReMD-GPU/src/four_body_interactions.h
+++ b/PuReMD-GPU/src/four_body_interactions.h
@@ -23,10 +23,9 @@
 
 #include "mytypes.h"
 
-#define MIN_SINE 1e-10
-
 
 void Four_Body_Interactions( reax_system*, control_params*, simulation_data*,
-    static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
+
 
 #endif
diff --git a/PuReMD-GPU/src/geo_tools.c b/PuReMD-GPU/src/geo_tools.c
new file mode 100644
index 0000000000000000000000000000000000000000..f3c3bd48140f9fd46ac3e460a0ab4bd33ade85b9
--- /dev/null
+++ b/PuReMD-GPU/src/geo_tools.c
@@ -0,0 +1,797 @@
+/*----------------------------------------------------------------------
+  SerialReax - Reax Force Field Simulator
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include <ctype.h>
+
+#include "geo_tools.h"
+#include "allocate.h"
+#include "box.h"
+#include "list.h"
+#include "restart.h"
+#include "tool_box.h"
+#include "vector.h"
+
+
+/********************* geo format routines ******************/
+/* Scan a geo stream for the atom count, then rewind past its first two lines */
+void Count_Geo_Atoms( FILE *geo, reax_system *system )
+{
+    int remaining, id;
+    rvec pos;
+    char elem[3], label[9], skipped[MAX_LINE + 1];
+
+    /* the count comes first and is stored in system->N */
+    fscanf( geo, " %d", &(system->N) );
+
+    /* consume one record per atom; the parsed values are discarded */
+    for ( remaining = system->N; remaining > 0; --remaining )
+    {
+        fscanf( geo, CUSTOM_ATOM_FORMAT,
+                &id, elem, label, &pos[0], &pos[1], &pos[2] );
+        Fit_to_Periodic_Box( &(system->box), &pos );
+    }
+
+    fseek( geo, 0, SEEK_SET ); /* back to the beginning of the file */
+    fgets( skipped, MAX_LINE, geo ); /* presumably the box line -- confirm */
+    fgets( skipped, MAX_LINE, geo ); /* presumably the atom-count line -- confirm */
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "N = %d\n\n", system->N );
+#endif
+}
+
+
+/* Read a custom-format geometry file: set up the box, size the system and
+   fill in per-atom type, name and position (velocity, force, charge zeroed).
+   Exits on an unreadable file or failed allocation; returns SUCCESS otherwise.
+   The data argument is accepted but not referenced here. */
+char Read_Geo( char* geo_file, reax_system* system, control_params *control,
+        simulation_data *data, static_storage *workspace )
+{
+    FILE *fp;
+    char box_tag[9];
+    char elem[3], label[9];
+    int n, id, slot;
+    real len_a, len_b, len_c, ang_alpha, ang_beta, ang_gamma;
+    rvec pos;
+    reax_atom *cur;
+
+    fp = fopen( geo_file, "r" );
+    if ( fp == NULL )
+    {
+        fprintf( stderr, "Error opening the geo file! terminating...\n" );
+        exit( FILE_NOT_FOUND );
+    }
+
+    /* the box record is read first and used to initialize system->box */
+    fscanf( fp, CUSTOM_BOXGEO_FORMAT,
+            box_tag, &len_a, &len_b, &len_c, &ang_alpha, &ang_beta, &ang_gamma );
+    Setup_Box( len_a, len_b, len_c,
+            ang_alpha, ang_beta, ang_gamma, &(system->box) );
+
+    /* learn N, then reserve storage before any atom is stored */
+    Count_Geo_Atoms( fp, system );
+    if ( PreAllocate_Space( system, control, workspace ) == FAILURE )
+    {
+        fprintf( stderr, "PreAllocate_Space: not enough memory!" );
+        fprintf( stderr, "terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* one record per atom; slot advances in lockstep with n */
+    for ( n = 0, slot = 0; n < system->N; ++n, ++slot )
+    {
+        fscanf( fp, CUSTOM_ATOM_FORMAT,
+                &id, elem, label, &pos[0], &pos[1], &pos[2] );
+        Fit_to_Periodic_Box( &(system->box), &pos );
+#if defined(DEBUG)
+        fprintf( stderr, "atom%d: %s %s %f %f %f\n",
+                 id, elem, label, pos[0], pos[1], pos[2] );
+#endif
+
+        cur = &(system->atoms[slot]);
+        workspace->orig_id[n] = id;
+        cur->type = Get_Atom_Type( &(system->reaxprm), elem );
+        strcpy( cur->name, label );
+        rvec_Copy( cur->x, pos );
+        rvec_MakeZero( cur->v );
+        rvec_MakeZero( cur->f );
+        cur->q = 0.;
+    }
+
+    fclose( fp );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "finished reading the geo file\n" );
+#endif
+
+    return SUCCESS;
+}
+
+
+/* Find the cell record in a geometry file ("CRYST1" for PDB, "BOX" otherwise)
+   and initialize system->box from it; geo is rewound before scanning.
+   Returns SUCCESS once a record has been parsed; FAILURE if no record is
+   found, it is incomplete, or the format has no parser in this routine. */
+int Read_Box_Info( reax_system *system, FILE *geo, int geo_format )
+{
+    const char *cryst;
+    char  line[MAX_LINE + 1];
+    char  descriptor[9];
+    char  s_a[12], s_b[12], s_c[12], s_alpha[12], s_beta[12], s_gamma[12];
+    char  s_group[12], s_zValue[12];
+
+    fseek( geo, 0, SEEK_SET ); /* scan from the beginning of the file */
+    cryst = (geo_format == PDB) ? "CRYST1" : "BOX";
+
+    while ( fgets( line, MAX_LINE, geo ) )
+    {
+        /* compare only the keyword's own length: a fixed count of 6 also
+           compares the terminator of "BOX" against the line */
+        if ( strncmp( line, cryst, strlen( cryst ) ) != 0 )
+        {
+            continue;
+        }
+
+        /* Only the PDB layout is parsed here; for any other format the s_*
+           buffers would reach atof uninitialized, so report failure instead.
+           The first seven conversions are the keyword and the six cell
+           parameters, all of which are needed below. */
+        if ( geo_format != PDB ||
+                sscanf( line, PDB_CRYST1_FORMAT, &descriptor[0],
+                        &s_a[0], &s_b[0], &s_c[0],
+                        &s_alpha[0], &s_beta[0], &s_gamma[0],
+                        &s_group[0], &s_zValue[0] ) < 7 )
+        {
+            return FAILURE;
+        }
+
+        /* compute full volume tensor from the lengths and angles */
+        Setup_Box( atof(s_a), atof(s_b), atof(s_c),
+                atof(s_alpha), atof(s_beta), atof(s_gamma),
+                &(system->box) );
+        return SUCCESS;
+    }
+
+    /* end of file or read error: either way no box was set up */
+    return FAILURE;
+}
+
+
+/* Count the ATOM/HETATM records of a PDB stream into system->N; each record's
+   coordinate fields are parsed, given to Fit_to_Periodic_Box, then discarded */
+void Count_PDB_Atoms( FILE *geo, reax_system *system )
+{
+    char record[MAX_LINE + 1];
+    char fld_x[9], fld_y[9], fld_z[9];
+    char *stop = NULL;
+    rvec pos;
+
+    fseek( geo, 0, SEEK_SET ); /* start scanning from the top of the file */
+    system->N = 0;
+
+    while ( fgets( record, MAX_LINE, geo ) )
+    {
+        if ( strncmp( record, "ATOM", 4 ) != 0 &&
+                strncmp( record, "HETATM", 6 ) != 0 )
+        {
+            continue;
+        }
+        system->N++;
+
+        /* three consecutive 8-character fields starting at offset 30 */
+        strncpy( fld_x, record + 30, 8 );
+        strncpy( fld_y, record + 38, 8 );
+        strncpy( fld_z, record + 46, 8 );
+        fld_x[8] = fld_y[8] = fld_z[8] = 0;
+        Make_Point( strtod( fld_x, &stop ), strtod( fld_y, &stop ),
+                    strtod( fld_z, &stop ), &pos );
+        Fit_to_Periodic_Box( &(system->box), &pos );
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "count atoms:\n" );
+    fprintf( stderr, "N = %d\n\n", system->N );
+#endif
+}
+
+
+/* Read a PDB file into system/workspace.
+
+   Sets up the box via Read_Box_Info, counts the ATOM/HETATM records to
+   size the storage, then re-reads the file and, for every such record,
+   stores the serial (workspace->orig_id), type, name and position;
+   velocity, force and charge are zeroed. CONECT records are recognized but
+   their handling is currently commented out.
+
+   Exits if the file cannot be opened, Read_Box_Info fails or allocation
+   fails. Returns FAILURE on a read error, SUCCESS otherwise.
+   NOTE(review): the buffers from Allocate_Tokenizer_Space are never
+   released here -- confirm the matching deallocation routine. */
+char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
+               simulation_data *data, static_storage *workspace )
+{
+
+    FILE  *pdb;
+    char **tmp;
+    char  *s, *s1;
+    char   descriptor[9], serial[9];
+    char   atom_name[9], res_name[9], res_seq[9];
+    char   s_x[9], s_y[9], s_z[9];
+    char   occupancy[9], temp_factor[9];
+    char   seg_id[9], element[9], charge[9];
+    char   alt_loc, chain_id, icode;
+    char  *endptr = NULL;
+    int    i, c, c1, pdb_serial, top;
+    rvec   x;
+    reax_atom *atom;
+
+    /* open pdb file */
+    if ( (pdb = fopen(pdb_file, "r")) == NULL )
+    {
+        fprintf( stderr, "fopen: error opening the pdb file! terminating...\n" );
+        exit( FILE_NOT_FOUND );
+    }
+
+    /* allocate memory for tokenizing pdb lines */
+    if ( Allocate_Tokenizer_Space( &s, &s1, &tmp ) == FAILURE )
+    {
+        fprintf( stderr, "Allocate_Tokenizer_Space: not enough memory!" );
+        fprintf( stderr, "terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* read box information */
+    if ( Read_Box_Info( system, pdb, PDB ) == FAILURE )
+    {
+        fprintf( stderr, "Read_Box_Info: no CRYST line in the pdb file!" );
+        fprintf( stderr, "terminating...\n" );
+        exit( INVALID_GEO );
+    }
+
+    Count_PDB_Atoms( pdb, system );
+    if ( PreAllocate_Space( system, control, workspace ) == FAILURE )
+    {
+        fprintf( stderr, "PreAllocate_Space: not enough memory!" );
+        fprintf( stderr, "terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* start reading and processing the pdb file */
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "starting to read the pdb file\n" );
+#endif
+    fseek( pdb, 0, SEEK_SET );
+    c  = 0;
+    c1 = 0;
+    top = 0;
+    s[0] = 0;
+
+    while ( fgets( s, MAX_LINE, pdb ) )
+    {
+        /* read new line and tokenize it */
+        strncpy( s1, s, MAX_LINE - 1 );
+        c1 = Tokenize( s, &tmp );
+
+        /* process new line */
+        if ( strncmp(tmp[0], "ATOM", 4) == 0 || strncmp(tmp[0], "HETATM", 6) == 0 )
+        {
+            if ( strncmp(tmp[0], "ATOM", 4) == 0 )
+            {
+                strncpy( &descriptor[0], s1, 6 );
+                descriptor[6] = 0;
+                strncpy( &serial[0], s1 + 6, 5 );
+                serial[5] = 0;
+                strncpy( &atom_name[0], s1 + 12, 4 );
+                atom_name[4] = 0;
+                //strncpy( &serial[0], s1+6, 7 );       serial[7] = 0;
+                //strncpy( &atom_name[0], s1+13, 3 );   atom_name[3] = 0;
+                alt_loc = s1[16];
+                strncpy( &res_name[0], s1 + 17, 3 );
+                res_name[3] = 0;
+                chain_id = s1[21];
+                strncpy( &res_seq[0], s1 + 22, 4 );
+                res_seq[4] = 0;
+                icode = s1[26];
+                strncpy( &s_x[0], s1 + 30, 8 );
+                s_x[8] = 0;
+                strncpy( &s_y[0], s1 + 38, 8 );
+                s_y[8] = 0;
+                strncpy( &s_z[0], s1 + 46, 8 );
+                s_z[8] = 0;
+                strncpy( &occupancy[0], s1 + 54, 6 );
+                occupancy[6] = 0;
+                strncpy( &temp_factor[0], s1 + 60, 6 );
+                temp_factor[6] = 0;
+                strncpy( &seg_id[0], s1 + 72, 4 );
+                seg_id[4] = 0;
+                strncpy( &element[0], s1 + 76, 2 );
+                element[2] = 0;
+                strncpy( &charge[0], s1 + 78, 2 );
+                charge[2] = 0;
+            }
+            else if (strncmp(tmp[0], "HETATM", 6) == 0)
+            {
+                strncpy( &descriptor[0], s1, 6 );
+                descriptor[6] = 0;
+                strncpy( &serial[0], s1 + 6, 5 );
+                serial[5] = 0;
+                strncpy( &atom_name[0], s1 + 12, 4 );
+                atom_name[4] = 0;
+                //strncpy( &serial[0], s1+6, 7 );       serial[7] = 0;
+                //strncpy( &atom_name[0], s1+13, 3 );   atom_name[3] = 0;
+                alt_loc = s1[16];
+                strncpy( &res_name[0], s1 + 17, 3 );
+                res_name[3] = 0;
+                chain_id = s1[21];
+                strncpy( &res_seq[0], s1 + 22, 4 );
+                res_seq[4] = 0;
+                icode = s1[26];
+                strncpy( &s_x[0], s1 + 30, 8 );
+                s_x[8] = 0;
+                strncpy( &s_y[0], s1 + 38, 8 );
+                s_y[8] = 0;
+                strncpy( &s_z[0], s1 + 46, 8 );
+                s_z[8] = 0;
+                strncpy( &occupancy[0], s1 + 54, 6 );
+                occupancy[6] = 0;
+                strncpy( &temp_factor[0], s1 + 60, 6 );
+                temp_factor[6] = 0;
+                //strncpy( &seg_id[0], s1+72, 4 );      seg_id[4] = 0;
+                strncpy( &element[0], s1 + 76, 2 );
+                element[2] = 0;
+                strncpy( &charge[0], s1 + 78, 2 );
+                charge[2] = 0;
+            }
+
+            /* if the point is inside my_box, add it to my lists */
+            Make_Point( strtod( &s_x[0], &endptr ),
+                        strtod( &s_y[0], &endptr ),
+                        strtod( &s_z[0], &endptr ), &x );
+
+            Fit_to_Periodic_Box( &(system->box), &x );
+
+            /* store orig_id, type, name and coord info of the new atom */
+            atom = &(system->atoms[top]);
+            pdb_serial = (int) strtod( &serial[0], &endptr );
+            workspace->orig_id[top] = pdb_serial;
+
+            Trim_Spaces( element );
+            atom->type = Get_Atom_Type( &(system->reaxprm), element );
+            strcpy( atom->name, atom_name );
+
+            rvec_Copy( atom->x, x );
+            rvec_MakeZero( atom->v );
+            rvec_MakeZero( atom->f );
+            atom->q = 0;
+
+            top++;
+
+            c++;
+        }
+
+        /* IMPORTANT: We do not check for the soundness of restrictions here.
+           When atom2 is on atom1's restricted list, and there is a restriction
+           on atom2, then atom1 has to be on atom2's restricted list, too.
+           However, we do not check if this is the case in the input file,
+           this is upto the user. */
+        else if (!strncmp( tmp[0], "CONECT", 6 ))
+        {
+            if ( control->restrict_bonds )
+            {
+                /* error check */
+                // Check_Input_Range( c1 - 2, 0, MAX_RESTRICT,
+                // "CONECT line exceeds max num restrictions allowed.\n" );
+
+                /* read bond restrictions */
+                // if( is_Valid_Serial( workspace, pdb_serial = atoi(tmp[1]) ) )
+                //   ratom = workspace->map_serials[ pdb_serial ];
+
+                // workspace->restricted[ ratom ] = c1 - 2;
+                // for( i = 2; i < c1; ++i )
+                //  {
+                //    if( is_Valid_Serial(workspace, pdb_serial = atoi(tmp[i])) )
+                //        workspace->restricted_list[ ratom ][ i-2 ] =
+                //          workspace->map_serials[ pdb_serial ];
+                //  }
+
+                // fprintf( stderr, "restriction on %d:", ratom );
+                // for( i = 0; i < workspace->restricted[ ratom ]; ++i )
+                // fprintf( stderr, "  %d",
+                //          workspace->restricted_list[ratom][i] );
+                // fprintf( stderr, "\n" );
+            }
+        }
+
+        /* clear previous input line */
+        s[0] = 0;
+        for ( i = 0; i < c1; ++i )
+            tmp[i][0] = 0;
+    }
+    if ( ferror( pdb ) )
+    {
+        fclose( pdb );
+        return FAILURE;
+    }
+
+    fclose( pdb );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "finished reading the pdb file\n" );
+#endif
+
+    return SUCCESS;
+}
+
+
+/* Write the current configuration to "<sim_name>-<step>.pdb": one CRYST1
+   record followed by one ATOM record per atom.
+
+   system:      box and atoms to write
+   bonds:       unused (connect records are not written yet)
+   data:        supplies the step number for the file name
+   control:     supplies the simulation name for the file name
+   workspace:   supplies the serial written for each atom (orig_id)
+   out_control: its log receives a progress message
+
+   PDB serials are written without regard to the order, we'll see if this
+   causes trouble, if so we'll have to rethink this approach.
+
+   Returns SUCCESS, or FAILURE if the output file cannot be opened. */
+char Write_PDB( reax_system* system, list* bonds, simulation_data *data,
+        control_params *control, static_storage *workspace, output_controls *out_control )
+{
+    int i, buffer_req, buffer_len;
+    char name[9];
+    real alpha, beta, gamma;
+    reax_atom *p_atom;
+    char fname[MAX_STR];
+    char *line;
+    char *buffer;
+    FILE *pdb;
+
+    /* Allocation: one byte beyond the record length(s) so that the
+       terminating NUL written below stays inside each block */
+    line = (char*) smalloc( sizeof(char) * (PDB_ATOM_FORMAT_O_LENGTH + 1), "geo:line" );
+    buffer_req = system->N * PDB_ATOM_FORMAT_O_LENGTH + 1;
+    buffer = (char*) smalloc( sizeof(char) * buffer_req, "geo:buffer" );
+
+    line[0] = 0;
+    buffer[0] = 0;
+
+    /* Writing Box information: angles between pairs of box vectors */
+    gamma = ACOS( (system->box.box[0][0] * system->box.box[1][0] +
+                   system->box.box[0][1] * system->box.box[1][1] +
+                   system->box.box[0][2] * system->box.box[1][2]) /
+                  (system->box.box_norms[0] * system->box.box_norms[1]) );
+    beta  = ACOS( (system->box.box[0][0] * system->box.box[2][0] +
+                   system->box.box[0][1] * system->box.box[2][1] +
+                   system->box.box[0][2] * system->box.box[2][2]) /
+                  (system->box.box_norms[0] * system->box.box_norms[2]) );
+    alpha = ACOS( (system->box.box[2][0] * system->box.box[1][0] +
+                   system->box.box[2][1] * system->box.box[1][1] +
+                   system->box.box[2][2] * system->box.box[1][2]) /
+                  (system->box.box_norms[2] * system->box.box_norms[1]) );
+
+    /* open pdb and write header; the name is bounded by the size of fname */
+    snprintf( fname, MAX_STR, "%s-%d.pdb", control->sim_name, data->step );
+    pdb = fopen( fname, "w" );
+    if ( pdb == NULL )
+    {
+        fprintf( stderr, "Write_PDB: error opening %s for writing\n", fname );
+        free( buffer );
+        free( line );
+        return FAILURE;
+    }
+    fprintf( pdb, PDB_CRYST1_FORMAT_O,
+             "CRYST1",
+             system->box.box_norms[0], system->box.box_norms[1],
+             system->box.box_norms[2],
+             RAD2DEG(alpha), RAD2DEG(beta), RAD2DEG(gamma), " ", 0 );
+    fprintf( out_control->log, "Box written\n" );
+    fflush( out_control->log );
+
+    /* write atom lines to buffer, one fixed-length slot per atom */
+    for ( i = 0; i < system->N; i++)
+    {
+        p_atom = &(system->atoms[i]);
+        /* bounded copy, then terminate: strncpy leaves an 8-character
+           name without a NUL, which Trim_Spaces would then run past */
+        strncpy( name, p_atom->name, 8 );
+        name[8] = 0;
+        Trim_Spaces(name);
+        snprintf( line, PDB_ATOM_FORMAT_O_LENGTH + 1, PDB_ATOM_FORMAT_O,
+                 "ATOM  ", workspace->orig_id[i], p_atom->name, ' ', "REX", ' ', 1, ' ',
+                 p_atom->x[0], p_atom->x[1], p_atom->x[2],
+                 1.0, 0.0, "0", name, "  " );
+        /* NOTE(review): unconditional per-atom trace; possibly leftover debugging */
+        fprintf( stderr, "PDB NAME <%s>\n", p_atom->name );
+        strncpy( buffer + i * PDB_ATOM_FORMAT_O_LENGTH, line,
+                 PDB_ATOM_FORMAT_O_LENGTH );
+    }
+
+    buffer_len = system->N * PDB_ATOM_FORMAT_O_LENGTH;
+    buffer[buffer_len] = 0;
+
+    fprintf( pdb, "%s", buffer );
+    fclose( pdb );
+
+    free(buffer);
+    free(line);
+
+    return SUCCESS;
+}
+
+
+/* Reads a BIOGRAF (.bgf) geometry file in two passes: the first pass counts
+ * the ATOM/HETATM records so that PreAllocate_Space is called with
+ * system->N already set, the second pass parses the ATOM/HETATM, CRYSTX
+ * and CONECT records.
+ *
+ * bgf_file:  path of the file to read
+ * system:    receives the atom count, atom positions/names/types and the box
+ * control:   forwarded to PreAllocate_Space
+ * data:      not referenced by this routine
+ * workspace: receives the serial <-> index maps and the CONECT restrictions
+ *
+ * Returns SUCCESS, or FAILURE if a read error is reported for the file.
+ * Exits the program if the file cannot be opened or memory runs out. */
+char Read_BGF( char* bgf_file, reax_system* system, control_params *control,
+               simulation_data *data, static_storage *workspace )
+{
+    FILE *bgf;
+    char **tokens;
+    char *line, *backup;
+    char descriptor[10], serial[10], atom_name[10], element[10];
+    char s_x[12], s_y[12], s_z[12];
+    char s_a[12], s_b[12], s_c[12], s_alpha[12], s_beta[12], s_gamma[12];
+    char *endptr = NULL;
+    char ret = FAILURE;
+    int  i, atom_cnt, token_cnt, bgf_serial, ratom;
+
+    /* open biograf file */
+    if ( (bgf = fopen( bgf_file, "r" )) == NULL )
+    {
+        fprintf( stderr, "Error opening the bgf file!\n" );
+        exit( FILE_NOT_FOUND );
+    }
+
+    /* allocate memory for tokenizing biograf file lines */
+    line   = (char*)  malloc( sizeof(char)  * MAX_LINE );
+    backup = (char*)  malloc( sizeof(char)  * MAX_LINE );
+    tokens = (char**) malloc( sizeof(char*) * MAX_TOKENS );
+    if ( line == NULL || backup == NULL || tokens == NULL )
+    {
+        fprintf( stderr, "Read_BGF: not enough memory! terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+    for ( i = 0; i < MAX_TOKENS; i++ )
+    {
+        tokens[i] = (char*) malloc( sizeof(char) * MAX_TOKEN_LEN );
+        if ( tokens[i] == NULL )
+        {
+            fprintf( stderr, "Read_BGF: not enough memory! terminating...\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+    }
+
+    /* strncpy below writes at most MAX_LINE - 1 bytes into backup, so this
+       last byte is never overwritten and backup always stays terminated */
+    backup[MAX_LINE - 1] = 0;
+
+    /* first pass: count number of atoms in the bgf file */
+    system->N = 0;
+    line[0] = 0;
+
+    while ( fgets( line, MAX_LINE, bgf ) )
+    {
+        tokens[0][0] = 0;
+        token_cnt = Tokenize( line, &tokens );
+
+        /* same prefix test as the parsing pass below, so that every record
+           stored there has been counted here */
+        if ( !strncmp(tokens[0], "ATOM", 4) || !strncmp(tokens[0], "HETATM", 6) )
+        {
+            (system->N)++;
+        }
+
+        line[0] = 0;
+    }
+    if ( ferror( bgf ) )
+    {
+        fclose( bgf );
+        goto cleanup;
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "system->N: %d\n", system->N );
+#endif
+
+    fclose( bgf );
+
+    /* memory allocations for atoms, atom maps, bond restrictions */
+    //TODO: setup similar for BGF
+//    Count_PDB_Atoms( pdb, system );
+    if ( PreAllocate_Space( system, control, workspace ) == FAILURE )
+    {
+        fprintf( stderr, "PreAllocate_Space: not enough memory!" );
+        fprintf( stderr, "terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* second pass: start reading and processing bgf file */
+    if ( (bgf = fopen( bgf_file, "r" )) == NULL )
+    {
+        fprintf( stderr, "Error opening the bgf file!\n" );
+        exit( FILE_NOT_FOUND );
+    }
+    atom_cnt = 0;
+    token_cnt = 0;
+
+    while ( fgets( line, MAX_LINE, bgf ) )
+    {
+        /* keep a copy of the raw line for fixed-column parsing, then
+           tokenize it; tokens[0] is cleared first so that a line yielding
+           no tokens cannot be mistaken for the previous record */
+        strncpy( backup, line, MAX_LINE - 1 );
+        tokens[0][0] = 0;
+        token_cnt = Tokenize( line, &tokens );
+
+        /* process new line */
+        if ( !strncmp(tokens[0], "ATOM", 4) || !strncmp(tokens[0], "HETATM", 6) )
+        {
+            /* ATOM and HETATM records are read with the same column layout,
+               bgf hetatm:
+               (7x,i5,1x,a5,1x,a3,1x,a1,1x,a5,3f10.5,1x,a5,i3,i2,1x,f8.5)
+               offset/width: serial 7/5, atom name 13/5, residue name 19/3,
+               chain id 23/1, residue seq 25/5, x 30/10, y 40/10, z 50/10,
+               element 61/5, occupancy 66/3, temp factor 69/2, charge 72/8.
+               Only the fields consumed below are extracted. */
+            strncpy( serial, backup + 7, 5 );
+            serial[5] = 0;
+            strncpy( atom_name, backup + 13, 5 );
+            atom_name[5] = 0;
+            strncpy( s_x, backup + 30, 10 );
+            s_x[10] = 0;
+            strncpy( s_y, backup + 40, 10 );
+            s_y[10] = 0;
+            strncpy( s_z, backup + 50, 10 );
+            s_z[10] = 0;
+            strncpy( element, backup + 61, 5 );
+            element[5] = 0;
+
+            /* add to mapping (the serial is an integer field) */
+            bgf_serial = (int) strtol( serial, &endptr, 10 );
+            Check_Input_Range( bgf_serial, 0, MAX_ATOM_ID, "Invalid bgf serial" );
+            workspace->map_serials[ bgf_serial ] = atom_cnt;
+            workspace->orig_id[ atom_cnt ] = bgf_serial;
+
+            /* copy atomic positions */
+            system->atoms[atom_cnt].x[0] = strtod( s_x, &endptr );
+            system->atoms[atom_cnt].x[1] = strtod( s_y, &endptr );
+            system->atoms[atom_cnt].x[2] = strtod( s_z, &endptr );
+
+            /* atom name and type */
+            strcpy( system->atoms[atom_cnt].name, atom_name );
+            Trim_Spaces( element );
+            system->atoms[atom_cnt].type =
+                Get_Atom_Type( &(system->reaxprm), element );
+
+            atom_cnt++;
+        }
+        else if ( !strncmp( tokens[0], "CRYSTX", 6 ) )
+        {
+            sscanf( backup, BGF_CRYSTX_FORMAT,
+                    descriptor, s_a, s_b, s_c, s_alpha, s_beta, s_gamma );
+
+            /* Compute full volume tensor from the angles */
+            Setup_Box( atof(s_a), atof(s_b), atof(s_c),
+                       atof(s_alpha), atof(s_beta), atof(s_gamma),
+                       &(system->box) );
+        }
+        else if ( !strncmp( tokens[0], "CONECT", 6 ) )
+        {
+            /* check number of restrictions */
+            Check_Input_Range( token_cnt - 2, 0, MAX_RESTRICT,
+                               "CONECT line exceeds max restrictions allowed.\n" );
+
+            /* read bond restrictions; the record is skipped when its first
+               serial is not accepted, instead of writing through the atom
+               index left over from an earlier record */
+            bgf_serial = atoi( tokens[1] );
+            if ( is_Valid_Serial( workspace, bgf_serial ) )
+            {
+                ratom = workspace->map_serials[ bgf_serial ];
+                workspace->restricted[ ratom ] = token_cnt - 2;
+
+                /* NOTE(review): the row stride used here is system->N while
+                   the per-row bound checked above is MAX_RESTRICT -- confirm
+                   against the restricted_list allocation */
+                for ( i = 2; i < token_cnt; ++i )
+                {
+                    bgf_serial = atoi( tokens[i] );
+                    if ( is_Valid_Serial( workspace, bgf_serial ) )
+                    {
+                        workspace->restricted_list[ ratom * system->N + (i - 2) ] =
+                            workspace->map_serials[ bgf_serial ];
+                    }
+                }
+            }
+        }
+
+        /* clear previous input line */
+        line[0] = 0;
+
+        for ( i = 0; i < token_cnt; ++i )
+        {
+            tokens[i][0] = 0;
+        }
+    }
+    if ( ferror( bgf ) )
+    {
+        fclose( bgf );
+        goto cleanup;
+    }
+
+    fclose( bgf );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "bgf file read\n" );
+#endif
+
+    ret = SUCCESS;
+
+cleanup:
+    /* release the tokenizer buffers on every return path */
+    for ( i = 0; i < MAX_TOKENS; i++ )
+    {
+        free( tokens[i] );
+    }
+    free( tokens );
+    free( backup );
+    free( line );
+
+    return ret;
+}
diff --git a/PuReMD-GPU/src/pdb_tools.h b/PuReMD-GPU/src/geo_tools.h
similarity index 84%
rename from PuReMD-GPU/src/pdb_tools.h
rename to PuReMD-GPU/src/geo_tools.h
index 12518fc2daaecd735f5cb0f781a0f1c72e504aa7..4c44e3081e6105947b610916e95210e123b4a7d9 100644
--- a/PuReMD-GPU/src/pdb_tools.h
+++ b/PuReMD-GPU/src/geo_tools.h
@@ -1,9 +1,10 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
@@ -18,13 +19,20 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __PDB_TOOLS_H_
-#define __PDB_TOOLS_H_
+#ifndef __GEO_TOOLS_H_
+#define __GEO_TOOLS_H_
 
 #include "mytypes.h"
 
-/*
-PDB format :
+// CUSTOM_BOXGEO: BOXGEO box_x box_y box_z  angle1 angle2 angle3
+#define CUSTOM_BOXGEO_FORMAT " %s %lf %lf %lf %lf %lf %lf"
+// CUSTOM ATOM: serial element name x y z
+#define CUSTOM_ATOM_FORMAT " %d %s %s %lf %lf %lf"
+
+char Read_Geo( char*, reax_system*, control_params*,
+        simulation_data*, static_storage* );
+
+/* PDB format :
 http://www.rcsb.org/pdb/file_formats/pdb/pdbguide2.2/guide2.2_frame.html
 
 #define PDB_ATOM_FORMAT   "%6s%5d%4s%c%4s%c%4d%c%8s%8s%8s%6s%6s%4s%2s%2s\n"
@@ -94,24 +102,28 @@ COLUMNS       DATA TYPE       FIELD         DEFINITION
 67 - 70      Integer         z             Z value
 */
 
-//#define PDB_ATOM_FORMAT "ATOM  %4d%4s%c%3s%c%4d%c%8.3f%8.3f%8.3f%6.2f%6.2f%-4s%2s%2s\n"
+//#define PDB_ATOM_FORMAT
+//"ATOM  %4d%4s%c%3s%c%4d%c%8.3f%8.3f%8.3f%6.2f%6.2f%-4s%2s%2s\n"
 
 #define PDB_ATOM_FORMAT   "%6s%5d%4s%c%4s%c%4d%c%8s%8s%8s%6s%6s%4s%2s%2s\n"
+#define PDB_ATOM_FORMAT_LENGTH 71
 #define PDB_HETATM_FORMAT "%6s%5d%4s%c%4s%c%4d%c%8s%8s%8s%6s%6s%2s%2s\n"
 #define PDB_CONECT_FORMAT "%6s%5d%5d%5d%5d%5d\n"
 #define PDB_CRYST1_FORMAT "%6s%9s%9s%9s%7s%7s%7s%11s%4s\n"
 
 #define PDB_ATOM_FORMAT_O "%6s%5d %4s%c%3s %c%4d%c   %8.3f%8.3f%8.3f%6.2f%6.2f      %-4s%2s%2s\n"
+#define PDB_ATOM_FORMAT_O_LENGTH 81
 #define PDB_CRYST1_FORMAT_O "%6s%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f%11s%4d\n"
 
 #define BGF_CRYSTX_FORMAT "%8s%11s%11s%11s%11s%11s%11s"
 
 char Read_PDB( char*, reax_system*, control_params*,
-               simulation_data*, static_storage* );
+        simulation_data*, static_storage* );
+
 char Read_BGF( char*, reax_system*, control_params*,
-               simulation_data*, static_storage* );
+        simulation_data*, static_storage* );
 
-char Write_PDB( reax_system*, control_params*, simulation_data*,
-                static_storage*, list*, output_controls* );
+char Write_PDB( reax_system*, list*, simulation_data*,
+        control_params*, static_storage*, output_controls* );
 
 #endif
diff --git a/PuReMD-GPU/src/grid.c b/PuReMD-GPU/src/grid.c
index fb09b409194a84b1646da3b779aad8b547ff9db3..2077d56080799f3674eecb3038a0f30bd318bd67 100644
--- a/PuReMD-GPU/src/grid.c
+++ b/PuReMD-GPU/src/grid.c
@@ -1,28 +1,29 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
 #include "grid.h"
 
+#include "index_utils.h"
 #include "reset_utils.h"
 #include "vector.h"
-#include "index_utils.h"
 
 
 int Estimate_GCell_Population( reax_system* system )
@@ -34,36 +35,45 @@ int Estimate_GCell_Population( reax_system* system )
     g = &( system->g );
     Reset_Grid( g );
 
-    for( l = 0; l < system->N; l++ ) {
+    for ( l = 0; l < system->N; l++ )
+    {
         i = (int)(system->atoms[l].x[0] * g->inv_len[0]);
         j = (int)(system->atoms[l].x[1] * g->inv_len[1]);
         k = (int)(system->atoms[l].x[2] * g->inv_len[2]);
         g->top[index_grid_3d (i, j, k, g)]++;
-        // fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n", 
+        // fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n",
         // l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2],
         // i, j, k );
     }
 
     max_atoms = 0;
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ )
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
                 if( max_atoms < g->top[index_grid_3d (i, j, k, g)] )
+                {
                     max_atoms = g->top[index_grid_3d (i, j, k, g)];  
+                }
+            }
+        }
+    }
 
-    return MAX(max_atoms*SAFE_ZONE, MIN_GCELL_POPL); 
+    return MAX(max_atoms * SAFE_ZONE, MIN_GCELL_POPL);
 }
 
 
 void Allocate_Space_for_Grid( reax_system *system )
 {
     int i, j, k, l;
-    grid *g = &(system->g);
-
-    int total = g->ncell[0] * g->ncell[1] * g->ncell[2];
+    grid *g;
+    int total;
 
     g = &(system->g);
-    g->max_nbrs = (2*g->spread[0]+1) * (2*g->spread[1]+1) * (2*g->spread[2]+1)+3; 
+    g->max_nbrs = (2 * g->spread[0] + 1) * (2 * g->spread[1] + 1) * (2 * g->spread[2] + 1) + 3;
+    total = g->ncell[0] * g->ncell[1] * g->ncell[2];
 
     /* allocate space for the new grid */
     g->top = (int*) calloc( total, sizeof( int ));
@@ -73,10 +83,14 @@ void Allocate_Space_for_Grid( reax_system *system )
     g->nbrs = (ivec*) calloc( total * g->max_nbrs, sizeof( ivec ));
     g->nbrs_cp = (rvec*) calloc( total * g->max_nbrs, sizeof( rvec ));
 
-    for( i = 0; i < g->ncell[0]; i++ ) {
-        for( j = 0; j < g->ncell[1]; j++ ) {
-            for( k = 0; k < g->ncell[2]; k++ ) {
-                for( l = 0; l < g->max_nbrs; ++l ){ 
+    for( i = 0; i < g->ncell[0]; i++ )
+    {
+        for( j = 0; j < g->ncell[1]; j++ )
+        {
+            for( k = 0; k < g->ncell[2]; k++ )
+            {
+                for( l = 0; l < g->max_nbrs; ++l )
+                { 
                     g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][0] = -1;
                     g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][1] = -1;
                     g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][2] = -1;
@@ -110,49 +124,74 @@ int Shift(int p, int dp, int dim, grid *g )
     int dim_len = 0;
     int newp = p + dp;
 
-    switch( dim ) {
-        case 0: dim_len = g->ncell[0];
-            break;
-        case 1: dim_len = g->ncell[1];
-            break;
-        case 2: dim_len = g->ncell[2];
+    switch ( dim )
+    {
+    case 0:
+        dim_len = g->ncell[0];
+        break;
+    case 1:
+        dim_len = g->ncell[1];
+        break;
+    case 2:
+        dim_len = g->ncell[2];
+    }
+
+    while ( newp < 0 )
+    {
+        newp = newp + dim_len;
+    }
+    while ( newp >= dim_len )
+    {
+        newp = newp - dim_len;
     }
 
-    while( newp < 0 )        newp = newp + dim_len;
-    while( newp >= dim_len ) newp = newp - dim_len;
     return newp;
 }
 
 
 /* finds the closest point between two grid cells denoted by c1 and c2.
    periodic boundary conditions are taken into consideration as well. */
-void Find_Closest_Point( grid *g, int c1x, int c1y, int c1z, 
-        int c2x, int c2y, int c2z, rvec closest_point )
+void Find_Closest_Point( grid *g, int c1x, int c1y, int c1z,
+                         int c2x, int c2y, int c2z, rvec closest_point )
 {
     int  i, d;
     ivec c1 = { c1x, c1y, c1z };
     ivec c2 = { c2x, c2y, c2z };
 
-    for( i = 0; i < 3; i++ ) {
-        if( g->ncell[i] < 5 ) {
+    for ( i = 0; i < 3; i++ )
+    {
+        if ( g->ncell[i] < 5 )
+        {
             closest_point[i] = NEG_INF - 1.;
             continue;
         }
 
         d = c2[i] - c1[i];
-        if( abs(d) <= g->ncell[i] / 2 ) {
-            if( d > 0 )
+        if ( abs(d) <= g->ncell[i] / 2 )
+        {
+            if ( d > 0 )
+            {
                 closest_point[i] = c2[i] * g->len[i];
+            }
             else if ( d == 0 )
+            {
                 closest_point[i] = NEG_INF - 1.;
+            }
             else
+            {
                 closest_point[i] = ( c2[i] + 1 ) * g->len[i];
+            }
         }
-        else {
-            if( d > 0 )
+        else
+        {
+            if ( d > 0 )
+            {
                 closest_point[i] = ( c2[i] - g->ncell[i] + 1 ) * g->len[i];
-            else    
+            }
+            else
+            {
                 closest_point[i] = ( c2[i] + g->ncell[i] ) * g->len[i];
+            }
         }
     }
 }
@@ -168,29 +207,36 @@ void Find_Neighbor_GridCells( grid *g )
     rvec *cp_stack;
 
     /* pick up a cell in the grid */
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
                 nbrs_stack = &( g->nbrs[ index_grid_nbrs (i, j, k, 0, g) ] );
                 cp_stack = &( g->nbrs_cp[ index_grid_nbrs (i, j, k, 0, g) ] );
                 stack_top = 0;
                 //fprintf( stderr, "grid1: %d %d %d\n", i, j, k );
 
                 /* choose an unmarked neighbor cell*/
-                for( di = -g->spread[0]; di <= g->spread[0]; di++ ) {
+                for ( di = -g->spread[0]; di <= g->spread[0]; di++ )
+                {
                     x = Shift( i, di, 0, g );
 
-                    for( dj = -g->spread[1]; dj <= g->spread[1]; dj++ ) {
+                    for ( dj = -g->spread[1]; dj <= g->spread[1]; dj++ )
+                    {
                         y = Shift( j, dj, 1, g );
 
-                        for( dk = -g->spread[2]; dk <= g->spread[2]; dk++ ) {
+                        for ( dk = -g->spread[2]; dk <= g->spread[2]; dk++ )
+                        {
                             z = Shift( k, dk, 2, g );
                             //fprintf( stderr, "\tgrid2: %d %d %d\n", x, y, z );
 
-                            if( !g->mark[ index_grid_3d (x, y, z, g) ] ) {
+                            if( !g->mark[ index_grid_3d (x, y, z, g) ] )
+                            {
                                 /*(di < 0 || // 9 combinations
-                                  (di == 0 && dj < 0) || // 3 combinations
-                                  (di == 0 && dj == 0 && dk < 0) ) )*/ 
+                                 (di == 0 && dj < 0) || // 3 combinations
+                                 (di == 0 && dj == 0 && dk < 0) ) )*/
                                 /* put the neighbor cell into the stack and mark it */
                                 nbrs_stack[stack_top][0] = x;
                                 nbrs_stack[stack_top][1] = y;
@@ -198,8 +244,8 @@ void Find_Neighbor_GridCells( grid *g )
                                 g->mark[ index_grid_3d(x,y,z,g) ] = 1;
 
                                 Find_Closest_Point( g, i, j, k, x, y, z, cp_stack[stack_top] );
-                                //fprintf( stderr, "\tcp: %lf %lf %lf\n", 
-                                // cp_stack[stack_top][0], cp_stack[stack_top][1], 
+                                //fprintf( stderr, "\tcp: %lf %lf %lf\n",
+                                // cp_stack[stack_top][0], cp_stack[stack_top][1],
                                 // cp_stack[stack_top][2]);
                                 stack_top++;
                             }
@@ -220,6 +266,8 @@ void Find_Neighbor_GridCells( grid *g )
                 nbrs_stack[stack_top][2] = -1;
                 Reset_Marks( g, nbrs_stack, stack_top );
             }
+        }
+    }
 }
 
 
@@ -234,9 +282,13 @@ void Setup_Grid( reax_system* system )
     /* determine number of grid cells in each direction */
     ivec_rScale( ncell, 1. / g->cell_size, my_box->box_norms );
 
-    for( d = 0; d < 3; ++d )
-        if( ncell[d] <= 0 )
+    for ( d = 0; d < 3; ++d )
+    {
+        if ( ncell[d] <= 0 )
+        {
             ncell[d] = 1;
+        }
+    }
 
     /* find the number of grid cells */
     g->total = ncell[0] * ncell[1] * ncell[2];
@@ -270,25 +322,34 @@ void Update_Grid( reax_system* system )
     /* determine number of grid cells in each direction */
     ivec_rScale( ncell, 1. / g->cell_size, my_box->box_norms );
 
-    for( d = 0; d < 3; ++d )
-        if( ncell[d] == 0 )
+    for ( d = 0; d < 3; ++d )
+    {
+        if ( ncell[d] == 0 )
+        {
             ncell[d] = 1;
+        }
+    }
 
-    if( ivec_isEqual( ncell, g->ncell ) ) {/* ncell are unchanged */
+    if ( ivec_isEqual( ncell, g->ncell ) ) /* ncell are unchanged */
+    {
         /* update cell lengths */
         rvec_iDivide( g->len, my_box->box_norms, g->ncell );
         rvec_Invert( g->inv_len, g->len );
 
         /* update closest point distances between gcells */
-        for( i = 0; i < g->ncell[0]; i++ )
-            for( j = 0; j < g->ncell[1]; j++ )
-                for( k = 0; k < g->ncell[2]; k++ ) {
+        for ( i = 0; i < g->ncell[0]; i++ )
+        {
+            for ( j = 0; j < g->ncell[1]; j++ )
+            {
+                for ( k = 0; k < g->ncell[2]; k++ )
+                {
                     nbrs = &( g->nbrs[ index_grid_nbrs (i, j, k, 0, g) ] );
                     nbrs_cp = &( g->nbrs_cp[ index_grid_nbrs (i, j, k, 0, g) ] );
                     //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
 
                     itr = 0;
-                    while( nbrs[itr][0] >= 0 ){
+                    while ( nbrs[itr][0] >= 0 )
+                    {
                         x = nbrs[itr][0];
                         y = nbrs[itr][1];
                         z = nbrs[itr][2];
@@ -297,9 +358,12 @@ void Update_Grid( reax_system* system )
                         ++itr;
                     }
                 }
+            }
+        }
     }
-    else{  /* at least one of ncell has changed */
-        Deallocate_Grid_Space( g );    
+    else   /* at least one of ncell has changed */
+    {
+        Deallocate_Grid_Space( g );
         /* update number of grid cells */
         g->total = ncell[0] * ncell[1] * ncell[2];
         ivec_Copy( g->ncell, ncell );
@@ -311,10 +375,10 @@ void Update_Grid( reax_system* system )
         Find_Neighbor_GridCells( g );
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "updated grid: " );
-        fprintf( stderr, "ncell[%d %d %d] ", 
-                g->ncell[0], g->ncell[1], g->ncell[2] );
-        fprintf( stderr, "len[%5.2f %5.2f %5.2f] ", 
-                g->len[0], g->len[1], g->len[2] );
+        fprintf( stderr, "ncell[%d %d %d] ",
+                 g->ncell[0], g->ncell[1], g->ncell[2] );
+        fprintf( stderr, "len[%5.2f %5.2f %5.2f] ",
+                 g->len[0], g->len[1], g->len[2] );
         fprintf( stderr, "g->max_atoms = %d\n", g->max_atoms );
 #endif
     }
@@ -328,40 +392,59 @@ void Bin_Atoms( reax_system* system, static_storage *workspace )
     grid *g = &( system->g );
 
     Reset_Grid( g );
-
-    for( l = 0; l < system->N; l++ ) {
+    for ( l = 0; l < system->N; l++ )
+    {
         i = (int)(system->atoms[l].x[0] * g->inv_len[0]);
         j = (int)(system->atoms[l].x[1] * g->inv_len[1]);
         k = (int)(system->atoms[l].x[2] * g->inv_len[2]);
 
 #ifdef __BNVT_FIX__
-        if (i >= g->ncell[0]) i = g->ncell[0]-1;
-        if (j >= g->ncell[1]) j = g->ncell[1]-1;
-        if (k >= g->ncell[2]) k = g->ncell[2]-1;
+        if (i >= g->ncell[0])
+        {
+            i = g->ncell[0]-1;
+        }
+        if (j >= g->ncell[1])
+        {
+            j = g->ncell[1]-1;
+        }
+        if (k >= g->ncell[2])
+        {
+            k = g->ncell[2]-1;
+        }
 #endif
 
         g->atoms[ index_grid_atoms (i,j,k,g->top[ index_grid_3d (i,j,k,g) ], g) ] = l;
         g->top[index_grid_3d (i,j,k,g) ]++;
 
-        //fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n", 
-        //l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2],
-        //i, j, k );
+        // fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n",
+        // l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2],
+        // i, j, k );
     }
 
     max_atoms = 0;
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ )
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
                 if( max_atoms < g->top[ index_grid_3d (i, j, k, g) ] )
+                {
                     max_atoms = g->top[ index_grid_3d (i, j, k, g) ];  
+                }
+            }
+        }
+    }
 
     /* check if current gcell->max_atoms is safe */
-    if( max_atoms >= g->max_atoms * SAFE_ZONE ) 
-        workspace->realloc.gcell_atoms = MAX(max_atoms*SAFE_ZONE,MIN_GCELL_POPL); 
+    if ( max_atoms >= g->max_atoms * SAFE_ZONE )
+    {
+        workspace->realloc.gcell_atoms = MAX(max_atoms * SAFE_ZONE, MIN_GCELL_POPL);
+    }
 }
 
 
-inline void reax_atom_Copy( reax_atom *dest, reax_atom *src )
+static inline void reax_atom_Copy( reax_atom *dest, reax_atom *src )
 {
     dest->type = src->type;
     rvec_Copy( dest->x, src->x );
@@ -370,30 +453,37 @@ inline void reax_atom_Copy( reax_atom *dest, reax_atom *src )
 }
 
 
-void Copy_Storage( reax_system *system, static_storage *workspace, 
-        int top, int old_id, int old_type, 
-        int *num_H, real *v, real *s, real *t, 
-        int *orig_id, rvec *f_old )
+void Copy_Storage( reax_system *system, static_storage *workspace,
+                   int top, int old_id, int old_type,
+                   int *num_H, real *v, real *s, real *t,
+                   int *orig_id, rvec *f_old )
 {
     int i;
 
-    for( i = 0; i < RESTART+1; ++i )
+    for ( i = 0; i < RESTART + 1; ++i )
+    {
         v[ index_wkspace_sys (i,top, system->N) ] = workspace->v[ index_wkspace_sys (i,old_id, system->N) ];
+    }
 
-    for( i = 0; i < 3; ++i ) {
-        s[ index_wkspace_sys (i,top, system->N) ] = workspace->s[ index_wkspace_sys (i,old_id, system->N) ];
-        t[ index_wkspace_sys (i,top, system->N) ] = workspace->t[ index_wkspace_sys (i,old_id, system->N) ];
+    for ( i = 0; i < 3; ++i )
+    {
+        s[ index_wkspace_sys(i,top, system->N) ] = workspace->s[ index_wkspace_sys(i,old_id, system->N) ];
+        t[ index_wkspace_sys(i,top, system->N) ] = workspace->t[ index_wkspace_sys(i,old_id, system->N) ];
     }
 
     orig_id[top]  = workspace->orig_id[old_id];
 
-    workspace->Hdia_inv[top] = 1. / system->reaxprm.sbp[ old_type ].eta;
     workspace->b_s[top] = -system->reaxprm.sbp[ old_type ].chi;
-    workspace->b_t[top] = -1.0;          
+    workspace->b_t[top] = -1.0;
 
-    if( system->reaxprm.sbp[ old_type ].p_hbond == 1 ) // H atom
+    if ( system->reaxprm.sbp[ old_type ].p_hbond == 1 ) // H atom
+    {
         workspace->hbond_index[top] = (*num_H)++;
-    else workspace->hbond_index[top] = -1;
+    }
+    else
+    {
+        workspace->hbond_index[top] = -1;
+    }
 
     rvec_Copy( f_old[top], workspace->f_old[old_id] );
 }
@@ -404,12 +494,12 @@ void Free_Storage( static_storage *workspace )
     free( workspace->v );
     free( workspace->s );
     free( workspace->t );
-    free( workspace->orig_id );  
+    free( workspace->orig_id );
 }
 
 
-void Assign_New_Storage( static_storage *workspace, 
-        real *v, real *s, real *t, 
+void Assign_New_Storage( static_storage *workspace,
+        real *v, real *s, real *t,
         int *orig_id, rvec *f_old )
 {
     workspace->v = v;
@@ -425,14 +515,20 @@ void Assign_New_Storage( static_storage *workspace,
 
 void Cluster_Atoms( reax_system *system, static_storage *workspace )
 {
-    int         i, j, k, l, top, old_id, num_H = 0;
+    int         i, j, k, l, top, old_id, num_H;
     reax_atom  *old_atom;
-    grid       *g = &( system->g );
-    reax_atom  *new_atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
-    int        *orig_id = (int  *) calloc( system->N, sizeof( int ) );
+    grid       *g;
+    reax_atom  *new_atoms;
+    int        *orig_id ;
     real       *v;
     real       *s, *t;
-    rvec       *f_old = (rvec*) calloc( system->N, sizeof(rvec) );
+    rvec       *f_old;
+
+    num_H = 0;
+    g = &( system->g );
+    new_atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
+    orig_id = (int  *) calloc( system->N, sizeof( int ) );
+    f_old = (rvec*) calloc( system->N, sizeof(rvec) );
 
     s = (real*) calloc( 3, sizeof( real ) * system->N );
     t = (real*) calloc( 3, sizeof( real ) * system->N );
@@ -440,24 +536,30 @@ void Cluster_Atoms( reax_system *system, static_storage *workspace )
 
     top = 0;
 
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
                 g->start[ index_grid_3d (i, j, k, g) ] = top;
 
-                for( l = 0; l < g->top[ index_grid_3d (i, j, k, g) ]; ++l ) {
+                for( l = 0; l < g->top[ index_grid_3d (i, j, k, g) ]; ++l )
+                {
                     old_id   = g->atoms[ index_grid_atoms (i, j, k, l, g) ];
                     old_atom = &( system->atoms[old_id] );
                     // fprintf( stderr, "%d <-- %d\n", top, old_id );
 
                     reax_atom_Copy( &(new_atoms[top]), old_atom );
-                    Copy_Storage( system, workspace, top, old_id, old_atom->type, 
-                            &num_H, v, s, t, orig_id, f_old );
+                    Copy_Storage( system, workspace, top, old_id, old_atom->type,
+                                  &num_H, v, s, t, orig_id, f_old );
                     ++top;
                 }
 
                 g->end[ index_grid_3d (i, j, k, g) ] = top;
             }
+        }
+    }
 
 
     free( system->atoms );
diff --git a/PuReMD-GPU/src/init_md.c b/PuReMD-GPU/src/init_md.c
index 2a2ce1270e2c694722e489b9a3f38f8dd48177a1..d1b40c6a224208ba3b24e00682b6872be8c16752 100644
--- a/PuReMD-GPU/src/init_md.c
+++ b/PuReMD-GPU/src/init_md.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -25,8 +26,8 @@
 #include "forces.h"
 #include "grid.h"
 #include "index_utils.h"
-#include "lin_alg.h"
 #include "integrate.h"
+#include "lin_alg.h"
 #include "neighbors.h"
 #include "list.h"
 #include "lookup.h"
@@ -34,21 +35,20 @@
 #include "reset_utils.h"
 #include "system_props.h"
 #include "traj.h"
+#include "tool_box.h"
 #include "vector.h"
 
 
-void Generate_Initial_Velocities(reax_system *system, real T )
+void Generate_Initial_Velocities( reax_system *system, real T )
 {
     int i;
     real scale, norm;
 
 
-    if( T <= 0.1 )
+    if ( T <= 0.1 )
     {
-        for ( i = 0; i < system->N; i++ )
-        {
+        for (i = 0; i < system->N; i++)
             rvec_MakeZero( system->atoms[i].v );
-        }
 
 #if defined(DEBUG)
         fprintf( stderr, "no random velocities...\n" );
@@ -56,73 +56,74 @@ void Generate_Initial_Velocities(reax_system *system, real T )
     }
     else
     {
-        for( i = 0; i < system->N; i++ )
+        for ( i = 0; i < system->N; i++ )
         {
             rvec_Random( system->atoms[i].v );
 
             norm = rvec_Norm_Sqr( system->atoms[i].v );
-            scale = SQRT( system->reaxprm.sbp[ system->atoms[i].type ].mass * 
-                    norm / (3.0 * K_B * T) );
+            scale = SQRT( system->reaxprm.sbp[ system->atoms[i].type ].mass *
+                          norm / (3.0 * K_B * T) );
 
-            rvec_Scale( system->atoms[i].v, 1.0/scale, system->atoms[i].v );
+            rvec_Scale( system->atoms[i].v, 1.0 / scale, system->atoms[i].v );
 
-            /*
-               fprintf( stderr, "v = %f %f %f\n", 
-               system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
-               fprintf( stderr, "scale = %f\n", scale );
-               fprintf( stderr, "v = %f %f %f\n",
-               system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
-             */
+            /*fprintf( stderr, "v = %f %f %f\n",
+            system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
+            fprintf( stderr, "scale = %f\n", scale );
+            fprintf( stderr, "v = %f %f %f\n",
+            system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);*/
         }
     }
 }
 
 
-void Init_System( reax_system *system, control_params *control, 
+void Init_System( reax_system *system, control_params *control,
         simulation_data *data )
 {
     int i;
     rvec dx;
 
-    if( !control->restart )
+    if ( !control->restart )
     {
         Reset_Atoms( system );
     }
 
     Compute_Total_Mass( system, data );
-
     Compute_Center_of_Mass( system, data, stderr );
 
     /* reposition atoms */
     // just fit the atoms to the periodic box
-    if( control->reposition_atoms == 0 )
+    if ( control->reposition_atoms == 0 )
     {
         rvec_MakeZero( dx );
     }
     // put the center of mass to the center of the box
-    else if( control->reposition_atoms == 1 )
+    else if ( control->reposition_atoms == 1 )
     {
         rvec_Scale( dx, 0.5, system->box.box_norms );
         rvec_ScaledAdd( dx, -1., data->xcm );
     }
     // put the center of mass to the origin
-    else if( control->reposition_atoms == 2 ) {
+    else if ( control->reposition_atoms == 2 )
+    {
         rvec_Scale( dx, -1., data->xcm );
     }
-    else {
+    else
+    {
         fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. Terminating...\n" );
         exit( UNKNOWN_OPTION );
     }
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
-        /*fprintf( stderr, "%6d%2d%8.3f%8.3f%8.3f\n", 
-          i, system->atoms[i].type, 
+        /*fprintf( stderr, "%6d%2d%8.3f%8.3f%8.3f\n",
+          i, system->atoms[i].type,
           system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );*/
     }
 
     /* Initialize velocities so that desired init T can be attained */
-    if( !control->restart || (control->restart && control->random_vel) )  {
+    if ( !control->restart || (control->restart && control->random_vel) )
+    {
         Generate_Initial_Velocities( system, control->T_init );
     }
 
@@ -130,96 +131,153 @@ void Init_System( reax_system *system, control_params *control,
 }
 
 
-void Init_Simulation_Data( reax_system *system, control_params *control, 
-        simulation_data *data, output_controls *out_control, 
-        evolve_function *Evolve )
+void Init_Simulation_Data( reax_system *system, control_params *control,
+        simulation_data *data, output_controls *out_control, evolve_function *Evolve )
 {
 
     Reset_Simulation_Data( data );
 
-    if( !control->restart )  
+    if ( !control->restart ) // not a restart: begin the step counters at zero
+    {
         data->step = data->prev_steps = 0;
+    }
 
-    switch( control->ensemble ) {
-        case NVE:
-            data->N_f = 3 * system->N;
-            *Evolve = Velocity_Verlet_NVE;
-            break;
+    switch ( control->ensemble ) // per ensemble: set data->N_f and the integrator stored in *Evolve
+    {
+    case NVE:
+        data->N_f = 3 * system->N;
+        *Evolve = Velocity_Verlet_NVE;
+        break;
 
 
-        case NVT:
-            data->N_f = 3 * system->N + 1;
-            //control->Tau_T = 100 * data->N_f * K_B * control->T_final;
-            if( !control->restart || (control->restart && control->random_vel) ) {
-                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
-                        data->N_f * K_B * control->T );
-                data->therm.v_xi = data->therm.G_xi * control->dt;
-                data->therm.v_xi_old = 0;
-                data->therm.xi = 0;
+    case NVT:
+        data->N_f = 3 * system->N + 1;
+        //control->Tau_T = 100 * data->N_f * K_B * control->T_final;
+        if ( !control->restart || (control->restart && control->random_vel) ) // same test Init_System uses before generating velocities
+        {
+            data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin -
+                                                 data->N_f * K_B * control->T );
+            data->therm.v_xi = data->therm.G_xi * control->dt;
+            data->therm.v_xi_old = 0;
+            data->therm.xi = 0;
 #if defined(DEBUG_FOCUS)
-                fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
-                        data->therm.G_xi, control->Tau_T, data->E_Kin, 
-                        data->N_f, data->therm.v_xi );
+            fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
+                     data->therm.G_xi, control->Tau_T, data->E_Kin,
+                     data->N_f, data->therm.v_xi );
 #endif
-            }
+        }
 
-            *Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
-            break;
+        *Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
+        break;
 
 
-        case NPT: // Anisotropic NPT
-            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
-            exit( UNKNOWN_OPTION );
-            data->N_f = 3 * system->N + 9;
-            if( !control->restart ) {
-                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
-                        data->N_f * K_B * control->T );
-                data->therm.v_xi = data->therm.G_xi * control->dt;
-                data->iso_bar.eps = 0.33333 * log(system->box.volume);
-                //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P));
-                //Compute_Pressure( system, data, workspace );
-            }
-            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
-            break;
+    case NPT: // Anisotropic NPT
+        fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
+        exit( UNKNOWN_OPTION ); // NOTE: the remaining statements of this case are never reached
+        data->N_f = 3 * system->N + 9;
+        if ( !control->restart )
+        {
+            data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin -
+                                                 data->N_f * K_B * control->T );
+            data->therm.v_xi = data->therm.G_xi * control->dt;
+            data->iso_bar.eps = 0.33333 * log(system->box.volume);
+            //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P));
+            //Compute_Pressure( system, data, workspace );
+        }
+        *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
+        break;
 
 
-        case sNPT: // Semi-Isotropic NPT
-            data->N_f = 3 * system->N + 4;
-            *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
-            break;
+    case sNPT: // Semi-Isotropic NPT
+        data->N_f = 3 * system->N + 4;
+        *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
+        break;
 
 
-        case iNPT: // Isotropic NPT
-            data->N_f = 3 * system->N + 2;
-            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
-            break;
+    case iNPT: // Isotropic NPT
+        data->N_f = 3 * system->N + 2;
+        *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
+        break;
 
-        case bNVT: //berendensen NVT
-            data->N_f = 3 * system->N + 1; 
-            *Evolve = Velocity_Verlet_Berendsen_NVT;
-            break;
+    case bNVT: // Berendsen NVT
+        data->N_f = 3 * system->N + 1;
+        *Evolve = Velocity_Verlet_Berendsen_NVT;
+        fprintf (stderr, " Initializing Velocity_Verlet_Berendsen_NVT .... \n");
+        break;
 
-        default:
-            break;
+    default: // any other ensemble value: data->N_f and *Evolve are not assigned here
+        break;
     }
 
     Compute_Kinetic_Energy( system, data );
 
-    /* init timing info for the host*/
+    /* init timing info: record the start time, then reset the per-phase timers and counters */
     data->timing.start = Get_Time( );
     data->timing.total = data->timing.start;
     data->timing.nbrs = 0;
     data->timing.init_forces = 0;
     data->timing.bonded = 0;
     data->timing.nonb = 0;
-    data->timing.QEq = 0;
-    data->timing.matvecs = 0;
+    data->timing.QEq = ZERO;
+    data->timing.QEq_sort_mat_rows = ZERO;
+    data->timing.pre_comp = ZERO;
+    data->timing.pre_app = ZERO;
+    data->timing.solver_iters = 0;
+    data->timing.solver_spmv = ZERO;
+    data->timing.solver_vector_ops = ZERO;
+    data->timing.solver_orthog = ZERO;
+    data->timing.solver_tri_solve = ZERO;
 }
 
 
-void Init_Workspace( reax_system *system, control_params *control, 
+/* Set control->Tap0..Tap7, the coefficients of Tap(r) = sum_k Tap_k * r^k,
+ * from swa = control->r_low and swb = control->r_cut. They expand, with
+ * u = (r - swa) / (swb - swa), 1 - 35 u^4 + 84 u^5 - 70 u^6 + 20 u^7.
+ * Warns on unusual cutoffs; exits with INVALID_INPUT if r_cut is negative. */
+void Init_Taper( control_params *control )
+{
+    real d1, d7, swa, swa2, swa3, swb, swb2, swb3;
+
+    swa = control->r_low;
+    swb = control->r_cut;
+    if ( fabs( swa ) > 0.01 )
+    {
+        fprintf( stderr, "Warning: non-zero value for lower Taper-radius cutoff\n" );
+    }
+
+    if ( swb < 0 )
+    {
+        fprintf( stderr, "Negative value for upper Taper-radius cutoff\n" );
+        exit( INVALID_INPUT );
+    }
+    else if ( swb < 5 )
+    {
+        fprintf( stderr, "Warning: low value for upper Taper-radius cutoff:%f\n", swb );
+    }
+
+    d1 = swb - swa;
+    d7 = POW( d1, 7.0 ); /* NOTE(review): zero if r_low == r_cut; not guarded */
+    swa2 = SQR( swa );
+    swa3 = CUBE( swa );
+    swb2 = SQR( swb );
+    swb3 = CUBE( swb );
+
+    control->Tap7 =  20.0 / d7;
+    control->Tap6 = -70.0 * (swa + swb) / d7;
+    control->Tap5 =  84.0 * (swa2 + 3.0 * swa * swb + swb2) / d7;
+    control->Tap4 = -35.0 * (swa3 + 9.0 * swa2 * swb + 9.0 * swa * swb2 + swb3 ) / d7;
+    control->Tap3 = 140.0 * (swa3 * swb + 3.0 * swa2 * swb2 + swa * swb3 ) / d7;
+    control->Tap2 = -210.0 * (swa3 * swb2 + swa2 * swb3) / d7;
+    control->Tap1 = 140.0 * swa3 * swb3 / d7;
+    /* constant term: the swa * swb^6 term of (swb - swa)^7 carries a minus sign */
+    control->Tap0 = (-35.0 * swa3 * swb2 * swb2 + 21.0 * swa2 * swb3 * swb2 -
+                     7.0 * swa * swb3 * swb3 + swb3 * swb3 * swb ) / d7;
+}
+
+
+void Init_Workspace( reax_system *system, control_params *control,
         static_storage *workspace )
-{  
+{
     int i;
 
     /* Allocate space for hydrogen bond list */
@@ -231,35 +289,27 @@ void Init_Workspace( reax_system *system, control_params *control,
     workspace->Deltap_boc       = (real *) malloc( system->N * sizeof( real ) );
     workspace->dDeltap_self     = (rvec *) malloc( system->N * sizeof( rvec ) );
 
-    workspace->Delta          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Delta_lp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Delta            = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Delta_lp         = (real *) malloc( system->N * sizeof( real ) );
     workspace->Delta_lp_temp    = (real *) malloc( system->N * sizeof( real ) );
-    workspace->dDelta_lp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->dDelta_lp        = (real *) malloc( system->N * sizeof( real ) );
     workspace->dDelta_lp_temp   = (real *) malloc( system->N * sizeof( real ) );
     workspace->Delta_e          = (real *) malloc( system->N * sizeof( real ) );
     workspace->Delta_boc        = (real *) malloc( system->N * sizeof( real ) );
-    workspace->nlp          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->nlp_temp          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Clp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->nlp              = (real *) malloc( system->N * sizeof( real ) );
+    workspace->nlp_temp         = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Clp              = (real *) malloc( system->N * sizeof( real ) );
     workspace->CdDelta          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->vlpex          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->vlpex            = (real *) malloc( system->N * sizeof( real ) );
 
     /* QEq storage */
-    //workspace->H        = NULL;
-    //workspace->L        = NULL;
-    //workspace->U        = NULL;
-    //
-    workspace->H.start        = NULL;
-    workspace->L.start        = NULL;
-    workspace->U.start        = NULL;
-
-    workspace->H.entries         = NULL;
-    workspace->L.entries         = NULL;
-    workspace->U.entries        = NULL;
-
+    workspace->H        = NULL;
+    workspace->H_sp     = NULL;
+    workspace->L        = NULL;
+    workspace->U        = NULL;
+    workspace->Hdia_inv = NULL;
     workspace->droptol  = (real *) calloc( system->N, sizeof( real ) );
     workspace->w        = (real *) calloc( system->N, sizeof( real ) );
-    workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) );
     workspace->b        = (real *) calloc( system->N * 2, sizeof( real ) );
     workspace->b_s      = (real *) calloc( system->N, sizeof( real ) );
     workspace->b_t      = (real *) calloc( system->N, sizeof( real ) );
@@ -273,25 +323,27 @@ void Init_Workspace( reax_system *system, control_params *control,
     // workspace->s_oldest = (real *) calloc( system->N, sizeof( real ) );
     // workspace->t_oldest = (real *) calloc( system->N, sizeof( real ) );
 
-    for( i = 0; i < system->N; ++i ) {
-        workspace->Hdia_inv[i] = 1./system->reaxprm.sbp[system->atoms[i].type].eta;
+    for ( i = 0; i < system->N; ++i )
+    {
         workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
         workspace->b_t[i] = -1.0;
 
         workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
-        workspace->b[i+system->N] = -1.0;
+        workspace->b[i + system->N] = -1.0;
     }
 
+    //TODO: conditionally allocate based on solver selection
     /* GMRES storage */
-    workspace->y  = (real *)  calloc( RESTART+1, sizeof( real ) );
-    workspace->z  = (real *)  calloc( RESTART+1, sizeof( real ) );
-    workspace->g  = (real *)  calloc( RESTART+1, sizeof( real ) );
-    workspace->hs = (real *)  calloc( RESTART+1, sizeof( real ) );
-    workspace->hc = (real *)  calloc( RESTART+1, sizeof( real ) );
-
-    workspace->rn = (real *) calloc( (RESTART+1)*system->N*2, sizeof( real) );
-    workspace->v  = (real *) calloc( (RESTART+1)*system->N, sizeof( real) );
-    workspace->h  = (real *) calloc( (RESTART+1)*(RESTART+1), sizeof( real) );
+    workspace->y  = (real *)  calloc( RESTART + 1, sizeof( real ) );
+    //TODO: unused?
+    workspace->z  = (real *)  calloc( RESTART + 1, sizeof( real ) );
+    workspace->g  = (real *)  calloc( RESTART + 1, sizeof( real ) );
+    workspace->h  = (real *)  calloc( (RESTART + 1) * (RESTART + 1), sizeof( real ) );
+    workspace->hs = (real *)  calloc( RESTART + 1, sizeof( real ) );
+    workspace->hc = (real *)  calloc( RESTART + 1, sizeof( real ) );
+    //TODO: unused?
+    workspace->rn = (real *)  calloc( (RESTART + 1) * system->N * 2, sizeof( real ) );
+    workspace->v  = (real *)  calloc( (RESTART + 1) * system->N, sizeof( real ) );
 
     /* CG storage */
     workspace->r = (real *) calloc( system->N, sizeof( real ) );
@@ -304,20 +356,25 @@ void Init_Workspace( reax_system *system, control_params *control,
     workspace->f_old = (rvec *) malloc( system->N * sizeof( rvec ) );
     workspace->v_const = (rvec *) malloc( system->N * sizeof( rvec ) );
 
-
     /* storage for analysis */
-    if( control->molec_anal || control->diffusion_coef )
+    if ( control->molec_anal || control->diffusion_coef )
     {
         workspace->mark = (int *) calloc( system->N, sizeof(int) );
         workspace->old_mark = (int *) calloc( system->N, sizeof(int) );
     }
-    else 
+    else
+    {
         workspace->mark = workspace->old_mark = NULL;
+    }
 
-    if( control->diffusion_coef )
+    if ( control->diffusion_coef )
+    {
         workspace->x_old = (rvec *) calloc( system->N, sizeof( rvec ) );
-    else workspace->x_old = NULL;
-
+    }
+    else
+    {
+        workspace->x_old = NULL;
+    }
 
 #ifdef TEST_FORCES
     workspace->dDelta = (rvec *) malloc( system->N * sizeof( rvec ) );
@@ -344,9 +401,14 @@ void Init_Workspace( reax_system *system, control_params *control,
     workspace->realloc.gcell_atoms = -1;
 
     Reset_Workspace( system, workspace );
+
+    /* Initialize Taper function */
+    Init_Taper( control );
 }
 
-void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *data, list *slist, int N)
+
+void compare_far_neighbors( int *test, int *start, int *end,
+        far_neighbor_data *data, list *slist, int N )
 {
     int index = 0;
     int count = 0;
@@ -369,16 +431,19 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
        }
      */
 
-
-    for (i = 0; i < N; i++){
-        index = Start_Index (i, slist);
+    for (i = 0; i < N; i++)
+    {
+        index = Start_Index( i, slist );
         //fprintf (stderr, "GPU : Neighbors of atom --> %d (start: %d , end: %d )\n", i, start[i], end[i]);
 
-
-        for (j = start[i]; j < end[i]; j++){
+        for (j = start[i]; j < end[i]; j++)
+        {
             gpu = data[j];
 
-            if (i < data[j].nbr) continue;
+            if (i < data[j].nbr)
+            {
+                continue;
+            }
             /*
                if (i < data[j].nbr) {
             //fprintf (stderr, " atom %d and neighbor %d @ index %d\n", i, data[j].nbr, j);
@@ -386,7 +451,6 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
             int dest = i;
             int x;
 
-
             for (x = start[src]; x < end[src]; x++) {
             if (data[x].nbr != dest) continue;
 
@@ -431,9 +495,11 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
             cpu = slist->select.far_nbr_list[index];
             //if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ){
             //if ( (gpu->d != cpu->d) ){
-            if (  (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ||
-                    (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
-                    (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
+            if (  (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d)
+                    ||(cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1])
+                    || (cpu.dvec[2] != gpu.dvec[2]) || (cpu.rel_box[0] != gpu.rel_box[0])
+                    || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2]))
+            {
                 //if ( (gpu.dvec[0] != i) || (gpu.dvec[1] != i) ||(gpu.dvec[2] != i) ||
                 //        (gpu.rel_box[0] != i) || (gpu.rel_box[1] != i) ||(gpu.rel_box[2] != i) ) {
                 //if (memcmp (&gpu, &cpu, FAR_NEIGHBOR_SIZE - RVEC_SIZE - INT_SIZE )){
@@ -457,16 +523,16 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
                 count ++;
             }
 
-        //fprintf (stderr, "GPU (neighbor %d , d %d )\n", gpu->nbr, gpu->d);
-        index ++;
+            //fprintf (stderr, "GPU (neighbor %d , d %d )\n", gpu->nbr, gpu->d);
+            index ++;
         }
 
-        if (index != End_Index (i, slist))
+        if (index != End_Index( i, slist ))
         {
             fprintf( stderr,
                 "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n",
                  i, index, Start_Index (i, slist), End_Index(i, slist),
-                    start[i], end[i]);
+                    start[i], end[i] );
             exit( 10 );
         }
     }
@@ -518,112 +584,118 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
 }
 
 
-void Init_Lists( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+void Init_Lists( reax_system *system, control_params *control,
+                 simulation_data *data, static_storage *workspace,
+                 list **lists, output_controls *out_control )
 {
     int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop;
     int *hb_top, *bond_top;
 
-    real t_start, t_elapsed;
-
     num_nbrs = Estimate_NumNeighbors( system, control, workspace, lists );
-
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, "Serial NumNeighbors ---> %d \n", num_nbrs);
-#endif
-
-    if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS ) ) {
+    if ( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists) + FAR_NBRS) )
+    {
         fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-        exit( INIT_ERR );
+        exit( CANNOT_INITIALIZE );
     }
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
-            num_nbrs * sizeof(far_neighbor_data) / (1024*1024) );
-#endif
-
-    t_start = Get_Time ();
-    Generate_Neighbor_Lists(system,control,data,workspace,lists,out_control);
-    t_elapsed = Get_Timing_Info ( t_start );
-
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " Timing Generate Neighbors %lf \n", t_elapsed );
+    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n",
+             num_nbrs * sizeof(far_neighbor_data) / (1024 * 1024) );
 #endif
 
+    Generate_Neighbor_Lists(system, control, data, workspace, lists, out_control);
     Htop = 0;
     hb_top = (int*) calloc( system->N, sizeof(int) );
     bond_top = (int*) calloc( system->N, sizeof(int) );
     num_3body = 0;
-    Estimate_Storage_Sizes( system, control, lists, 
+    Estimate_Storage_Sizes( system, control, lists,
             &Htop, hb_top, bond_top, &num_3body );
 
-    Allocate_Matrix( &(workspace->H), system->N, Htop );
-
+    if ( Allocate_Matrix( workspace->H, system->N, Htop ) == FAILURE )
+    {
+        fprintf( stderr, "Not enough space for init matrices. Terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+    /* TODO: better estimate for H_sp?
+     *   If so, need to refactor Estimate_Storage_Sizes
+     *   to use various cut-off distances as parameters
+     *   (non-bonded, hydrogen, 3body, etc.) */
+    if ( Allocate_Matrix( workspace->H_sp, system->N, Htop ) == FAILURE )
+    {
+        fprintf( stderr, "Not enough space for init matrices. Terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "estimated storage - Htop: %d\n", Htop );
-    fprintf( stderr, "memory allocated: H = %ldMB\n", 
-            Htop * sizeof(sparse_matrix_entry) / (1024*1024) );
+    fprintf( stderr, "memory allocated: H = %ldMB\n",
+            Htop * sizeof(sparse_matrix_entry) / (1024 * 1024) );
 #endif
 
     workspace->num_H = 0;
-    if( control->hb_cut > 0 ) {
+    if ( control->hb_cut > 0 )
+    {
         /* init H indexes */
-        for( i = 0; i < system->N; ++i )
-            if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ) // H atom
+        for ( i = 0; i < system->N; ++i )
+        {
+            // H atom
+            if ( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 )
+            {
                 workspace->hbond_index[i] = workspace->num_H++;
-            else workspace->hbond_index[i] = -1;
-
-        Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index, 
-                hb_top, (*lists)+HBONDS );
-        num_hbonds = hb_top[system->N-1];
+            }
+            else
+            {
+                workspace->hbond_index[i] = -1;
+            }
+        }
 
-#ifdef __DEBUG_CUDA__
-        fprintf( stderr, "Serial num_hbonds: %d\n", num_hbonds );
-#endif
+        Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index,
+                hb_top, (*lists) + HBONDS );
+        num_hbonds = hb_top[system->N - 1];
 
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "estimated storage - num_hbonds: %d\n", num_hbonds );
-        fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
-                num_hbonds * sizeof(hbond_data) / (1024*1024) );
+        fprintf( stderr, "memory allocated: hbonds = %ldMB\n",
+                 num_hbonds * sizeof(hbond_data) / (1024 * 1024) );
 #endif
     }
 
     /* bonds list */
-    Allocate_Bond_List( system->N, bond_top, (*lists)+BONDS );
-    num_bonds = bond_top[system->N-1];
+    Allocate_Bond_List( system->N, bond_top, (*lists) + BONDS );
+    num_bonds = bond_top[system->N - 1];
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "estimated storage - num_bonds: %d\n", num_bonds );
-    fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
-            num_bonds * sizeof(bond_data) / (1024*1024) );
+    fprintf( stderr, "memory allocated: bonds = %ldMB\n",
+             num_bonds * sizeof(bond_data) / (1024 * 1024) );
 #endif
 
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " host num_3body : %d \n", num_3body);
-    fprintf (stderr, " host num_bonds : %d \n", num_bonds);
-#endif
+//fprintf (stderr, " **** sizeof 3 body : %d \n", sizeof (three_body_interaction_data));
+//fprintf (stderr, " **** num_3body : %d \n", num_3body);
+//fprintf (stderr, " **** num_bonds : %d \n", num_bonds);
 
     /* 3bodies list */
-    if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES )) {
+    if (!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists) + THREE_BODIES))
+    {
         fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-        exit( INIT_ERR );
+        exit( CANNOT_INITIALIZE );
     }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "estimated storage - num_3body: %d\n", num_3body );
-    fprintf( stderr, "memory allocated: 3-body = %ldMB\n", 
-            num_3body * sizeof(three_body_interaction_data) / (1024*1024) );
+    fprintf( stderr, "memory allocated: 3-body = %ldMB\n",
+             num_3body * sizeof(three_body_interaction_data) / (1024 * 1024) );
 #endif
 
 #ifdef TEST_FORCES
-    if(!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA )) {
+    if (!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA ))
+    {
         fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" );
-        exit( INIT_ERR );
+        exit( CANNOT_INITIALIZE );
     }
 
-    if( !Make_List( num_bonds, num_bonds*MAX_BONDS*3, TYP_DBO, (*lists)+DBO ) ) {
+    if ( !Make_List( num_bonds, num_bonds * MAX_BONDS * 3, TYP_DBO, (*lists) + DBO ) )
+    {
         fprintf( stderr, "Problem in initializing dBO list. Terminating!\n" );
-        exit( INIT_ERR );
+        exit( CANNOT_INITIALIZE );
     }
 #endif
 
@@ -632,83 +704,91 @@ void Init_Lists( reax_system *system, control_params *control,
 }
 
 
-void Init_Out_Controls(reax_system *system, control_params *control, 
+void Init_Out_Controls(reax_system *system, control_params *control,
         static_storage *workspace, output_controls *out_control)
 {
     char temp[1000];
 
     /* Init trajectory file */
-    if( out_control->write_steps > 0 ) { 
+    if ( out_control->write_steps > 0 )
+    {
         strcpy( temp, control->sim_name );
         strcat( temp, ".trj" );
         out_control->trj = fopen( temp, "w" );
         out_control->write_header( system, control, workspace, out_control );
     }
 
-    if( out_control->energy_update_freq > 0 ) {
+    if ( out_control->energy_update_freq > 0 )
+    {
         /* Init out file */
         strcpy( temp, control->sim_name );
         strcat( temp, ".out" );
         out_control->out = fopen( temp, "w" );
         fprintf( out_control->out, "%-6s%16s%16s%16s%11s%11s%13s%13s%13s\n",
-                "step", "total energy", "poten. energy", "kin. energy", 
-                "temp.", "target", "volume", "press.", "target" );
+                 "step", "total energy", "poten. energy", "kin. energy",
+                 "temp.", "target", "volume", "press.", "target" );
         fflush( out_control->out );
 
         /* Init potentials file */
         strcpy( temp, control->sim_name );
         strcat( temp, ".pot" );
         out_control->pot = fopen( temp, "w" );
-        fprintf( out_control->pot, 
-                "%-6s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s\n",
-                "step", "ebond", "eatom", "elp", "eang", "ecoa", "ehb", 
-                "etor", "econj", "evdw","ecoul", "epol" );
+        fprintf( out_control->pot,
+                 "%-6s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s\n",
+                 "step", "ebond", "eatom", "elp", "eang", "ecoa", "ehb",
+                 "etor", "econj", "evdw", "ecoul", "epol" );
         fflush( out_control->pot );
 
         /* Init log file */
         strcpy( temp, control->sim_name );
         strcat( temp, ".log" );
         out_control->log = fopen( temp, "w" );
-        fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s\n", 
-                "step", "total", "neighbors", "init", "bonded", 
-                "nonbonded", "QEq", "matvec" );
+        fprintf( out_control->log, "%-6s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
+                 "step", "total", "neighbors", "init", "bonded",
+                 "nonbonded", "QEq", "QEq Sort", "S iters", "Pre Comp", "Pre App",
+                 "S spmv", "S vec ops", "S orthog", "S tsolve" );
     }
 
     /* Init pressure file */
-    if( control->ensemble == NPT || 
-            control->ensemble == iNPT || 
-            control->ensemble == sNPT ) {
+    if ( control->ensemble == NPT ||
+            control->ensemble == iNPT ||
+            control->ensemble == sNPT )
+    {
         strcpy( temp, control->sim_name );
         strcat( temp, ".prs" );
         out_control->prs = fopen( temp, "w" );
         fprintf( out_control->prs, "%-6s%13s%13s%13s%13s%13s%13s%13s%13s\n",
-                "step", "norm_x", "norm_y", "norm_z", 
-                "press_x", "press_y", "press_z", "target_p", "volume" );
+                 "step", "norm_x", "norm_y", "norm_z",
+                 "press_x", "press_y", "press_z", "target_p", "volume" );
         fflush( out_control->prs );
     }
 
     /* Init molecular analysis file */
-    if( control->molec_anal ) {
+    if ( control->molec_anal )
+    {
         sprintf( temp, "%s.mol", control->sim_name );
         out_control->mol = fopen( temp, "w" );
-        if( control->num_ignored ) {
+        if ( control->num_ignored )
+        {
             sprintf( temp, "%s.ign", control->sim_name );
             out_control->ign = fopen( temp, "w" );
-        } 
+        }
     }
 
     /* Init electric dipole moment analysis file */
-    if( control->dipole_anal ) {
+    if ( control->dipole_anal )
+    {
         strcpy( temp, control->sim_name );
         strcat( temp, ".dpl" );
         out_control->dpl = fopen( temp, "w" );
-        fprintf( out_control->dpl, 
-                "Step      Molecule Count  Avg. Dipole Moment Norm\n" );
+        fprintf( out_control->dpl,
+                 "Step      Molecule Count  Avg. Dipole Moment Norm\n" );
         fflush( out_control->dpl );
     }
 
     /* Init diffusion coef analysis file */
-    if( control->diffusion_coef ) {
+    if ( control->diffusion_coef )
+    {
         strcpy( temp, control->sim_name );
         strcat( temp, ".drft" );
         out_control->drft = fopen( temp, "w" );
@@ -836,21 +916,22 @@ void Init_Out_Controls(reax_system *system, control_params *control,
 #endif
 
     /* Error handling */
-    /* if ( out_control->out == NULL || out_control->pot == NULL || 
-       out_control->log == NULL || out_control->mol == NULL || 
-       out_control->dpl == NULL || out_control->drft == NULL ||       
+    /* if ( out_control->out == NULL || out_control->pot == NULL ||
+       out_control->log == NULL || out_control->mol == NULL ||
+       out_control->dpl == NULL || out_control->drft == NULL ||
        out_control->pdb == NULL )
        {
        fprintf( stderr, "FILE OPEN ERROR. TERMINATING..." );
-       exit( CANNOT_OPEN_OUTFILE );
+       exit( CANNOT_OPEN_FILE );
        }*/
 }
 
 
-void Initialize(reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, list **lists, 
+void Initialize(reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
         output_controls *out_control, evolve_function *Evolve)
 {
+    real start, end;
     Randomize();
 
     Init_System( system, control, data );
@@ -870,10 +951,16 @@ void Initialize(reax_system *system, control_params *control,
     Init_Force_Test_Functions( );
 #endif
 
-    if( control->tabulate )
+    if ( control->tabulate )
+    {
+        start = Get_Time ();
         Make_LR_Lookup_Table( system, control );
+        end = Get_Timing_Info (start);
+
+        //fprintf (stderr, "Time for LR Lookup Table calculation is %f \n", end );
+    }
 
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "data structures have been initialized...\n" ); 
+    fprintf( stderr, "data structures have been initialized...\n" );
 #endif
 }
diff --git a/PuReMD-GPU/src/init_md.h b/PuReMD-GPU/src/init_md.h
index 8c23806594a8f2b107ddb884efbf68e7b5fe27ff..947d81e6e50e96f325742c6d024c6011a900152c 100644
--- a/PuReMD-GPU/src/init_md.h
+++ b/PuReMD-GPU/src/init_md.h
@@ -31,10 +31,10 @@ extern "C"  {
 void Initialize( reax_system*, control_params*, simulation_data*,
         static_storage*, list**, output_controls*, evolve_function* );
 
-void Generate_Initial_Velocities(reax_system *, real );
+void Generate_Initial_Velocities( reax_system *, real );
 
-void Init_Out_Controls(reax_system *, control_params *, static_storage *,
-        output_controls *);
+void Init_Out_Controls( reax_system *, control_params *, static_storage *,
+        output_controls * );
 
 #ifdef __cplusplus
 }
diff --git a/PuReMD-GPU/src/integrate.c b/PuReMD-GPU/src/integrate.c
index 482a9c89a302c052e9ac44ae2de446c61b1c6a3e..d65406f8824697a396c65337bd197ad771320e81 100644
--- a/PuReMD-GPU/src/integrate.c
+++ b/PuReMD-GPU/src/integrate.c
@@ -1,32 +1,32 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
 #include "integrate.h"
-
 #include "allocate.h"
 #include "box.h"
 #include "forces.h"
 #include "grid.h"
 #include "neighbors.h"
 #include "print_utils.h"
-#include "QEq.h"
+#include "qeq.h"
 #include "reset_utils.h"
 #include "restart.h"
 #include "system_props.h"
@@ -34,9 +34,10 @@
 #include "list.h"
 
 
-void Velocity_Verlet_NVE(reax_system* system, control_params* control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+
+void Velocity_Verlet_NVE(reax_system* system, control_params* control,
+                         simulation_data *data, static_storage *workspace,
+                         list **lists, output_controls *out_control )
 {
     int i, steps, renbr;
     real inv_m, dt, dt_sqr;
@@ -46,53 +47,50 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control,
     dt_sqr = SQR(dt);
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
-
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "step%d: ", data->step );
 #endif
 
-    for( i = 0; i < system->N; i++ ) {
+    for ( i = 0; i < system->N; i++ )
+    {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
-        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
-                0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledSum( dx, dt, system->atoms[i].v,
+                        0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
         Inc_on_T3( system->atoms[i].x, dx, &( system->box ) );
 
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
     }
-
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet1 - ");
 #endif
 
     Reallocate( system, workspace, lists, renbr );
     Reset( system, control, data, workspace, lists );
-    if( renbr )
-    {
-        Generate_Neighbor_Lists( system, control, data, workspace, 
-                lists, out_control );  
-    }
+    if ( renbr )
+        Generate_Neighbor_Lists( system, control, data, workspace,
+                                 lists, out_control );
     Compute_Forces( system, control, data, workspace, lists, out_control );
 
-    for( i = 0; i < system->N; i++ )
+    for ( i = 0; i < system->N; i++ )
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
     }
-
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet2\n");
 #endif
 }
 
 
-void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, 
-        control_params* control, 
-        simulation_data *data, 
-        static_storage *workspace, 
-        list **lists, 
+
+void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
+        control_params* control,
+        simulation_data *data,
+        static_storage *workspace,
+        list **lists,
         output_controls *out_control )
 {
     int i, itr, steps, renbr;
@@ -106,22 +104,17 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     therm = &( data->therm );
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
-
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "step%d: ", data->step );
 #endif
 
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " Entering Velocity_Verlet_Nose_Hoover_NVT_Klein:  coef to update velocity --> %6.10f\n", therm->v_xi_old);
-#endif
-
     /* Compute x(t + dt) and copy old forces */
-    for (i=0; i < system->N; i++)
+    for (i = 0; i < system->N; i++)
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
         rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, system->atoms[i].v,
-                0.5 * dt_sqr * inv_m * -F_CONV, system->atoms[i].f );
+                        0.5 * dt_sqr * inv_m * -F_CONV, system->atoms[i].f );
 
         Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
 
@@ -129,105 +122,88 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     }
     /* Compute xi(t + dt) */
     therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
-
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet1 - " );
 #endif
 
     Reallocate( system, workspace, lists, renbr );
     Reset( system, control, data, workspace, lists );
-
-    if( renbr )
-    {
-        Generate_Neighbor_Lists( system, control, data, workspace, 
-                lists, out_control );
-    }
-
+    if ( renbr )
+        Generate_Neighbor_Lists( system, control, data, workspace,
+                                 lists, out_control );
     /* Calculate Forces at time (t + dt) */
-    Compute_Forces( system,control,data, workspace, lists, out_control );
+    Compute_Forces( system, control, data, workspace, lists, out_control );
 
     /* Compute iteration constants for each atom's velocity */
-    for( i = 0; i < system->N; ++i )
+    for ( i = 0; i < system->N; ++i )
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
-        rvec_Scale( workspace->v_const[i], 
-                1.0 - 0.5 * dt * therm->v_xi, system->atoms[i].v );
-        rvec_ScaledAdd( workspace->v_const[i], 
-                0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] );
-        rvec_ScaledAdd( workspace->v_const[i], 
-                0.5 * dt * inv_m * -F_CONV, system->atoms[i].f );
+        rvec_Scale( workspace->v_const[i],
+                    1.0 - 0.5 * dt * therm->v_xi, system->atoms[i].v );
+        rvec_ScaledAdd( workspace->v_const[i],
+                        0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] );
+        rvec_ScaledAdd( workspace->v_const[i],
+                        0.5 * dt * inv_m * -F_CONV, system->atoms[i].f );
 #if defined(DEBUG)
-        fprintf( stderr, "atom%d: inv_m=%f, C1=%f, C2=%f, v_const=%f %f %f\n", 
-                i, inv_m, 1.0 - 0.5 * dt * therm->v_xi, 
-                0.5 * dt * inv_m * -F_CONV, workspace->v_const[i][0], 
-                workspace->v_const[i][1], workspace->v_const[i][2] );  
+        fprintf( stderr, "atom%d: inv_m=%f, C1=%f, C2=%f, v_const=%f %f %f\n",
+                 i, inv_m, 1.0 - 0.5 * dt * therm->v_xi,
+                 0.5 * dt * inv_m * -F_CONV, workspace->v_const[i][0],
+                 workspace->v_const[i][1], workspace->v_const[i][2] );
 #endif
     }
 
     v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
     E_kin_new = G_xi_new = v_xi_old = 0;
     itr = 0;
-    do {
-        itr++;      
+    do
+    {
+        itr++;
 
         /* new values become old in this iteration */
         v_xi_old = v_xi_new;
         coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
         E_kin_new = 0;
-
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, " *********** coef to update velocity --> %6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old);
-        //print_sys_atoms (system);
-#endif
-
-        for( i = 0; i < system->N; ++i )
+        for ( i = 0; i < system->N; ++i )
         {
             rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] );
 
-            E_kin_new += ( 0.5*system->reaxprm.sbp[system->atoms[i].type].mass * 
-                    rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
+            E_kin_new += ( 0.5 * system->reaxprm.sbp[system->atoms[i].type].mass *
+                           rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
 #if defined(DEBUG)
-            fprintf( stderr, "itr%d-atom%d: coef_v = %f, v_xi_old = %f\n", 
-                    itr, i, coef_v, v_xi_old );
+            fprintf( stderr, "itr%d-atom%d: coef_v = %f, v_xi_old = %f\n",
+                     itr, i, coef_v, v_xi_old );
 #endif
         }
 
-        G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - 
-                data->N_f * K_B * control->T );
+        G_xi_new = control->Tau_T * ( 2.0 * E_kin_new -
+                                      data->N_f * K_B * control->T );
         v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
-
 #if defined(DEBUG)
         fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n",
-                itr, G_xi_new, v_xi_new, v_xi_old );
+                 itr, G_xi_new, v_xi_new, v_xi_old );
 #endif
     }
-    while( fabs(v_xi_new - v_xi_old ) > 1e-5 );
+    while ( fabs(v_xi_new - v_xi_old ) > 1e-5 );
 
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " Iteration Count in NVE --> %d \n", itr );
-#endif
-
-#ifndef __BUILD_DEBUG__
     therm->v_xi_old = therm->v_xi;
     therm->v_xi = v_xi_new;
-    therm->G_xi = G_xi_new;  
-#endif 
-
-#if defined(DEBUG_FOCUS)  
-    fprintf( stderr,"vel scale\n" );
-#endif 
+    therm->G_xi = G_xi_new;
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "vel scale\n" );
+#endif
 }
 
 
-/* uses Berendsen-type coupling for both T and P. 
-   All box dimensions are scaled by the same amount, 
+
+/* uses Berendsen-type coupling for both T and P.
+   All box dimensions are scaled by the same amount,
    there is no change in the angles between axes. */
-void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system, 
-        control_params* control, 
+void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system,
+        control_params* control,
         simulation_data *data,
-        static_storage *workspace, 
-        list **lists, 
+        static_storage *workspace,
+        list **lists,
         output_controls *out_control )
 {
     int i, steps, renbr;
@@ -237,94 +213,102 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system,
     dt = control->dt;
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
-
 #if defined(DEBUG_FOCUS)
-    //fprintf( out_control->prs, 
-    //         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n", 
+    //fprintf( out_control->prs,
+    //         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n",
     //control->Tau_T, control->Tau_P, dt / control->Tau_T, dt / control->Tau_P );
     fprintf( stderr, "step %d: ", data->step );
 #endif
 
     /* velocity verlet, 1st part */
-    for( i = 0; i < system->N; i++ )
+    for ( i = 0; i < system->N; i++ )
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
         /* Compute x(t + dt) */
-        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
-                0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f );
+        rvec_ScaledSum( dx, dt, system->atoms[i].v,
+                        0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f );
         Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
         /* Compute v(t + dt/2) */
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * -F_CONV * inv_m * dt, system->atoms[i].f );
-        /*fprintf( stderr, "%6d   %15.8f %15.8f %15.8f   %15.8f %15.8f %15.8f\n", 
-          workspace->orig_id[i], 
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * -F_CONV * inv_m * dt, system->atoms[i].f );
+        /*fprintf( stderr, "%6d   %15.8f %15.8f %15.8f   %15.8f %15.8f %15.8f\n",
+          workspace->orig_id[i],
           system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2],
-          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0], 
-          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], 
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0],
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1],
           0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */
     }
-
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet1 - " );
 #endif
 
-    Reallocate( system, workspace, lists, renbr );  
+    Reallocate( system, workspace, lists, renbr );
     Reset( system, control, data, workspace, lists );
-    if( renbr ) {
+    if ( renbr )
+    {
         Update_Grid( system );
         Generate_Neighbor_Lists( system, control, data, workspace,
-                lists, out_control );
+                                 lists, out_control );
     }
     Compute_Forces( system, control, data, workspace, lists, out_control );
 
     /* velocity verlet, 2nd part */
-    for( i = 0; i < system->N; i++ ) {
+    for ( i = 0; i < system->N; i++ )
+    {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
         /* Compute v(t + dt) */
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
-        /* fprintf( stderr, "%6d   %15f %15f %15f   %15.8f %15.8f %15.8f\n", 
-           workspace->orig_id[i], 
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        /* fprintf( stderr, "%6d   %15f %15f %15f   %15.8f %15.8f %15.8f\n",
+           workspace->orig_id[i],
            system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
-           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0], 
-           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1], 
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0],
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1],
            0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[2] );*/
     }
-    //Compute_Kinetic_Energy( system, data );   
+    //TODO: Compute_Kinetic_Energy was commented out in the GPU version; find out why before enabling it when HAVE_CUDA is defined
+#ifndef HAVE_CUDA
+    Compute_Kinetic_Energy( system, data );
+#endif
     Compute_Pressure_Isotropic( system, control, data, out_control );
 
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet2 - " );
 #endif
 
     /* pressure scaler */
     mu = POW( 1.0 + (dt / control->Tau_P[0]) * (data->iso_bar.P - control->P[0]),
-            1.0 / 3 );
-    if( mu < MIN_dV ) 
+              1.0 / 3 );
+    if ( mu < MIN_dV )
         mu = MIN_dV;
-    else if( mu > MAX_dV )
+    else if ( mu > MAX_dV )
         mu = MAX_dV;
 
     /* temperature scaler */
     lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-    if( lambda < MIN_dT )
+    if ( lambda < MIN_dT )
         lambda = MIN_dT;
     else if (lambda > MAX_dT )
         lambda = MAX_dT;
     lambda = SQRT( lambda );
 
     /* Scale velocities and positions at t+dt */
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         rvec_Scale( system->atoms[i].v, lambda, system->atoms[i].v );
-        /* IMPORTANT: What Adri does with scaling positions first to 
-           unit coordinates and then back to cartesian coordinates essentially 
-           is scaling the coordinates with mu^2. However, this causes unphysical 
+        /* IMPORTANT: What Adri does with scaling positions first to
+           unit coordinates and then back to cartesian coordinates essentially
+           is scaling the coordinates with mu^2. However, this causes unphysical
            modifications on the system because box dimensions
            are being scaled with mu! We need to discuss this with Adri! */
         rvec_Scale( system->atoms[i].x, mu, system->atoms[i].x );
     }
-    //Compute_Kinetic_Energy( system, data );
-#if defined(DEBUG_FOCUS)  
+    //TODO: Compute_Kinetic_Energy was commented out in the GPU version; find out why before enabling it when HAVE_CUDA is defined
+#ifndef HAVE_CUDA
+    Compute_Kinetic_Energy( system, data );
+#endif
+
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "scaling - " );
 #endif
 
@@ -335,14 +319,14 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system,
 }
 
 
-/* uses Berendsen-type coupling for both T and P. 
-   All box dimensions are scaled by the same amount, 
+/* uses Berendsen-type coupling for both T and P.
+   Each box dimension is scaled by its own factor mu[d],
    there is no change in the angles between axes. */
-void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system, 
-        control_params* control, 
+void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system,
+        control_params* control,
         simulation_data *data,
-        static_storage *workspace, 
-        list **lists, 
+        static_storage *workspace,
+        list **lists,
         output_controls *out_control )
 {
     int i, d, steps, renbr;
@@ -352,120 +336,139 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system,
     dt = control->dt;
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
-
 #if defined(DEBUG_FOCUS)
-    //fprintf( out_control->prs, 
-    //         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n", 
+    //fprintf( out_control->prs,
+    //         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n",
     //control->Tau_T, control->Tau_P, dt / control->Tau_T, dt / control->Tau_P );
     fprintf( stderr, "step %d: ", data->step );
 #endif
 
     /* velocity verlet, 1st part */
-    for( i = 0; i < system->N; i++ ) {
-        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; 
+    for ( i = 0; i < system->N; i++ )
+    {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
         /* Compute x(t + dt) */
-        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
-                0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f );
+        rvec_ScaledSum( dx, dt, system->atoms[i].v,
+                        0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f );
         Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
         /* Compute v(t + dt/2) */
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * -F_CONV * inv_m * dt, system->atoms[i].f );
-        /*fprintf( stderr, "%6d   %15.8f %15.8f %15.8f   %15.8f %15.8f %15.8f\n", 
-          workspace->orig_id[i], 
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * -F_CONV * inv_m * dt, system->atoms[i].f );
+        /*fprintf( stderr, "%6d   %15.8f %15.8f %15.8f   %15.8f %15.8f %15.8f\n",
+          workspace->orig_id[i],
           system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2],
-          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0], 
-          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], 
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0],
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1],
           0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */
     }
-
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet1 - " );
 #endif
 
-    Reallocate( system, workspace, lists, renbr ); 
+    Reallocate( system, workspace, lists, renbr );
     Reset( system, control, data, workspace, lists );
-    if( renbr ) {
+    if ( renbr )
+    {
         Update_Grid( system );
-        Generate_Neighbor_Lists( system, control, data, workspace, 
-                lists, out_control );
+        Generate_Neighbor_Lists( system, control, data, workspace,
+                                 lists, out_control );
     }
     Compute_Forces( system, control, data, workspace, lists, out_control );
 
     /* velocity verlet, 2nd part */
-    for( i = 0; i < system->N; i++ ) {
+    for ( i = 0; i < system->N; i++ )
+    {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
         /* Compute v(t + dt) */
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
-        /* fprintf( stderr, "%6d   %15f %15f %15f   %15.8f %15.8f %15.8f\n", 
-           workspace->orig_id[i], 
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        /* fprintf( stderr, "%6d   %15f %15f %15f   %15.8f %15.8f %15.8f\n",
+           workspace->orig_id[i],
            system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
-           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0], 
-           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1], 
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0],
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1],
            0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[2] );*/
     }
-    //Compute_Kinetic_Energy( system, data );   
+    //TODO: Compute_Kinetic_Energy was commented out in the GPU version; find out why before enabling it when HAVE_CUDA is defined
+#ifndef HAVE_CUDA
+    Compute_Kinetic_Energy( system, data );
+#endif
     Compute_Pressure_Isotropic( system, control, data, out_control );
-
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet2 - " );
 #endif
 
     /* pressure scaler */
-    for( d = 0; d < 3; ++d ){
-        mu[d] = POW( 1.0+(dt/control->Tau_P[d])*(data->tot_press[d]-control->P[d]),
-                1.0 / 3 );
-        if( mu[d] < MIN_dV ) 
+    for ( d = 0; d < 3; ++d )
+    {
+        mu[d] = POW( 1.0 + (dt / control->Tau_P[d]) * (data->tot_press[d] - control->P[d]),
+                     1.0 / 3 );
+        if ( mu[d] < MIN_dV )
+        {
             mu[d] = MIN_dV;
-        else if( mu[d] > MAX_dV )
+        }
+        else if ( mu[d] > MAX_dV )
+        {
             mu[d] = MAX_dV;
+        }
     }
 
     /* temperature scaler */
     lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-    if( lambda < MIN_dT )
+    if ( lambda < MIN_dT )
+    {
         lambda = MIN_dT;
+    }
     else if (lambda > MAX_dT )
+    {
         lambda = MAX_dT;
+    }
     lambda = SQRT( lambda );
 
     /* Scale velocities and positions at t+dt */
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         rvec_Scale( system->atoms[i].v, lambda, system->atoms[i].v );
-        /* IMPORTANT: What Adri does with scaling positions first to 
-           unit coordinates and then back to cartesian coordinates essentially 
-           is scaling the coordinates with mu^2. However, this causes unphysical 
+        /* IMPORTANT: What Adri does with scaling positions first to
+           unit coordinates and then back to cartesian coordinates essentially
+           is scaling the coordinates with mu^2. However, this causes unphysical
            modifications on the system because box dimensions
            are being scaled with mu! We need to discuss this with Adri! */
-        for( d = 0; d < 3; ++d )
+        for ( d = 0; d < 3; ++d )
             system->atoms[i].x[d] = system->atoms[i].x[d] * mu[d];
     }
-    //Compute_Kinetic_Energy( system, data );
-#if defined(DEBUG_FOCUS)  
+    //TODO: Compute_Kinetic_Energy was commented out in the GPU version; find out why before enabling it when HAVE_CUDA is defined
+#ifndef HAVE_CUDA
+    Compute_Kinetic_Energy( system, data );
+#endif
+
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "scaling - " );
 #endif
 
     Update_Box_SemiIsotropic( &(system->box), mu );
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "updated box & grid\n" );
 #endif
 }
 
 
+
 /************************************************/
 /* BELOW FUNCTIONS ARE NOT BEING USED ANYMORE!  */
 /*                                              */
 /*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
 /*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
 /************************************************/
+
 #ifdef ANISOTROPIC
 
-void Velocity_Verlet_Nose_Hoover_NVT(reax_system* system, 
-        control_params* control, 
-        simulation_data *data,
-        static_storage *workspace, 
-        list **lists, 
-        output_controls *out_control )
+void Velocity_Verlet_Nose_Hoover_NVT(reax_system* system,
+                                     control_params* control,
+                                     simulation_data *data,
+                                     static_storage *workspace,
+                                     list **lists,
+                                     output_controls *out_control )
 {
     int i;
     real inv_m;
@@ -473,73 +476,77 @@ void Velocity_Verlet_Nose_Hoover_NVT(reax_system* system,
     real dt_sqr = SQR(dt);
     rvec dx;
 
-    for (i=0; i < system->N; i++)
+    for (i = 0; i < system->N; i++)
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
         // Compute x(t + dt)
-        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
-                0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledSum( dx, dt, system->atoms[i].v,
+                        0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
         Inc_on_T3_Gen( system->atoms[i].x, dx, &(system->box) );
 
         // Compute v(t + dt/2)
-        rvec_ScaledAdd( system->atoms[i].v, 
-                -0.5 * dt * data->therm.xi, system->atoms[i].v );
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledAdd( system->atoms[i].v,
+                        -0.5 * dt * data->therm.xi, system->atoms[i].v );
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
     }
 
     // Compute zeta(t + dt/2), E_Kininetic(t + dt/2)
     // IMPORTANT: What will be the initial value of zeta? and what is g?
-    data->therm.xi += 0.5 * dt * control->Tau_T  * 
-        ( 2.0 * data->E_Kin - data->N_f * K_B * control->T );
+    data->therm.xi += 0.5 * dt * control->Tau_T  *
+                      ( 2.0 * data->E_Kin - data->N_f * K_B * control->T );
 
     Reset( system, control, data, workspace );
-    fprintf(out_control->log,"reset-"); fflush( out_control->log );
+    fprintf(out_control->log, "reset-");
+    fflush( out_control->log );
 
-    Generate_Neighbor_Lists( system, control, data, workspace, 
-            lists, out_control );
-    fprintf(out_control->log,"nbrs-"); fflush( out_control->log );
+    Generate_Neighbor_Lists( system, control, data, workspace,
+                             lists, out_control );
+    fprintf(out_control->log, "nbrs-");
+    fflush( out_control->log );
 
     /* QEq( system, control, workspace, lists[FAR_NBRS], out_control );
        fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */
 
     Compute_Forces( system, control, data, workspace, lists, out_control );
-    fprintf(out_control->log,"forces\n"); fflush( out_control->log );
+    fprintf(out_control->log, "forces\n");
+    fflush( out_control->log );
 
-    //Compute_Kinetic_Energy( system, data );
+    //TODO: Compute_Kinetic_Energy was commented out in the GPU version; find out why before enabling it when HAVE_CUDA is defined
+#ifndef HAVE_CUDA
+    Compute_Kinetic_Energy( system, data );
+#endif
 
-    for( i = 0; i < system->N; i++ )
+    for ( i = 0; i < system->N; i++ )
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
         // compute v(t + dt)
-        rvec_ScaledAdd( system->atoms[i].v, 
-                -0.5 * dt * data->therm.xi, system->atoms[i].v );
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledAdd( system->atoms[i].v,
+                        -0.5 * dt * data->therm.xi, system->atoms[i].v );
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
     }
 
     // Compute zeta(t + dt)
-    data->therm.xi += 0.5*dt * control->Tau_T  * ( 2.0 * data->E_Kin - 
-            data->N_f * K_B * control->T );
+    data->therm.xi += 0.5 * dt * control->Tau_T  * ( 2.0 * data->E_Kin -
+                      data->N_f * K_B * control->T );
 
-    fprintf( out_control->log,"Xi: %8.3f %8.3f %8.3f\n", 
-            data->therm.xi, data->E_Kin, data->N_f * K_B * control->T ); 
+    fprintf( out_control->log, "Xi: %8.3f %8.3f %8.3f\n",
+             data->therm.xi, data->E_Kin, data->N_f * K_B * control->T );
     fflush( out_control->log );
 }
 
 
-void Velocity_Verlet_Isotropic_NPT( reax_system* system, 
-        control_params* control, 
-        simulation_data *data,
-        static_storage *workspace, 
-        list **lists, 
+
+void Velocity_Verlet_Isotropic_NPT( reax_system* system, control_params* control,
+        simulation_data *data, static_storage *workspace, list **lists,
         output_controls *out_control )
 {
     int i, itr;
-    real deps, v_eps_new=0, v_eps_old=0, G_xi_new;
-    real dxi, v_xi_new=0, v_xi_old=0, a_eps_new;
+    real deps, v_eps_new = 0, v_eps_old = 0, G_xi_new;
+    real dxi, v_xi_new = 0, v_xi_old = 0, a_eps_new;
     real inv_m, exp_deps, inv_3V;
     real E_kin, P_int, P_int_const;
     real coef_v, coef_v_eps;
@@ -552,37 +559,37 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system,
 
     // Here we just calculate how much to increment eps, xi, v_eps, v_xi.
     // Commits are done after positions and velocities of atoms are updated
-    // because position, velocity updates uses v_eps, v_xi terms; 
-    // yet we need EXP( deps ) to be able to calculate 
-    // positions and velocities accurately.  
-    iso_bar->a_eps = control->Tau_P * 
-        ( 3.0 * box->volume * (iso_bar->P - control->P) + 
-          6.0 * data->E_Kin / data->N_f ) - iso_bar->v_eps * therm->v_xi;
+    // because position, velocity updates uses v_eps, v_xi terms;
+    // yet we need EXP( deps ) to be able to calculate
+    // positions and velocities accurately.
+    iso_bar->a_eps = control->Tau_P *
+            ( 3.0 * box->volume * (iso_bar->P - control->P) +
+            6.0 * data->E_Kin / data->N_f ) - iso_bar->v_eps * therm->v_xi;
     deps = dt * iso_bar->v_eps + 0.5 * dt_sqr * iso_bar->a_eps;
     exp_deps = EXP( deps );
 
-    therm->G_xi = control->Tau_T * ( 2.0 * data->E_Kin + 
-            SQR( iso_bar->v_eps ) / control->Tau_P - 
-            (data->N_f +1) * K_B * control->T );
+    therm->G_xi = control->Tau_T * ( 2.0 * data->E_Kin +
+            SQR( iso_bar->v_eps ) / control->Tau_P -
+            (data->N_f + 1) * K_B * control->T );
     dxi = therm->v_xi * dt + 0.5 * therm->G_xi * dt_sqr;
 
-    fprintf(out_control->log, "a: %12.6f   eps: %12.6f   deps: %12.6f\n", 
+    fprintf(out_control->log, "a: %12.6f   eps: %12.6f   deps: %12.6f\n",
             iso_bar->a_eps, iso_bar->v_eps, iso_bar->eps);
-    fprintf(out_control->log, "G: %12.6f   xi : %12.6f   dxi : %12.6f\n", 
+    fprintf(out_control->log, "G: %12.6f   xi : %12.6f   dxi : %12.6f\n",
             therm->G_xi, therm->v_xi, therm->xi );
 
     // Update positions and velocities
-    // NOTE: v_old, v_xi_old, v_eps_old are meant to be the old values 
+    // NOTE: v_old, v_xi_old, v_eps_old are meant to be the old values
     // in the iteration not the old values at time t or before!
-    for (i=0; i < system->N; i++)
+    for (i = 0; i < system->N; i++)
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
         // Compute x(t + dt)
-        rvec_ScaledSum( workspace->a[i], -F_CONV * inv_m, system->atoms[i].f, 
-                -( (2.0 + 3.0/data->N_f) * iso_bar->v_eps + therm->v_xi ),
-                system->atoms[i].v );
-        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
+        rvec_ScaledSum( workspace->a[i], -F_CONV * inv_m, system->atoms[i].f,
+                        -( (2.0 + 3.0 / data->N_f) * iso_bar->v_eps + therm->v_xi ),
+                        system->atoms[i].v );
+        rvec_ScaledSum( dx, dt, system->atoms[i].v,
                 0.5 * dt_sqr, workspace->a[i] );
         Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
         rvec_Scale( system->atoms[i].x, exp_deps, system->atoms[i].x );
@@ -597,39 +604,40 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system,
 
     // Calculate new forces, f(t + dt)
     Reset( system, control, data, workspace );
-    fprintf(out_control->log,"reset-"); fflush( out_control->log );
+    fprintf(out_control->log, "reset-");
+    fflush( out_control->log );
 
-    Generate_Neighbor_Lists( system, control, data, workspace, 
-            lists, out_control );
-    fprintf(out_control->log,"nbrs-"); fflush( out_control->log );
+    Generate_Neighbor_Lists( system, control, data, workspace,
+                             lists, out_control );
+    fprintf(out_control->log, "nbrs-");
+    fflush( out_control->log );
 
     /* QEq( system, control, workspace, lists[FAR_NBRS], out_control );
        fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */
 
     Compute_Forces( system, control, data, workspace, lists, out_control );
-    fprintf(out_control->log,"forces\n"); fflush( out_control->log );
-
+    fprintf(out_control->log, "forces\n");
+    fflush( out_control->log );
 
     // Compute iteration constants for each atom's velocity and for P_internal
     // Compute kinetic energy for initial velocities of the iteration
     P_int_const = E_kin = 0;
-    for( i = 0; i < system->N; ++i )
+    for ( i = 0; i < system->N; ++i )
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
-        rvec_ScaledSum( dv, 0.5 * dt, workspace->a[i], 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledSum( dv, 0.5 * dt, workspace->a[i],
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
         rvec_Add( dv, system->atoms[i].v );
         rvec_Scale( workspace->v_const[i], exp_deps, dv );
 
-        P_int_const += ( -F_CONV * 
-                rvec_Dot( system->atoms[i].f, system->atoms[i].x ) );
+        P_int_const += ( -F_CONV *
+                         rvec_Dot( system->atoms[i].f, system->atoms[i].x ) );
 
-        E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass * 
-                rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
+        E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass *
+                  rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
     }
 
-
     // Compute initial p_int
     inv_3V = 1.0 / (3.0 * system->box.volume);
     P_int = inv_3V * ( 2.0 * E_kin + P_int_const );
@@ -645,42 +653,38 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system,
         v_xi_old = v_xi_new;
         v_eps_old = v_eps_new;
 
-
-        for( i = 0; i < system->N; ++i )
+        for ( i = 0; i < system->N; ++i )
         {
-            coef_v = 1.0 / (1.0 + 0.5 * dt * exp_deps * 
-                    ( (2.0 + 3.0/data->N_f) * v_eps_old + v_xi_old ) );
+            coef_v = 1.0 / (1.0 + 0.5 * dt * exp_deps *
+                    ( (2.0 + 3.0 / data->N_f) * v_eps_old + v_xi_old ) );
             rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] );
         }
 
-
         coef_v_eps = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
-        a_eps_new = 3.0 * control->Tau_P * 
-            ( system->box.volume * (P_int - control->P) + 2.0 * E_kin / data->N_f );
-        v_eps_new = coef_v_eps * ( iso_bar->v_eps + 
+        a_eps_new = 3.0 * control->Tau_P *
+                ( system->box.volume * (P_int - control->P) + 2.0 * E_kin / data->N_f );
+        v_eps_new = coef_v_eps * ( iso_bar->v_eps +
                 0.5 * dt * ( iso_bar->a_eps + a_eps_new ) );
 
-
-        G_xi_new = control->Tau_T * ( 2.0 * E_kin + 
-                SQR( v_eps_old ) / control->Tau_P - 
+        G_xi_new = control->Tau_T * ( 2.0 * E_kin +
+                SQR( v_eps_old ) / control->Tau_P -
                 (data->N_f + 1) * K_B * control->T );
         v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
 
-
         E_kin = 0;
-        for( i = 0; i < system->N; ++i )
-            E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass * 
-                    rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
-
-        P_int = inv_3V * ( 2.0*E_kin + P_int_const );
+        for ( i = 0; i < system->N; ++i )
+        {
+            E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass *
+                      rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
+        }
 
+        P_int = inv_3V * ( 2.0 * E_kin + P_int_const );
 
-        fprintf( out_control->log, 
-                "itr %d E_kin: %8.3f veps_n:%8.3f veps_o:%8.3f vxi_n:%8.3f vxi_o: %8.3f\n", 
-                itr, E_kin, v_eps_new, v_eps_old, v_xi_new, v_xi_old );
+        fprintf( out_control->log,
+               "itr %d E_kin: %8.3f veps_n:%8.3f veps_o:%8.3f vxi_n:%8.3f vxi_o: %8.3f\n",
+               itr, E_kin, v_eps_new, v_eps_old, v_xi_new, v_xi_old );
     }
-    while( fabs(v_eps_new - v_eps_old) + fabs(v_xi_new - v_xi_old) > 2e-3 );
-
+    while ( FABS(v_eps_new - v_eps_old) + fabs(v_xi_new - v_xi_old) > 2e-3 );
 
     therm->v_xi_old = therm->v_xi;
     therm->v_xi = v_xi_new;
@@ -690,36 +694,30 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system,
     iso_bar->v_eps = v_eps_new;
     iso_bar->a_eps = a_eps_new;
 
-    fprintf( out_control->log, "V: %8.3ff\tsides{%8.3f, %8.3f, %8.3f}\n", 
-            system->box.volume, 
-            system->box.box[0][0],system->box.box[1][1],system->box.box[2][2] );
-    fprintf(out_control->log,"eps:\ta- %8.3f  v- %8.3f  eps- %8.3f\n", 
+    fprintf( out_control->log, "V: %8.3ff\tsides{%8.3f, %8.3f, %8.3f}\n",
+             system->box.volume,
+             system->box.box[0][0], system->box.box[1][1], system->box.box[2][2] );
+    fprintf(out_control->log, "eps:\ta- %8.3f  v- %8.3f  eps- %8.3f\n",
             iso_bar->a_eps, iso_bar->v_eps, iso_bar->eps);
-    fprintf(out_control->log,"xi: \tG- %8.3f  v- %8.3f  xi - %8.3f\n", 
+    fprintf(out_control->log, "xi: \tG- %8.3f  v- %8.3f  xi - %8.3f\n",
             therm->G_xi, therm->v_xi, therm->xi);
 }
 
 #endif
 
 
-/* uses Berendsen-type coupling for both T and P. 
-   All box dimensions are scaled by the same amount, 
+/* uses Berendsen-type coupling for both T and P.
+   All box dimensions are scaled by the same amount,
    there is no change in the angles between axes. */
-void Velocity_Verlet_Berendsen_NVT( reax_system* system,
-        control_params* control,
-        simulation_data *data,
-        static_storage *workspace,
-        list **lists,
-        output_controls *out_control
-        )
+void Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
     int i, steps, renbr;
     real inv_m, dt, lambda;
     rvec dx;
     reax_atom *atom;
 
-    fprintf (stderr, " Velocity_Verlet_Berendsen_NVT: step :%d \n", data->step);
-
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "step%d\n", data->step );
 #endif
@@ -729,12 +727,19 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system,
     renbr = (steps % control->reneighbor == 0);
 
     /* velocity verlet, 1st part */
-    for( i = 0; i < system->N; i++ ) {
+    for ( i = 0; i < system->N; i++ )
+    {
         atom = &(system->atoms[i]);
         inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
         /* Compute x(t + dt) */
         rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
-        rvec_Add( atom->x, dx );
+
+        //TODO: used rvec_Add in GPU version -- which is correct?
+        /* bNVT fix - Metin's suggestion */
+        /* ORIGINAL CHANGE -- CHECK THE branch serial-bnvt for the fix */
+        //rvec_Add( atom->x, dx );
+        Inc_on_T3( atom->x, dx, &( system->box ) );
+
         /* Compute v(t + dt/2) */
         rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
     }
@@ -746,42 +751,50 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system,
     Reallocate( system, workspace, lists, renbr );
     Reset( system, control, data, workspace, lists );
 
-    if( renbr )
+    if ( renbr )
+    {
         Generate_Neighbor_Lists( system, control, data, workspace, lists, out_control );
+    }
 
-    Compute_Forces( system, control, data, workspace,
-            lists, out_control );
+    Compute_Forces( system, control, data, workspace, lists, out_control );
 
     /* velocity verlet, 2nd part */
-    for( i = 0; i < system->N; i++ ) {
+    for ( i = 0; i < system->N; i++ )
+    {
         atom = &(system->atoms[i]);
         inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
         /* Compute v(t + dt) */
         rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
     }
 
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf(stderr, "step%d: verlet2 done\n", data->step);
 #endif
 
     /* temperature scaler */
     Compute_Kinetic_Energy( system, data );
     lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-    if( lambda < MIN_dT )
+    if ( lambda < MIN_dT )
+    {
         lambda = MIN_dT;
+    }
     else if (lambda > MAX_dT )
+    {
         lambda = MAX_dT;
+    }
     lambda = SQRT( lambda );
 
+    fprintf( stderr, "step:%d lambda -> %f \n", data->step, lambda );
+
     /* Scale velocities and positions at t+dt */
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         atom = &(system->atoms[i]);
         rvec_Scale( atom->v, lambda, atom->v );
     }
     Compute_Kinetic_Energy( system, data );
 
-#if defined(DEBUG_FOCUS)  
-    fprintf( stderr, "step%d: scaled velocities\n",
-            data->step );
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "step%d: scaled velocities\n", data->step );
 #endif
 }
diff --git a/PuReMD-GPU/src/integrate.h b/PuReMD-GPU/src/integrate.h
index 6f5848f0de84e8a50ef2c5090194618b61f185fc..55b36c559f96d5e3d45ab69177e26f0d4136d8e1 100644
--- a/PuReMD-GPU/src/integrate.h
+++ b/PuReMD-GPU/src/integrate.h
@@ -23,29 +23,30 @@
 
 #include "mytypes.h"
 
+
 void Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*,
         static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Nose_Hoover_NVT( reax_system*, control_params*,
-        simulation_data*, static_storage*,
-        list**, output_controls* );
+        simulation_data*, static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*,
-        simulation_data*, static_storage*,
-        list**, output_controls* );
+        simulation_data*, static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Flexible_NPT( reax_system*, control_params*,
-        simulation_data*, static_storage*,
-        list**, output_controls* );
+        simulation_data*, static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Isotropic_NPT( reax_system*, control_params*,
-        simulation_data*, static_storage*,
-        list**, output_controls* );
+        simulation_data*, static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system*, control_params*,
-        simulation_data*, static_storage*,
-        list**, output_controls* );
+        simulation_data*, static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system*, control_params*,
-        simulation_data*,
-        static_storage*, list**,
-        output_controls* );
+        simulation_data*, static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Berendsen_NVT( reax_system* , control_params* ,
-        simulation_data *, static_storage *,
-        list **, output_controls * );
+        simulation_data *, static_storage *, list **, output_controls * );
+
 
 #endif
diff --git a/PuReMD-GPU/src/lin_alg.c b/PuReMD-GPU/src/lin_alg.c
index cb141d475b0e2cf702901ed551287e0e238cdcd6..fb1e25bb83273730fcd74f82acad76ec7e5336e1 100644
--- a/PuReMD-GPU/src/lin_alg.c
+++ b/PuReMD-GPU/src/lin_alg.c
@@ -1,319 +1,1654 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
 #include "lin_alg.h"
 
+#include "allocate.h"
 #include "list.h"
+#include "print_utils.h"
+#include "tool_box.h"
 #include "vector.h"
-#include "index_utils.h"
 
 
-void Sparse_MatVec( sparse_matrix *A, real *x, real *b )
+typedef enum /* selects which triangular factor (lower or upper) a solve routine operates on */
+{
+    LOWER = 0,
+    UPPER = 1,
+} TRIANGULARITY;
+
+
+/* global to make OpenMP shared (Sparse_MatVec): per-thread partial results, omp_get_num_threads() * n reals */
+#ifdef _OPENMP
+real *b_local = NULL;
+#endif
+/* global to make OpenMP shared (apply_preconditioner) */
+real *Dinv_L = NULL, *Dinv_U = NULL;
+/* global to make OpenMP shared (tri_solve_level_sched); *_L / *_U keep the level data per factor between calls */
+int levels = 1;
+int levels_L = 1, levels_U = 1;
+unsigned int *row_levels_L = NULL, *level_rows_L = NULL, *level_rows_cnt_L = NULL;
+unsigned int *row_levels_U = NULL, *level_rows_U = NULL, *level_rows_cnt_U = NULL;
+unsigned int *row_levels, *level_rows, *level_rows_cnt; /* aliases of the _L or _U set currently in use */
+unsigned int *top = NULL; /* scratch: next insertion offset per level while level_rows is filled */
+/* global to make OpenMP shared (graph_coloring) */
+unsigned int *color = NULL;
+unsigned int *to_color = NULL;
+unsigned int *conflict = NULL;
+unsigned int *temp_ptr;
+unsigned int *recolor = NULL;
+unsigned int recolor_cnt;
+unsigned int *color_top = NULL;
+/* global to make OpenMP shared (sort_colors) */
+unsigned int *permuted_row_col = NULL;
+unsigned int *permuted_row_col_inv = NULL;
+real *y_p = NULL;
+/* global to make OpenMP shared (permute_vector) */
+real *x_p = NULL;
+unsigned int *mapping = NULL;
+sparse_matrix *H_full;
+sparse_matrix *H_p;
+/* global to make OpenMP shared (jacobi_iter) */
+real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL;
+
+
+/* sparse matrix-vector product b = A*x for a symmetric A
+ * where:
+ *   A: symmetric matrix, only the lower triangle stored (CSR format, diagonal last in each row)
+ *   x: vector
+ *   b: vector (result; zeroed first, then accumulated into) */
+static void Sparse_MatVec( const sparse_matrix * const A,
+        const real * const x, real * const b )
 {
     int i, j, k, n, si, ei;
     real H;
+#ifdef _OPENMP
+    unsigned int tid;
+#endif
 
     n = A->n;
-    for( i = 0; i < n; ++i )
-        b[i] = 0;
+    Vector_MakeZero( b, n );
+
+#ifdef _OPENMP
+    tid = omp_get_thread_num();
+
+    #pragma omp master
+    {
+
+        /* keep b_local for program duration to avoid allocate/free overhead per
+         * Sparse_MatVec call; it is sized once, from the first call's n and thread count */
+        if ( b_local == NULL )
+        {
+            if ( (b_local = (real*) malloc( omp_get_num_threads() * n * sizeof(real))) == NULL )
+            {
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+    }
+
+    #pragma omp barrier
+    /* NOTE(review): every thread zeroes the whole buffer here -- presumably Vector_MakeZero is work-shared; confirm */
+    Vector_MakeZero( (real * const)b_local, omp_get_num_threads() * n );
+
+#endif
+    #pragma omp for schedule(static)
+    for ( i = 0; i < n; ++i )
+    {
+        si = A->start[i];
+        ei = A->start[i + 1] - 1;
+
+        for ( k = si; k < ei; ++k )
+        {
+            j = A->j[k];
+            H = A->val[k];
+#ifdef _OPENMP
+            b_local[tid * n + j] += H * x[i];
+            b_local[tid * n + i] += H * x[j];
+#else
+            b[j] += H * x[i];
+            b[i] += H * x[j];
+#endif
+        }
+
+        // the diagonal entry is the last one in the row (index ei, where the loop above leaves k for a non-empty row)
+#ifdef _OPENMP
+        b_local[tid * n + i] += A->val[k] * x[i];
+#else
+        b[i] += A->val[k] * x[i];
+#endif
+    }
+#ifdef _OPENMP
+    #pragma omp for schedule(static)
+    for ( i = 0; i < n; ++i )
+    {
+        for ( j = 0; j < omp_get_num_threads(); ++j )
+        {
+            b[i] += b_local[j * n + i];
+        }
+    }
+#endif
+
+}
+
+
+/* Transpose A and store the result in A_t (the arrays of A are only read)
+ * Column indices of A must be < A->n: A_t->start is zeroed and indexed for A->n + 1 entries.
+ * A: input matrix, stored in CSR
+ * A_t: output, stored in CSR; its start, j and val arrays are written here without being allocated
+ */
+void Transpose( const sparse_matrix const *A, sparse_matrix const *A_t ) /* NOTE(review): duplicate const qualifier on A */
+{
+    unsigned int i, j, pj, *A_t_top;
+
+    if ( (A_t_top = (unsigned int*) calloc( A->n + 1, sizeof(unsigned int))) == NULL )
+    {
+        fprintf( stderr, "Not enough space for matrix tranpose. Terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    memset( A_t->start, 0, (A->n + 1) * sizeof(unsigned int) );
+
+    /* count nonzeros in each column of A (= each row of A^T), stored one slot higher (see next loop) */
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            ++A_t->start[A->j[pj] + 1];
+        }
+    }
+
+    /* prefix-sum the counts into row pointers for A^T; A_t_top[j] is the next free slot of row j */
+    for ( i = 1; i <= A->n; ++i )
+    {
+        A_t_top[i] = A_t->start[i] = A_t->start[i] + A_t->start[i - 1];
+    }
+
+    /* fill in A^T: entry (i, j) of A becomes entry (j, i) of A^T */
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            j = A->j[pj];
+            A_t->j[A_t_top[j]] = i;
+            A_t->val[A_t_top[j]] = A->val[pj];
+            ++A_t_top[j];
+        }
+    }
+
+    free( A_t_top );
+}
+
+
+/* Transpose A in-place (a temporary A_t receives the transpose and is copied back into A)
+ *
+ * A: stored in CSR; its start, j and val arrays are overwritten
+ */
+void Transpose_I( sparse_matrix * const A )
+{
+    sparse_matrix * A_t;
+    /* FIXME(review): A_t has no value yet and is passed by value below, so the call cannot assign it -- check Allocate_Matrix's prototype */
+    if ( Allocate_Matrix( A_t, A->n, A->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for transposing matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    Transpose( A, A_t );
+    /* copy back: n + 1 row pointers, then start[n] (= number of stored entries) columns and values */
+    memcpy( A->start, A_t->start, sizeof(int) * (A_t->n + 1) );
+    memcpy( A->j, A_t->j, sizeof(int) * (A_t->start[A_t->n]) );
+    memcpy( A->val, A_t->val, sizeof(real) * (A_t->start[A_t->n]) );
+
+    Deallocate_Matrix( A_t );
+}
+
+
+/* Apply diagonal inverse (Jacobi) preconditioner to system residual
+ * (element-wise x[i] = y[i] * Hdia_inv[i]; the loop is an orphaned omp for, shared among the calling team)
+ * Hdia_inv: diagonal inverse preconditioner (constructed using H)
+ * y: current residual
+ * x: preconditioned residual
+ * N: length of preconditioner and vectors (# rows in H)
+ */
+static void diag_pre_app( const real * const Hdia_inv, const real * const y,
+                          real * const x, const int N )
+{
+    int i; /* signed like N: an unsigned index would turn a negative N into a huge trip count */
+
+    #pragma omp for schedule(static)
+    for ( i = 0; i < N; ++i )
+    {
+        x[i] = y[i] * Hdia_inv[i];
+    }
+}
+
+
+/* Solve triangular system LU*x = y by sequential forward (LOWER) / backward (UPPER) substitution
+ * The whole solve runs inside an omp master block; no barrier is issued in this function.
+ * LU: lower/upper triangular, stored in CSR
+ * y: constants in linear system (RHS)
+ * x: solution
+ * tri: triangularity of LU (lower/upper)
+ * The diagonal is read from the last entry of each row for LOWER and from the first entry for UPPER.
+ * Assumptions:
+ *   LU has non-zero diagonals
+ *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
+static void tri_solve( const sparse_matrix * const LU, const real * const y,
+                       real * const x, const TRIANGULARITY tri )
+{
+    int i, pj, j, si, ei;
+    real val;
+
+    #pragma omp master
+    {
+        if ( tri == LOWER )
+        {
+            for ( i = 0; i < LU->n; ++i )
+            {
+                x[i] = y[i];
+                si = LU->start[i];
+                ei = LU->start[i + 1];
+                for ( pj = si; pj < ei - 1; ++pj )
+                {
+                    j = LU->j[pj];
+                    val = LU->val[pj];
+                    x[i] -= val * x[j];
+                }
+                x[i] /= LU->val[pj]; /* pj == ei - 1 here: last entry of the row, used as the diagonal */
+            }
+        }
+        else
+        {
+            for ( i = LU->n - 1; i >= 0; --i )
+            {
+                x[i] = y[i];
+                si = LU->start[i];
+                ei = LU->start[i + 1];
+                for ( pj = si + 1; pj < ei; ++pj )
+                {
+                    j = LU->j[pj];
+                    val = LU->val[pj];
+                    x[i] -= val * x[j];
+                }
+                x[i] /= LU->val[si]; /* first entry of the row, used as the diagonal */
+            }
+        }
+    }
+}
+
+
+/* Solve triangular system LU*x = y using level scheduling
+ * Rows of one level are processed by a work-shared omp for; levels are processed in increasing order.
+ * LU: lower/upper triangular, stored in CSR
+ * y: constants in linear system (RHS)
+ * x: solution
+ * tri: triangularity of LU (lower/upper)
+ * find_levels: perform level search if TRUE, otherwise reuse the levels kept from an earlier call
+ * Level data lives in file-scope arrays, kept separately for LOWER and UPPER between calls.
+ * Assumptions:
+ *   LU has non-zero diagonals
+ *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
+static void tri_solve_level_sched( const sparse_matrix * const LU, const real * const y,
+                                   real * const x, const TRIANGULARITY tri, int find_levels )
+{
+    int i, j, pj, local_row, local_level;
+
+    #pragma omp master
+    {
+        if ( tri == LOWER )
+        {
+            row_levels = row_levels_L;
+            level_rows = level_rows_L;
+            level_rows_cnt = level_rows_cnt_L;
+            levels = levels_L;
+        }
+        else
+        {
+            row_levels = row_levels_U;
+            level_rows = level_rows_U;
+            level_rows_cnt = level_rows_cnt_U;
+            levels = levels_U;
+        }
+
+        if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL )
+        {
+            if ( (row_levels = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL
+                    || (level_rows = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL
+                    || (level_rows_cnt = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL )
+            {
+                fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+
+        if ( top == NULL )
+        {
+            if ( (top = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL )
+            {
+                fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+
+        /* find levels (row dependencies in substitutions) */
+        if ( find_levels == TRUE )
+        {
+            memset( row_levels, 0, LU->n * sizeof(unsigned int) );
+            memset( level_rows_cnt, 0, (LU->n + 1) * sizeof(unsigned int) ); /* all n + 1 slots: index levels (<= n) is incremented below */
+            memset( top, 0, (LU->n + 1) * sizeof(unsigned int) ); /* matches the n + 1 entries allocated above */
+            levels = 1;
+
+            if ( tri == LOWER )
+            {
+                for ( i = 0; i < LU->n; ++i )
+                {
+                    local_level = 1;
+                    for ( pj = LU->start[i]; pj < LU->start[i + 1] - 1; ++pj )
+                    {
+                        local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
+                    }
+
+                    levels = MAX( levels, local_level );
+                    row_levels[i] = local_level;
+                    ++level_rows_cnt[local_level];
+                }
+
+//#if defined(DEBUG)
+                fprintf(stderr, "levels(L): %d\n", levels);
+                fprintf(stderr, "NNZ(L): %d\n", LU->start[LU->n]);
+//#endif
+            }
+            else
+            {
+                for ( i = LU->n - 1; i >= 0; --i )
+                {
+                    local_level = 1;
+                    for ( pj = LU->start[i] + 1; pj < LU->start[i + 1]; ++pj )
+                    {
+                        local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
+                    }
+
+                    levels = MAX( levels, local_level );
+                    row_levels[i] = local_level;
+                    ++level_rows_cnt[local_level];
+                }
+
+//#if defined(DEBUG)
+                fprintf(stderr, "levels(U): %d\n", levels);
+                fprintf(stderr, "NNZ(U): %d\n", LU->start[LU->n]);
+//#endif
+            }
+
+            for ( i = 1; i < levels + 1; ++i ) /* prefix sum: level_rows_cnt[i] = number of rows in levels 1..i */
+            {
+                level_rows_cnt[i] += level_rows_cnt[i - 1];
+                top[i] = level_rows_cnt[i];
+            }
+
+            for ( i = 0; i < LU->n; ++i ) /* bucket the rows by level; top[l - 1] is the next free slot of level l */
+            {
+                level_rows[top[row_levels[i] - 1]] = i;
+                ++top[row_levels[i] - 1];
+            }
+        }
+    }
+
+    #pragma omp barrier
+
+    /* perform substitutions by level */
+    if ( tri == LOWER )
+    {
+        for ( i = 0; i < levels; ++i )
+        {
+            #pragma omp for schedule(static)
+            for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
+            {
+                local_row = level_rows[j];
+                x[local_row] = y[local_row];
+                for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj )
+                {
+                    x[local_row] -= LU->val[pj] * x[LU->j[pj]];
+
+                }
+                x[local_row] /= LU->val[pj];
+            }
+        }
+    }
+    else
+    {
+        for ( i = 0; i < levels; ++i )
+        {
+            #pragma omp for schedule(static)
+            for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
+            {
+                local_row = level_rows[j];
+                x[local_row] = y[local_row];
+                for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj )
+                {
+                    x[local_row] -= LU->val[pj] * x[LU->j[pj]];
+
+                }
+                x[local_row] /= LU->val[LU->start[local_row]];
+            }
+        }
+    }
+
+    #pragma omp master
+    {
+        /* save level info for re-use if performing repeated triangular solves via preconditioning */
+        if ( tri == LOWER )
+        {
+            row_levels_L = row_levels;
+            level_rows_L = level_rows;
+            level_rows_cnt_L = level_rows_cnt;
+            levels_L = levels;
+        }
+        else
+        {
+            row_levels_U = row_levels;
+            level_rows_U = level_rows;
+            level_rows_cnt_U = level_rows_cnt;
+            levels_U = levels;
+        }
+    }
+
+    #pragma omp barrier
+}
+
+
+static void compute_H_full( const sparse_matrix * const H ) /* fills the file-scope H_full from H and its transpose */
+{
+    int count, i, pj;
+    sparse_matrix *H_t;
+    /* FIXME(review): H_t has no value yet and is passed by value below, so the call cannot assign it -- check Allocate_Matrix's prototype */
+    if ( Allocate_Matrix( H_t, H->n, H->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for full H. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* H_t = H^T, which supplies the upper triangular entries merged in below */
+    Transpose( H, H_t );
+
+    count = 0;
+    for ( i = 0; i < H->n; ++i )
+    {
+        H_full->start[i] = count;
+
+        /* H: symmetric, lower triangular portion only stored */
+        for ( pj = H->start[i]; pj < H->start[i + 1]; ++pj )
+        {
+            H_full->val[count] = H->val[pj];
+            H_full->j[count] = H->j[pj];
+            ++count;
+        }
+        /* H^T: symmetric, upper triangular portion only stored;
+         * skip diagonal from H^T (first entry of its row), as included from H above */
+        for ( pj = H_t->start[i] + 1; pj < H_t->start[i + 1]; ++pj )
+        {
+            H_full->val[count] = H_t->val[pj];
+            H_full->j[count] = H_t->j[pj];
+            ++count;
+        }
+    }
+    H_full->start[i] = count; /* i == H->n here: closing row pointer, total number of entries written */
+
+    Deallocate_Matrix( H_t );
+}
+
+
+/* Iterative greedy shared-memory parallel graph coloring
+ *
+ * A: matrix to use for coloring, stored in CSR format;
+ *   rows represent vertices, columns of entries within a row represent adjacent vertices
+ *   (i.e., dependent rows for elimination during LU factorization)
+ * tri: triangularity of LU (lower/upper); selects the initial vertex ordering
+ *   (ascending for LOWER, descending otherwise)
+ *
+ * Output is written to file-scope storage instead of being returned:
+ *   color: vertex color (1-based; 0 marks a vertex which still needs a color)
+ *   to_color, conflict: queues of vertices to color in the current / next round
+ *   recolor_cnt: number of vertices in the current queue
+ *
+ * Each round, the queued vertices are colored in parallel without locking,
+ * after which the master thread detects adjacent vertices sharing a color and
+ * re-queues the higher-numbered vertex of each such pair; rounds repeat until
+ * no conflicts remain.
+ *
+ * Terminates the program if a vertex would need a color >= MAX_COLOR, as the
+ * per-thread forbidden color table only holds MAX_COLOR entries.
+ *
+ * Reference:
+ * Umit V. Catalyurek et al.
+ * Graph Coloring Algorithms for Multi-core
+ *  and Massively Threaded Architectures
+ * Parallel Computing, 2012
+ */
+void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
+{
+    #pragma omp parallel
+    {
+#define MAX_COLOR (500)
+        int i, pj, v;
+        unsigned int temp;
+        int *fb_color;
+
+        /* reset coloring; initially every vertex is queued */
+        #pragma omp master
+        {
+            memset( color, 0, sizeof(unsigned int) * A->n );
+            recolor_cnt = A->n;
+        }
+
+        /* ordering of vertices to color depends on triangularity of factor
+         * for which coloring is to be used for */
+        if ( tri == LOWER )
+        {
+            #pragma omp for schedule(static)
+            for ( i = 0; i < A->n; ++i )
+            {
+                to_color[i] = i;
+            }
+        }
+        else
+        {
+            #pragma omp for schedule(static)
+            for ( i = 0; i < A->n; ++i )
+            {
+                to_color[i] = A->n - 1 - i;
+            }
+        }
+
+        /* per-thread table: fb_color[c] == v means color c is forbidden for vertex v */
+        if ( (fb_color = (int*) malloc(sizeof(int) * MAX_COLOR)) == NULL )
+        {
+            fprintf( stderr, "not enough memory for graph coloring. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+
+        /* make the master's initialization above visible before the first round */
+        #pragma omp barrier
+
+        while ( recolor_cnt > 0 )
+        {
+            /* vertex ids are non-negative, so -1 never matches a vertex */
+            memset( fb_color, -1, sizeof(int) * MAX_COLOR );
+
+            /* color vertices */
+            #pragma omp for schedule(static)
+            for ( i = 0; i < recolor_cnt; ++i )
+            {
+                v = to_color[i];
+
+                /* colors of adjacent vertices are forbidden */
+                for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
+                {
+                    if ( v != A->j[pj] )
+                    {
+                        fb_color[color[A->j[pj]]] = v;
+                    }
+                }
+
+                /* search for min. color which is not in conflict with adjacent vertices;
+                 * start at 1 since 0 is default (invalid) color for all vertices;
+                 * the search is bounded so it cannot run past the end of fb_color */
+                for ( pj = 1; pj < MAX_COLOR && fb_color[pj] == v; ++pj );
+
+                if ( pj >= MAX_COLOR )
+                {
+                    fprintf( stderr, "graph coloring requires more than %d colors. terminating.\n",
+                            MAX_COLOR - 1 );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+
+                /* assign discovered color (no conflict in neighborhood of adjacent vertices) */
+                color[v] = pj;
+            }
+
+            /* determine if recoloring required */
+            //TODO: switch to reduction on recolor_cnt (+) via parallel scan through recolor
+            #pragma omp master
+            {
+                temp = recolor_cnt;
+                recolor_cnt = 0;
+
+                for ( i = 0; i < temp; ++i )
+                {
+                    v = to_color[i];
+
+                    /* search for color conflicts with adjacent vertices */
+                    for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
+                    {
+                        if ( color[v] == color[A->j[pj]] && v > A->j[pj] )
+                        {
+                            conflict[recolor_cnt] = v;
+                            color[v] = 0;
+                            ++recolor_cnt;
+                            break;
+                        }
+                    }
+                }
+
+                /* vertices in conflict become the queue for the next round */
+                temp_ptr = to_color;
+                to_color = conflict;
+                conflict = temp_ptr;
+            }
+
+            #pragma omp barrier
+        }
+
+        free( fb_color );
+
+//#if defined(DEBUG)
+//    #pragma omp master
+//    {
+//        for ( i = 0; i < A->n; ++i )
+//            printf("Vertex: %5d, Color: %5d\n", i, color[i] );
+//    }
+//#endif
+
+        #pragma omp barrier
+    }
+}
+
+
+/* Order vertices by color (counting sort) and build the resulting permutation
+ *
+ * n: number of entries in coloring
+ * tri: coloring to triangular factor to use (lower/upper); not referenced here
+ *
+ * Reads the file-scope color array (1-based colors) and fills:
+ *   permuted_row_col: vertices grouped by color, ascending vertex index within a color
+ *   permuted_row_col_inv: for each vertex, its position in permuted_row_col
+ * color_top serves as scratch for the per-color counts / insertion offsets.
+ */
+void sort_colors( const unsigned int n, const TRIANGULARITY tri )
+{
+    unsigned int v, c, slot;
+
+    /* histogram of colors; entry 0 stays empty since colors are 1-based */
+    memset( color_top, 0, sizeof(unsigned int) * (n + 1) );
+    for ( v = 0; v < n; ++v )
+    {
+        color_top[color[v]] += 1;
+    }
+
+    /* running sum: afterwards color_top[c - 1] is the first slot of color c */
+    for ( c = 1; c < n + 1; ++c )
+    {
+        color_top[c] += color_top[c - 1];
+    }
+
+    /* drop each vertex into the next free slot of its color and record
+     * the inverse map (current row/column -> permuted row/column) alongside */
+    for ( v = 0; v < n; ++v )
+    {
+        c = color[v] - 1;
+        slot = color_top[c];
+        color_top[c] = slot + 1;
+
+        permuted_row_col[slot] = v;
+        permuted_row_col_inv[v] = slot;
+    }
+}
+
+
+/* Apply permutation Q^T*x or Q*x based on graph coloring
+ *
+ * x: vector to permute (in-place)
+ * n: number of entries in x
+ * invert_map: if TRUE, gather through permuted_row_col_inv (Q^T),
+ *   otherwise through permuted_row_col (Q)
+ * tri: coloring to triangular factor to use (lower/upper); not referenced here
+ *
+ * Written with orphaned OpenMP constructs (master / for / barrier), so it is
+ * presumably meant to be called by every thread of an enclosing parallel region.
+ * The file-scope scratch vector x_p is allocated on first use only, with the
+ * size requested by that first call.
+ */
+static void permute_vector( real * const x, const unsigned int n, const int invert_map,
+       const TRIANGULARITY tri )
+{
+    unsigned int k;
+
+    #pragma omp master
+    {
+        if ( x_p == NULL )
+        {
+            x_p = (real*) malloc(sizeof(real) * n);
+
+            if ( x_p == NULL )
+            {
+                fprintf( stderr, "not enough memory for permuting vector. terminating.\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+
+        /* pick direction of the permutation */
+        mapping = (invert_map == TRUE) ? permuted_row_col_inv : permuted_row_col;
+    }
+
+    /* scratch vector and mapping must be in place before any thread gathers */
+    #pragma omp barrier
+
+    /* gather into scratch, as x cannot be permuted while it is still being read */
+    #pragma omp for schedule(static)
+    for ( k = 0; k < n; ++k )
+    {
+        x_p[k] = x[mapping[k]];
+    }
+
+    /* copy result back into caller's vector */
+    #pragma omp master
+    {
+        memcpy( x, x_p, sizeof(real) * n );
+    }
+
+    #pragma omp barrier
+}
+
+
+/* Apply permutation Q^T*(LU)*Q based on graph coloring
+ *
+ * LU: matrix to permute, stored in CSR format; overwritten with the permuted matrix
+ * tri: triangularity of LU (lower/upper)
+ *
+ * Rows and columns are renumbered through permuted_row_col_inv. An entry whose
+ * renumbered position would fall on the wrong side of the diagonal is stored at
+ * the mirrored position (row and column exchanged), so the result stays triangular.
+ * color_top is re-used as scratch for row counts / insertion offsets.
+ */
+void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
+{
+    int cnt, i, pj, nr, nc, flip, dst_row, dst_col;
+    sparse_matrix *LUtemp;
+    const int lower = (tri == LOWER);
+
+    /* NOTE(review): LUtemp is handed to Allocate_Matrix without having been
+     * initialized; confirm against Allocate_Matrix's signature/contract */
+    if ( Allocate_Matrix( LUtemp, LU->n, LU->m ) == FAILURE )
+    {
+        fprintf( stderr, "Not enough space for graph coloring (factor permutation). Terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* count nonzeros in each row of permuted factor (re-use color_top for counting) */
+    memset( color_top, 0, sizeof(unsigned int) * (LU->n + 1) );
+
+    for ( i = 0; i < LU->n; ++i )
+    {
+        nr = permuted_row_col_inv[i];
+
+        for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
+        {
+            nc = permuted_row_col_inv[LU->j[pj]];
+
+            /* entries on the wrong side of the diagonal are counted
+             * in the mirrored row to maintain triangularity */
+            flip = lower ? (nc > nr) : (nc < nr);
+            dst_row = flip ? nc : nr;
+
+            ++color_top[dst_row + 1];
+        }
+    }
+
+    /* row counts -> row start offsets */
+    for ( i = 1; i < LU->n + 1; ++i )
+    {
+        color_top[i] += color_top[i - 1];
+    }
+
+    memcpy( LUtemp->start, color_top, sizeof(unsigned int) * (LU->n + 1) );
+
+    /* permute factor; original rows are visited in ascending order for a lower
+     * factor and in descending order otherwise, which fixes the order of
+     * entries within each permuted row */
+    for ( cnt = 0; cnt < LU->n; ++cnt )
+    {
+        i = lower ? cnt : LU->n - 1 - cnt;
+        nr = permuted_row_col_inv[i];
+
+        for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
+        {
+            nc = permuted_row_col_inv[LU->j[pj]];
+
+            /* correct entries to maintain triangularity */
+            flip = lower ? (nc > nr) : (nc < nr);
+            dst_row = flip ? nc : nr;
+            dst_col = flip ? nr : nc;
+
+            /* color_top[dst_row] is the next free slot of the destination row */
+            LUtemp->j[color_top[dst_row]] = dst_col;
+            LUtemp->val[color_top[dst_row]] = LU->val[pj];
+            ++color_top[dst_row];
+        }
+    }
+
+    /* overwrite caller's matrix with the permuted copy */
+    memcpy( LU->start, LUtemp->start, sizeof(unsigned int) * (LU->n + 1) );
+    memcpy( LU->j, LUtemp->j, sizeof(unsigned int) * LU->start[LU->n] );
+    memcpy( LU->val, LUtemp->val, sizeof(real) * LU->start[LU->n] );
+
+    Deallocate_Matrix( LUtemp );
+}
+
+
+/* Setup routines to build permuted QEq matrix H (via graph coloring),
+ *  used for preconditioning (incomplete factorizations computed based on
+ *  permuted H)
+ *
+ * H: symmetric, lower triangular portion only, stored in CSR format;
+ *  H itself is only read: a copy is permuted
+ *
+ * Returns the permuted copy, held in the file-scope matrix H_p.
+ *
+ * Work storage is allocated on the first call only (detected via color == NULL),
+ * with sizes taken from the H passed to that call.
+ */
+sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
+{
+    int alloc_failed;
+
+    if ( color == NULL )
+    {
+        /* internal storage for graph coloring (global to facilitate simultaneous access to OpenMP threads);
+         * allocations are attempted in order and stop at the first failure */
+        alloc_failed =
+            (color = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL
+            || (to_color = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL
+            || (conflict = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL
+            || (recolor = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL
+            || (color_top = (unsigned int*) malloc(sizeof(unsigned int) * (H->n + 1))) == NULL
+            || (permuted_row_col = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL
+            || (permuted_row_col_inv = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL
+            || (y_p = (real*) malloc(sizeof(real) * H->n)) == NULL
+            || Allocate_Matrix( H_p, H->n, H->m ) == FAILURE
+            || Allocate_Matrix( H_full, H->n, 2 * H->m - H->n ) == FAILURE;
+
+        if ( alloc_failed )
+        {
+            fprintf( stderr, "not enough memory for graph coloring. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+    }
+
+    /* color the graph of the full matrix, then turn the coloring into a permutation */
+    compute_H_full( H );
+    graph_coloring( H_full, LOWER );
+    sort_colors( H_full->n, LOWER );
+
+    /* duplicate H into H_p and permute the duplicate */
+    memcpy( H_p->start, H->start, sizeof(int) * (H->n + 1) );
+    memcpy( H_p->j, H->j, sizeof(int) * (H->start[H->n]) );
+    memcpy( H_p->val, H->val, sizeof(real) * (H->start[H->n]) );
+    permute_matrix( H_p, LOWER );
+
+    return H_p;
+}
+
+
+/* Jacobi iteration using truncated Neumann series: x_{k+1} = Gx_k + D^{-1}b
+ * where:
+ *   G = I - D^{-1}R
+ *   R = triangular matrix
+ *   D = diagonal matrix, diagonals from R
+ *
+ * R: triangular matrix, stored in CSR format
+ * Dinv: D^{-1}, stored as a vector with one entry per row of R
+ * b: right-hand side
+ * x: receives the final iterate
+ * tri: triangularity of R (lower/upper)
+ * maxiter: iteration count; the count is tested after the loop body,
+ *   so at least one iteration is always performed
+ *
+ * Note: used during the backsolves when applying preconditioners with
+ * triangular factors in iterative linear solvers
+ *
+ * Note: Neumann series arises from series expansion of the inverse of
+ * the coefficient matrix in the triangular system
+ *
+ * Note: the work vectors Dinv_b, rp, and rp2 are file-scope and allocated
+ * on first use only, sized by the R->n of that first call */
+static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
+        const real * const b, real * const x, const TRIANGULARITY tri, const
+        unsigned int maxiter )
+{
+    unsigned int i, k, si = 0, ei = 0, iter;
+
+    iter = 0;
+
+    /* one thread allocates the shared work vectors (first call only) */
+    #pragma omp master
+    {
+        if ( Dinv_b == NULL )
+        {
+            if ( (Dinv_b = (real*) malloc(sizeof(real) * R->n)) == NULL )
+            {
+                fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+        if ( rp == NULL )
+        {
+            if ( (rp = (real*) malloc(sizeof(real) * R->n)) == NULL )
+            {
+                fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+        if ( rp2 == NULL )
+        {
+            if ( (rp2 = (real*) malloc(sizeof(real) * R->n)) == NULL )
+            {
+                fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+    }
+
+    /* work vectors must exist before any thread touches them */
+    #pragma omp barrier
+
+    /* initial iterate x_0 = 0 */
+    Vector_MakeZero( rp, R->n );
+
+    /* precompute and cache, as invariant in loop below */
+    #pragma omp for schedule(static)
+    for ( i = 0; i < R->n; ++i )
+    {
+        Dinv_b[i] = Dinv[i] * b[i];
+    }
+
+    do
+    {
+        // x_{k+1} = G*x_{k} + Dinv*b;
+        #pragma omp for schedule(guided)
+        for ( i = 0; i < R->n; ++i )
+        {
+            /* select the entries of row i to sum over: the last entry of the
+             * row is left out for LOWER, the first entry otherwise
+             * (presumably the diagonal, which is applied through Dinv) */
+            if (tri == LOWER)
+            {
+                si = R->start[i];
+                ei = R->start[i + 1] - 1;
+            }
+            else
+            {
+
+                si = R->start[i] + 1;
+                ei = R->start[i + 1];
+            }
+
+            rp2[i] = 0.;
+
+            for ( k = si; k < ei; ++k )
+            {
+                rp2[i] += R->val[k] * rp[R->j[k]];
+            }
+
+            rp2[i] *= -Dinv[i];
+            rp2[i] += Dinv_b[i];
+        }
+
+        /* exchange current (rp) and next (rp2) iterate; rp3 is the temporary */
+        #pragma omp master
+        {
+            rp3 = rp;
+            rp = rp2;
+            rp2 = rp3;
+        }
+
+        /* all threads must see the exchanged pointers before the next sweep */
+        #pragma omp barrier
+
+        ++iter;
+    }
+    while ( iter < maxiter );
+
+    /* hand the last iterate back to the caller */
+    Vector_Copy( x, rp, R->n );
+}
+
+
+/* Apply the preconditioner to a vector: dispatches on control->pre_app_type
+ * (application method) and control->pre_comp_type (kind of preconditioner)
+ *
+ * workspace: data struct containing matrices, lower/upper triangular, stored in CSR
+ * control: data struct containing parameters
+ * y: constants in linear system (RHS)
+ * x: solution; left untouched if the application method is NONE_PA
+ * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner
+ *
+ * For the factorization-based preconditioners the L stage is applied first and
+ * its result, held in x, is the right-hand side of the U stage.
+ * Unsupported or unrecognized method combinations terminate the program.
+ *
+ * Assumptions:
+ *   Matrices have non-zero diagonals
+ *   Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */
+static void apply_preconditioner( const static_storage * const workspace,
+        const control_params * const control, const real * const y,
+        real * const x, const int fresh_pre )
+{
+    int i, si;
+
+    switch ( control->pre_app_type )
+    {
+    case NONE_PA:
+        break;
+    case TRI_SOLVE_PA:
+        switch ( control->pre_comp_type )
+        {
+        case DIAG_PC:
+            diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
+            break;
+        case ICHOLT_PC:
+        case ILU_PAR_PC:
+        case ILUT_PAR_PC:
+            tri_solve( workspace->L, y, x, LOWER );
+            tri_solve( workspace->U, x, x, UPPER );
+            break;
+        default:
+            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+        }
+        break;
+    case TRI_SOLVE_LEVEL_SCHED_PA:
+        switch ( control->pre_comp_type )
+        {
+        case DIAG_PC:
+            diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
+            break;
+        case ICHOLT_PC:
+        case ILU_PAR_PC:
+        case ILUT_PAR_PC:
+            tri_solve_level_sched( workspace->L, y, x, LOWER, fresh_pre );
+            tri_solve_level_sched( workspace->U, x, x, UPPER, fresh_pre );
+            break;
+        default:
+            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+        }
+        break;
+    case TRI_SOLVE_GC_PA:
+        switch ( control->pre_comp_type )
+        {
+        case DIAG_PC:
+            fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+        case ICHOLT_PC:
+        case ILU_PAR_PC:
+        case ILUT_PAR_PC:
+            /* y is const, so permute a copy of it */
+            #pragma omp master
+            {
+                memcpy( y_p, y, sizeof(real) * workspace->H->n );
+            }
 
-    for( i = 0; i < n; ++i ) {
-        si = A->start[i];
-        ei = A->start[i+1]-1;
+            #pragma omp barrier
 
-        for( k = si; k < ei; ++k ) {
-            j = A->entries[k].j;
-            H = A->entries[k].val;
-            b[j] += H * x[i]; 
-            b[i] += H * x[j];
+            permute_vector( y_p, workspace->H->n, FALSE, LOWER );
+            tri_solve_level_sched( workspace->L, y_p, x, LOWER, fresh_pre );
+            tri_solve_level_sched( workspace->U, x, x, UPPER, fresh_pre );
+            permute_vector( x, workspace->H->n, TRUE, UPPER );
+            break;
+        default:
+            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
         }
+        break;
+    case JACOBI_ITER_PA:
+        switch ( control->pre_comp_type )
+        {
+        case DIAG_PC:
+            fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+        case ICHOLT_PC:
+        case ILU_PAR_PC:
+        case ILUT_PAR_PC:
+            #pragma omp master
+            {
+                if ( Dinv_L == NULL )
+                {
+                    if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL )
+                    {
+                        fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                        exit( INSUFFICIENT_MEMORY );
+                    }
+                }
+            }
 
-        // the diagonal entry is the last one in
-        b[i] += A->entries[k].val * x[i]; 
-    }
-}
+            #pragma omp barrier
 
+            /* construct D^{-1}_L (diagonal taken from the last entry of each row of L) */
+            if ( fresh_pre == TRUE )
+            {
+                #pragma omp for schedule(static)
+                for ( i = 0; i < workspace->L->n; ++i )
+                {
+                    si = workspace->L->start[i + 1] - 1;
+                    Dinv_L[i] = 1. / workspace->L->val[si];
+                }
+            }
 
-void Forward_Subs( sparse_matrix *L, real *b, real *y )
-{
-    int i, pj, j, si, ei;
-    real val;
+            jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->pre_app_jacobi_iters );
 
-    for( i = 0; i < L->n; ++i ) {
-        y[i] = b[i];
-        si = L->start[i];
-        ei = L->start[i+1];
-        for( pj = si; pj < ei-1; ++pj ){
-            j = L->entries[pj].j;
-            val = L->entries[pj].val;
-            y[i] -= val * y[j];
-        }
-        y[i] /= L->entries[pj].val;
-    }
-}
+            #pragma omp master
+            {
+                if ( Dinv_U == NULL )
+                {
+                    if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL )
+                    {
+                        fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                        exit( INSUFFICIENT_MEMORY );
+                    }
+                }
+            }
 
+            #pragma omp barrier
 
-void Backward_Subs( sparse_matrix *U, real *y, real *x )
-{
-    int i, pj, j, si, ei;
-    real val;
+            /* construct D^{-1}_U (diagonal taken from the first entry of each row of U) */
+            if ( fresh_pre == TRUE )
+            {
+                #pragma omp for schedule(static)
+                for ( i = 0; i < workspace->U->n; ++i )
+                {
+                    si = workspace->U->start[i];
+                    Dinv_U[i] = 1. / workspace->U->val[si];
+                }
+            }
 
-    for( i = U->n-1; i >= 0; --i ) {
-        x[i] = y[i];
-        si = U->start[i];
-        ei = U->start[i+1];
-        for( pj = si+1; pj < ei; ++pj ){
-            j = U->entries[pj].j;
-            val = U->entries[pj].val;
-            x[i] -= val * x[j];
+            /* U stage: the right-hand side is the result of the L stage held in x
+             * (not the original y); passing x as both input and output is safe, as
+             * jacobi_iter caches D^{-1}*b before it writes to x */
+            jacobi_iter( workspace->U, Dinv_U, x, x, UPPER, control->pre_app_jacobi_iters );
+            break;
+        default:
+            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
         }
-        x[i] /= U->entries[si].val;
+        break;
+    default:
+        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+
     }
+
+    return;
 }
 
 
-int GMRES( static_storage *workspace, sparse_matrix *H, 
-        real *b, real tol, real *x, FILE *fout, reax_system* system)
+/* generalized minimal residual (GMRES) iterative solver for sparse linear systems */
+int GMRES( const static_storage * const workspace, const control_params * const control,
+           simulation_data * const data, const sparse_matrix * const H,
+           const real * const b, const real tol, real * const x,
+           const FILE * const fout, const int fresh_pre )
 {
-    int i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
+    int i, j, k, itr, N, g_j, g_itr;
+    real cc, tmp1, tmp2, temp, ret_temp, bnorm, time_start;
 
     N = H->n;
-    bnorm = Norm( b, N );
-
-    /* apply the diagonal pre-conditioner to rhs */
-    for( i = 0; i < N; ++i )
-        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];  
-
-    /* GMRES outer-loop */
-    for( itr = 0; itr < MAX_ITR; ++itr ) {
-        /* calculate r0 */
-        Sparse_MatVec( H, x, workspace->b_prm );      
 
-        for( i = 0; i < N; ++i )
-            workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */    
+    #pragma omp parallel default(none) private(i, j, k, itr, bnorm, ret_temp) \
+        shared(N, cc, tmp1, tmp2, temp, time_start, g_itr, g_j, stderr)
+    {
+        #pragma omp master
+        {
+            time_start = Get_Time( );
+        }
+        bnorm = Norm( b, N );
+        #pragma omp master
+        {
+            data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+        }
 
+        if ( control->pre_comp_type == DIAG_PC )
+        {
+            /* apply preconditioner to RHS */
+            #pragma omp master
+            {
+                time_start = Get_Time( );
+            }
+            apply_preconditioner( workspace, control, b, workspace->b_prc, fresh_pre );
+            #pragma omp master
+            {
+                data->timing.pre_app += Get_Timing_Info( time_start );
+            }
+        }
 
-        Vector_Sum(&workspace->v[ index_wkspace_sys (0,0,system->N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
-        workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system->N)], N );
-        Vector_Scale( &workspace->v[ index_wkspace_sys (0,0,system->N) ], 1.0/workspace->g[0], &workspace->v[index_wkspace_sys(0,0,system->N)], N );
+        /* GMRES outer-loop */
+        for ( itr = 0; itr < MAX_ITR; ++itr )
+        {
+            /* calculate r0 */
+            #pragma omp master
+            {
+                time_start = Get_Time( );
+            }
+            Sparse_MatVec( H, x, workspace->b_prm );
+            #pragma omp master
+            {
+                data->timing.solver_spmv += Get_Timing_Info( time_start );
+            }
 
-        /* GMRES inner-loop */
-        for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) {
-            /* matvec */
-            Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] );
+            if ( control->pre_comp_type == DIAG_PC )
+            {
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                apply_preconditioner( workspace, control, workspace->b_prm, workspace->b_prm, FALSE );
+                #pragma omp master
+                {
+                    data->timing.pre_app += Get_Timing_Info( time_start );
+                }
+            }
 
-            for( k = 0; k < N; ++k )  
-                workspace->v[ index_wkspace_sys (j+1,k,system->N)] *= workspace->Hdia_inv[k]; /*pre-conditioner*/ 
+            if ( control->pre_comp_type == DIAG_PC )
+            {
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                Vector_Sum( workspace->v, 1., workspace->b_prc, -1., workspace->b_prm, N );
+                #pragma omp master
+                {
+                    data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                }
+            }
+            else
+            {
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                Vector_Sum( workspace->v, 1., b, -1., workspace->b_prm, N );
+                #pragma omp master
+                {
+                    data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                }
+            }
 
-            /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for( i = 0; i <= j; i++ ) {
-                workspace->h[ index_wkspace_res (i,j) ] = Dot( &workspace->v[index_wkspace_sys(i,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
-                Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system->N)], 
-                        -workspace->h[index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system->N)], N );
+            if ( control->pre_comp_type != DIAG_PC )
+            {
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                apply_preconditioner( workspace, control, workspace->v, workspace->v,
+                        itr == 0 ? fresh_pre : FALSE );
+                #pragma omp master
+                {
+                    data->timing.pre_app += Get_Timing_Info( time_start );
+                }
             }
 
+            #pragma omp master
+            {
+                time_start = Get_Time( );
+            }
+            ret_temp = Norm( workspace->v, N );
+            #pragma omp single
+            {
+                workspace->g[0] = ret_temp;
+            }
+            Vector_Scale( workspace->v, 1. / workspace->g[0], workspace->v, N );
+            #pragma omp master
+            {
+                data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+            }
 
-            workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
-            Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system->N)], 
-                    1. / workspace->h[ index_wkspace_res (j+1,j) ], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
-            // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
+            /* GMRES inner-loop */
+            for ( j = 0; j < RESTART && FABS(workspace->g[j]) / bnorm > tol; j++ )
+            {
+                /* matvec */
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                Sparse_MatVec( H, workspace->v + j * N, workspace->v + (j + 1) * N );
+                #pragma omp master
+                {
+                    data->timing.solver_spmv += Get_Timing_Info( time_start );
+                }
 
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                apply_preconditioner( workspace, control,
+                        workspace->v + (j + 1) * N, workspace->v + (j + 1) * N, FALSE );
+                #pragma omp master
+                {
+                    data->timing.pre_app += Get_Timing_Info( time_start );
+                }
 
-            /* Givens rotations on the upper-Hessenberg matrix to make it U */
-            for( i = 0; i <= j; i++ )    {
-                if( i == j ) {
-                    cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) );
-                    workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc;
-                    workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc;
+                if ( control->pre_comp_type == DIAG_PC )
+                {
+                    /* apply modified Gram-Schmidt to orthogonalize the new residual */
+                    #pragma omp master
+                    {
+                        time_start = Get_Time( );
+                    }
+                    for ( i = 0; i <= j; i++ )
+                    {
+                        workspace->h[(RESTART + 1) * i + j] =
+                            Dot( workspace->v + i * N, workspace->v + (j + 1) * N, N );
+                        Vector_Add( workspace->v + (j + 1) * N, -workspace->h[(RESTART + 1) * i + j],
+                                workspace->v + i * N, N );
+                    }
+                    #pragma omp master
+                    {
+                        data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                    }
+                }
+                else
+                {
+                    //TODO: investigate correctness of not explicitly orthogonalizing first few vectors
+                    /* apply modified Gram-Schmidt to orthogonalize the new residual */
+                    #pragma omp master
+                    {
+                        time_start = Get_Time( );
+                        for ( i = 0; i < j - 1; i++ )
+                        {
+                            workspace->h[(RESTART + 1) * i + j] = 0;
+                        }
+                    }
+
+                    for ( i = MAX(j - 1, 0); i <= j; i++ )
+                    {
+                        ret_temp = Dot( workspace->v + i * N, workspace->v + (j + 1) * N, N );
+                        #pragma omp single
+                        {
+                            workspace->h[(RESTART + 1) * i + j] = ret_temp;
+                        }
+                        Vector_Add( workspace->v + (j + 1) * N,
+                                -workspace->h[(RESTART + 1) * i + j], workspace->v + i * N, N );
+                    }
+                    #pragma omp master
+                    {
+                        data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                    }
                 }
 
-                tmp1 =  workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + 
-                    workspace->hs[i] * workspace->h[ index_wkspace_res (i+1,j) ];
-                tmp2 = -workspace->hs[i] * workspace->h[ index_wkspace_res (i,j) ] + 
-                    workspace->hc[i] * workspace->h[ index_wkspace_res (i+1,j) ];
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                ret_temp = Norm( workspace->v + (j + 1) * N, N );
+                #pragma omp single
+                {
+                    workspace->h[(RESTART + 1) * (j + 1) + j] = ret_temp;
+                }
+                Vector_Scale( workspace->v + (j + 1) * N,
+                              1. / workspace->h[(RESTART + 1) * (j + 1) + j],
+                              workspace->v + (j + 1) * N, N );
+                #pragma omp master
+                {
+                    data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                }
+#if defined(DEBUG)
+                fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
+#endif
 
-                workspace->h[ index_wkspace_res (i,j) ] = tmp1;
-                workspace->h[ index_wkspace_res (i+1,j) ] = tmp2;
-            } 
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                    if ( control->pre_comp_type == DIAG_PC )
+                    {
+                        /* Givens rotations on the upper-Hessenberg matrix to make it U */
+                        for ( i = 0; i <= j; i++ )
+                        {
+                            if ( i == j )
+                            {
+                                cc = SQRT( SQR(workspace->h[(RESTART + 1) * j + j])
+                                        + SQR(workspace->h[(RESTART + 1) * (j + 1) + j]) );
+                                workspace->hc[j] = workspace->h[(RESTART + 1) * j + j] / cc;
+                                workspace->hs[j] = workspace->h[(RESTART + 1) * (j + 1) + j] / cc;
+                            }
+
+                            tmp1 =  workspace->hc[i] * workspace->h[(RESTART + 1) * i + j] +
+                                workspace->hs[i] * workspace->h[(RESTART + 1) * (i + 1) + j];
+                            tmp2 = -workspace->hs[i] * workspace->h[(RESTART + 1) * i + j] +
+                                workspace->hc[i] * workspace->h[(RESTART + 1) * (i + 1) + j];
+
+                            workspace->h[(RESTART + 1) * i + j] = tmp1;
+                            workspace->h[(RESTART + 1) * (i + 1) + j] = tmp2;
+                        }
+                    }
+                    else
+                    {
+                        //TODO: investigate correctness of not explicitly orthogonalizing first few vectors
+                        /* Givens rotations on the upper-Hessenberg matrix to make it U */
+                        for ( i = MAX(j - 1, 0); i <= j; i++ )
+                        {
+                            if ( i == j )
+                            {
+                                cc = SQRT( SQR(workspace->h[(RESTART + 1) * j + j])
+                                        + SQR(workspace->h[(RESTART + 1) * (j + 1) + j]) );
+                                workspace->hc[j] = workspace->h[(RESTART + 1) * j + j] / cc;
+                                workspace->hs[j] = workspace->h[(RESTART + 1) * (j + 1) + j] / cc;
+                            }
+
+                            tmp1 =  workspace->hc[i] * workspace->h[(RESTART + 1) * i + j] +
+                                    workspace->hs[i] * workspace->h[(RESTART + 1) * (i + 1) + j];
+                            tmp2 = -workspace->hs[i] * workspace->h[(RESTART + 1) * i + j] +
+                                   workspace->hc[i] * workspace->h[(RESTART + 1) * (i + 1) + j];
+
+                            workspace->h[(RESTART + 1) * i + j] = tmp1;
+                            workspace->h[(RESTART + 1) * (i + 1) + j] = tmp2;
+                        }
+                    }
+
+                    /* apply Givens rotations to the rhs as well */
+                    tmp1 =  workspace->hc[j] * workspace->g[j];
+                    tmp2 = -workspace->hs[j] * workspace->g[j];
+                    workspace->g[j] = tmp1;
+                    workspace->g[j + 1] = tmp2;
+                    data->timing.solver_orthog += Get_Timing_Info( time_start );
+                }
 
-            /* apply Givens rotations to the rhs as well */
-            tmp1 =  workspace->hc[j] * workspace->g[j];
-            tmp2 = -workspace->hs[j] * workspace->g[j];
-            workspace->g[j] = tmp1;
-            workspace->g[j+1] = tmp2;
+                #pragma omp barrier
 
-            // fprintf( stderr, "h: " );
-            // for( i = 0; i <= j+1; ++i )
-            //  fprintf( stderr, "%.6f ", workspace->h[i][j] );
-            // fprintf( stderr, "\n" );
-            //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
-        }
+                //fprintf( stderr, "h: " );
+                //for( i = 0; i <= j+1; ++i )
+                //fprintf( stderr, "%.6f ", workspace->h[i][j] );
+                //fprintf( stderr, "\n" );
+                //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
+            }
 
+            /* solve Hy = g: H is now upper-triangular, do back-substitution */
+            #pragma omp master
+            {
+                time_start = Get_Time( );
+                for ( i = j - 1; i >= 0; i-- )
+                {
+                    temp = workspace->g[i];
+                    for ( k = j - 1; k > i; k-- )
+                    {
+                        temp -= workspace->h[(RESTART + 1) * i + k] * workspace->y[k];
+                    }
 
-        /* solve Hy = g.
-           H is now upper-triangular, do back-substitution */
-        for( i = j-1; i >= 0; i-- ) {
-            temp = workspace->g[i];      
-            for( k = j-1; k > i; k-- )
-                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
+                    workspace->y[i] = temp / workspace->h[(RESTART + 1) * i + i];
+                }
+                data->timing.solver_tri_solve += Get_Timing_Info( time_start );
 
-            workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ];
-        }
+                /* update x = x_0 + Vy */
+                time_start = Get_Time( );
+            }
+            Vector_MakeZero( workspace->p, N );
+            for ( i = 0; i < j; i++ )
+            {
+                Vector_Add( workspace->p, workspace->y[i], workspace->v + i * N, N );
+            }
 
+            Vector_Add( x, 1., workspace->p, N );
+            #pragma omp master
+            {
+                data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+            }
 
-        /* update x = x_0 + Vy */
-        for( i = 0; i < j; i++ )
-            Vector_Add( x, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system->N)], N );
+            /* stopping condition */
+            if ( FABS(workspace->g[j]) / bnorm <= tol )
+            {
+                break;
+            }
+        }
 
-        /* stopping condition */
-        if( fabs(workspace->g[j]) / bnorm <= tol )
-            break;
+        #pragma omp master
+        {
+            g_itr = itr;
+            g_j = j;
+        }
     }
 
     // Sparse_MatVec( H, x, workspace->b_prm );
     // for( i = 0; i < N; ++i )
-    // workspace->b_prm[i] *= workspace->Hdia_inv[i];    
+    // workspace->b_prm[i] *= workspace->Hdia_inv[i];
     // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
     // for( i = 0; i < N; ++i )
-    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
+    // fprintf( fout, "%10.5f%15.12f%15.12f\n",
     // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
 
-    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
+    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n",
     //          itr, j, fabs( workspace->g[j] ) / bnorm );
-    // data->timing.matvec += itr * RESTART + j;
+    // data->timing.solver_iters += itr * RESTART + j;
 
-    if( itr >= MAX_ITR ) {
+    if ( g_itr >= MAX_ITR )
+    {
         fprintf( stderr, "GMRES convergence failed\n" );
         // return -1;
-        return itr * (RESTART+1) + j + 1;
+        return g_itr * (RESTART + 1) + g_j + 1;
     }
 
-    return itr * (RESTART+1) + j + 1;
+    return g_itr * (RESTART + 1) + g_j + 1;
 }
 
 
-int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H, 
-        real *b, real tol, real *x, FILE *fout, reax_system *system)
+int GMRES_HouseHolder( const static_storage * const workspace, const control_params * const control,
+                       simulation_data * const data, const sparse_matrix * const H,
+                       const real * const b, real tol, real * const x,
+                       const FILE * const fout, const int fresh_pre )
 {
     int  i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
-    real v[10000], z[RESTART+2][10000], w[RESTART+2];
-    real u[RESTART+2][10000];
+    real v[10000], z[RESTART + 2][10000], w[RESTART + 2];
+    real u[RESTART + 2][10000];
 
     N = H->n;
     bnorm = Norm( b, N );
 
     /* apply the diagonal pre-conditioner to rhs */
-    for( i = 0; i < N; ++i )
-        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];  
+    for ( i = 0; i < N; ++i )
+    {
+        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];
+    }
 
     // memset( x, 0, sizeof(real) * N );
 
     /* GMRES outer-loop */
-    for( itr = 0; itr < MAX_ITR; ++itr ) {
+    for ( itr = 0; itr < MAX_ITR; ++itr )
+    {
         /* compute z = r0 */
-        Sparse_MatVec( H, x, workspace->b_prm );      
-        for( i = 0; i < N; ++i )
+        Sparse_MatVec( H, x, workspace->b_prm );
+        for ( i = 0; i < N; ++i )
+        {
             workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */
+        }
         Vector_Sum( z[0], 1.,  workspace->b_prc, -1., workspace->b_prm, N );
 
-        Vector_MakeZero( w, RESTART+1 );
+        Vector_MakeZero( w, RESTART + 1 );
         w[0] = Norm( z[0], N );
 
         Vector_Copy( u[0], z[0], N );
         u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0];
         Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N );
 
-        w[0]    *= ( u[0][0] < 0.0 ?  1 :-1 );
+        w[0] *= ( u[0][0] < 0.0 ?  1 : -1 );
         // fprintf( stderr, "\n\n%12.6f\n", w[0] );
 
         /* GMRES inner-loop */
-        for( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ ) {
+        for ( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ )
+        {
             /* compute v_j */
             Vector_Scale( z[j], -2 * u[j][j], u[j], N );
             z[j][j] += 1.; /* due to e_j */
 
-            for( i = j-1; i >= 0; --i )
-                Vector_Add( z[j]+i, -2 * Dot( u[i]+i, z[j]+i, N-i ), u[i]+i, N-i );
-
+            for ( i = j - 1; i >= 0; --i )
+            {
+                Vector_Add( z[j] + i, -2 * Dot( u[i] + i, z[j] + i, N - i ), u[i] + i, N - i );
+            }
 
             /* matvec */
             Sparse_MatVec( H, z[j], v );
 
-            for( k = 0; k < N; ++k )
+            for ( k = 0; k < N; ++k )
+            {
                 v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */
+            }
 
-            for( i = 0; i <= j; ++i )
-                Vector_Add( v+i, -2 * Dot( u[i]+i, v+i, N-i ), u[i]+i, N-i );
-
+            for ( i = 0; i <= j; ++i )
+            {
+                Vector_Add( v + i, -2 * Dot( u[i] + i, v + i, N - i ), u[i] + i, N - i );
+            }
 
-            if( !Vector_isZero( v + (j+1), N - (j+1) ) ) {
+            if ( !Vector_isZero( v + (j + 1), N - (j + 1) ) )
+            {
                 /* compute the HouseHolder unit vector u_j+1 */
-                for( i = 0; i <= j; ++i )  
-                    u[j+1][i] = 0;
+                for ( i = 0; i <= j; ++i )
+                {
+                    u[j + 1][i] = 0;
+                }
 
-                Vector_Copy( u[j+1] + (j+1), v + (j+1), N - (j+1) );
+                Vector_Copy( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) );
 
-                u[j+1][j+1] += ( v[j+1]<0.0 ? -1:1 ) * Norm( v+(j+1), N-(j+1) );
+                u[j + 1][j + 1] += ( v[j + 1] < 0.0 ? -1 : 1 ) * Norm( v + (j + 1), N - (j + 1) );
 
-                Vector_Scale( u[j+1], 1 / Norm( u[j+1], N ), u[j+1], N );
+                Vector_Scale( u[j + 1], 1 / Norm( u[j + 1], N ), u[j + 1], N );
 
                 /* overwrite v with P_m+1 * v */
-                v[j+1] -= 2 * Dot( u[j+1]+(j+1), v+(j+1), N-(j+1) ) * u[j+1][j+1];
-                Vector_MakeZero( v + (j+2), N - (j+2) );
+                v[j + 1] -= 2 * Dot( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ) * u[j + 1][j + 1];
+                Vector_MakeZero( v + (j + 2), N - (j + 2) );
                 // Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N );
             }
 
 
             /* prev Givens rots on the upper-Hessenberg matrix to make it U */
-            for( i = 0; i < j; i++ ) {
-                tmp1 =  workspace->hc[i] * v[i] + workspace->hs[i] * v[i+1];
-                tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i+1];
+            for ( i = 0; i < j; i++ )
+            {
+                tmp1 =  workspace->hc[i] * v[i] + workspace->hs[i] * v[i + 1];
+                tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i + 1];
 
                 v[i]   = tmp1;
-                v[i+1] = tmp2;
+                v[i + 1] = tmp2;
             }
 
             /* apply the new Givens rotation to H and right-hand side */
-            if( fabs(v[j+1]) >= ALMOST_ZERO )    {
-                cc = SQRT( SQR( v[j] ) + SQR( v[j+1] ) );
+            if ( fabs(v[j + 1]) >= ALMOST_ZERO )
+            {
+                cc = SQRT( SQR( v[j] ) + SQR( v[j + 1] ) );
                 workspace->hc[j] = v[j] / cc;
-                workspace->hs[j] = v[j+1] / cc;
+                workspace->hs[j] = v[j + 1] / cc;
 
-                tmp1 =  workspace->hc[j] * v[j] + workspace->hs[j] * v[j+1];
-                tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j+1];
+                tmp1 =  workspace->hc[j] * v[j] + workspace->hs[j] * v[j + 1];
+                tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j + 1];
 
                 v[j]   = tmp1;
-                v[j+1] = tmp2;
+                v[j + 1] = tmp2;
 
                 /* Givens rotations to rhs */
                 tmp1 =  workspace->hc[j] * w[j];
                 tmp2 = -workspace->hs[j] * w[j];
                 w[j]   = tmp1;
-                w[j+1] = tmp2;
+                w[j + 1] = tmp2;
             }
 
             /* extend R */
-            for( i = 0; i <= j; ++i )
-                workspace->h[ index_wkspace_res (i,j) ] = v[i];
+            for ( i = 0; i <= j; ++i )
+            {
+                workspace->h[(RESTART + 1) * i + j] = v[i];
+            }
 
 
             // fprintf( stderr, "h:" );
@@ -326,12 +1661,15 @@ int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H,
 
         /* solve Hy = w.
            H is now upper-triangular, do back-substitution */
-        for( i = j-1; i >= 0; i-- ) {
-            temp = w[i];      
-            for( k = j-1; k > i; k-- )
-                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
+        for ( i = j - 1; i >= 0; i-- )
+        {
+            temp = w[i];
+            for ( k = j - 1; k > i; k-- )
+            {
+                temp -= workspace->h[(RESTART + 1) * i + k] * workspace->y[k];
+            }
 
-            workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ];
+            workspace->y[i] = temp / workspace->h[(RESTART + 1) * i + i];
         }
 
         // fprintf( stderr, "y: " );
@@ -345,9 +1683,9 @@ int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H,
         //   {
         //     Vector_Copy( v, z, N );
         //     v[i] += workspace->y[i];
-        //    
+        //
         //     Vector_Sum( z, 1., v, -2 * Dot( u[i], v, N ), u[i], N );
-        //   }      
+        //   }
         //
         // fprintf( stderr, "\nz: " );
         // for( k = 0; k < N; ++k )
@@ -358,16 +1696,20 @@ int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H,
         //   fprintf( stderr, "%6.2f ", x[i] );
 
         // Vector_Add( x, 1, z, N );
-        for( i = j-1; i >= 0; i-- )
+        for ( i = j - 1; i >= 0; i-- )
+        {
             Vector_Add( x, workspace->y[i], z[i], N );
+        }
 
         // fprintf( stderr, "\nx_aft: " );
         // for( i = 0; i < N; ++i )
         //   fprintf( stderr, "%6.2f ", x[i] );
 
         /* stopping condition */
-        if( fabs( w[j] ) / bnorm <= tol )
+        if ( fabs( w[j] ) / bnorm <= tol )
+        {
             break;
+        }
     }
 
     // Sparse_MatVec( H, x, workspace->b_prm );
@@ -376,152 +1718,26 @@ int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H,
 
     // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
     // for( i = 0; i < N; ++i )
-    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
+    // fprintf( fout, "%10.5f%15.12f%15.12f\n",
     // workspace->b_prc[i], workspace->b_prm[i], x[i] );
 
-    //fprintf( fout,"GMRES outer:%d, inner:%d iters - residual norm: %15.10f\n", 
+    //fprintf( fout,"GMRES outer:%d, inner:%d iters - residual norm: %15.10f\n",
     //         itr, j, fabs( workspace->g[j] ) / bnorm );
 
-    if( itr >= MAX_ITR ) {
-        fprintf( stderr, "GMRES convergence failed\n" );
-        // return -1;
-        return itr * (RESTART+1) + j + 1;
-    }
-
-    return itr * (RESTART+1) + j + 1;
-}
-
-
-int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, 
-        sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system *system )
-{
-    int i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
-
-    N = H->n;
-    bnorm = Norm( b, N );
-
-    /* GMRES outer-loop */
-    for( itr = 0; itr < MAX_ITR; ++itr )
+    if ( itr >= MAX_ITR )
     {
-        /* calculate r0 */
-        Sparse_MatVec( H, x, workspace->b_prm );      
-        Vector_Sum( &workspace->v[index_wkspace_sys(0,0,system->N)], 1., b, -1., workspace->b_prm, N );
-        Forward_Subs( L, &workspace->v[index_wkspace_sys(0,0,system->N)], &workspace->v[index_wkspace_sys(0,0,system->N)] );
-        Backward_Subs( U, &workspace->v[index_wkspace_sys(0,0,system->N)], &workspace->v[index_wkspace_sys(0,0,system->N)] );
-        workspace->g[0] = Norm( &workspace->v[index_wkspace_sys(0,0,system->N)], N );
-        Vector_Scale( &workspace->v[index_wkspace_sys(0,0,system->N)], 1. / workspace->g[0], &workspace->v[index_wkspace_sys (0,0,system->N)], N );
-        //fprintf( stderr, "res: %.15e\n", workspace->g[0] );
-
-        /* GMRES inner-loop */
-        for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ )
-        {
-            /* matvec */
-            Sparse_MatVec( H, &workspace->v[index_wkspace_sys (j,0,system->N)], &workspace->v[index_wkspace_sys (j+1,0,system->N)] );
-            Forward_Subs( L, &workspace->v[index_wkspace_sys(j+1,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] );
-            Backward_Subs( U, &workspace->v[index_wkspace_sys(j+1,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] );
-
-            /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for( i = 0; i < j-1; i++ )
-            {
-                workspace->h[ index_wkspace_res (i,j)] = 0;
-            }
-
-            //for( i = 0; i <= j; i++ ) {
-            for( i = MAX(j-1,0); i <= j; i++ ) {
-                workspace->h[index_wkspace_res (i,j)] = Dot( &workspace->v[index_wkspace_sys (i,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
-                Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system->N)],-workspace->h[ index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system->N)], N );
-            }
-
-            workspace->h[index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys (j+1,0,system->N)], N );
-            Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system->N)], 
-                    1. / workspace->h[ index_wkspace_res (j+1,j)], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
-            // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
-
-            /* Givens rotations on the upper-Hessenberg matrix to make it U */
-            for( i = MAX(j-1,0); i <= j; i++ )
-            {
-                if( i == j )
-                {
-                    cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) );
-                    workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc;
-                    workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc;
-                }
-
-                tmp1 =  workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + 
-                    workspace->hs[i] * workspace->h[index_wkspace_res (i+1,j) ];
-                tmp2 = -workspace->hs[i] * workspace->h[index_wkspace_res (i,j)] + 
-                    workspace->hc[i] * workspace->h[index_wkspace_res (i+1,j) ];
-
-                workspace->h[ index_wkspace_res (i,j) ] = tmp1;
-                workspace->h[ index_wkspace_res (i+1,j) ] = tmp2;
-            } 
-
-            /* apply Givens rotations to the rhs as well */
-            tmp1 =  workspace->hc[j] * workspace->g[j];
-            tmp2 = -workspace->hs[j] * workspace->g[j];
-            workspace->g[j] = tmp1;
-            workspace->g[j+1] = tmp2;
-
-            //fprintf( stderr, "h: " );
-            //for( i = 0; i <= j+1; ++i )
-            //fprintf( stderr, "%.6f ", workspace->h[i][j] );
-            //fprintf( stderr, "\n" );
-            //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
-        }
-
-
-        /* solve Hy = g: H is now upper-triangular, do back-substitution */
-        for( i = j-1; i >= 0; i-- )
-        {
-            temp = workspace->g[i];      
-            for( k = j-1; k > i; k-- )
-            {
-                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
-            }
-
-            workspace->y[i] = temp / workspace->h[index_wkspace_res (i,i)];
-        }
-
-        /* update x = x_0 + Vy */
-        Vector_MakeZero( workspace->p, N );
-        for( i = 0; i < j; i++ )
-            Vector_Add( workspace->p, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system->N)], N );
-        //Backward_Subs( U, workspace->p, workspace->p );
-        //Forward_Subs( L, workspace->p, workspace->p );
-        Vector_Add( x, 1., workspace->p, N );
-
-        /* stopping condition */
-        if( fabs(workspace->g[j]) / bnorm <= tol )
-        {
-            break;
-        }
-    }
-
-    // Sparse_MatVec( H, x, workspace->b_prm );
-    // for( i = 0; i < N; ++i )
-    // workspace->b_prm[i] *= workspace->Hdia_inv[i];    
-    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-    // for( i = 0; i < N; ++i )
-    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
-    // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
-
-    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
-    //          itr, j, fabs( workspace->g[j] ) / bnorm );
-    // data->timing.matvec += itr * RESTART + j;
-
-    if( itr >= MAX_ITR ) {
         fprintf( stderr, "GMRES convergence failed\n" );
         // return -1;
-        return itr * (RESTART+1) + j + 1;
+        return itr * (RESTART + 1) + j + 1;
     }
 
-    return itr * (RESTART+1) + j + 1;
+    return itr * (RESTART + 1) + j + 1;
 }
 
 
-int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, 
-        sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system )
+/* Preconditioned Conjugate Gradient */
+int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
+         sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout )
 {
     int  i, N;
     real tmp, alpha, beta, b_norm, r_norm;
@@ -537,12 +1753,12 @@ int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
     //Print_Soln( workspace, x, q, b, N );
     //fprintf( stderr, "res: %.15e\n", r_norm );
 
-    Forward_Subs( L, workspace->r, workspace->d );
-    Backward_Subs( U, workspace->d, workspace->p );
+    tri_solve( L, workspace->r, workspace->d, LOWER );
+    tri_solve( U, workspace->d, workspace->p, UPPER );
     sig_new = Dot( workspace->r, workspace->p, N );
     sig0 = sig_new;
 
-    for( i = 0; i < 200 && r_norm/b_norm > tol; ++i )
+    for ( i = 0; i < 200 && r_norm / b_norm > tol; ++i )
     {
         //for( i = 0; i < 200 && sig_new > SQR(tol) * sig0; ++i ) {
         Sparse_MatVec( A, workspace->p, workspace->q );
@@ -556,8 +1772,8 @@ int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
         r_norm = Norm(workspace->r, N);
         //fprintf( stderr, "res: %.15e\n", r_norm );
 
-        Forward_Subs( L, workspace->r, workspace->d );
-        Backward_Subs( U, workspace->d, workspace->d );
+        tri_solve( L, workspace->r, workspace->d, LOWER );
+        tri_solve( U, workspace->d, workspace->d, UPPER );
         sig_old = sig_new;
         sig_new = Dot( workspace->r, workspace->d, N );
         beta = sig_new / sig_old;
@@ -565,7 +1781,8 @@ int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
     }
 
     //fprintf( fout, "CG took %d iterations\n", i );
-    if( i >= 200 ) {
+    if ( i >= 200 )
+    {
         fprintf( stderr, "CG convergence failed!\n" );
         return i;
     }
@@ -574,8 +1791,9 @@ int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
 }
 
 
-int CG( static_storage *workspace, sparse_matrix *H, 
-        real *b, real tol, real *x, FILE *fout, reax_system *system)
+/* Conjugate Gradient */
+int CG( static_storage *workspace, sparse_matrix *H,
+        real *b, real tol, real *x, FILE *fout )
 {
     int  i, j, N;
     real tmp, alpha, beta, b_norm;
@@ -587,29 +1805,34 @@ int CG( static_storage *workspace, sparse_matrix *H,
 
     Sparse_MatVec( H, x, workspace->q );
     Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-    for( j = 0; j < N; ++j )
+    for ( j = 0; j < N; ++j )
+    {
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+    }
 
     sig_new = Dot( workspace->r, workspace->d, N );
     sig0 = sig_new;
     //Print_Soln( workspace, x, q, b, N );
-    //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n", 
+    //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n",
     // sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) );
     //fprintf( stderr, "sig_new: %f\n", sig_new );
 
-    for( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) {
+    for ( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
+    {
         //for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) {
         Sparse_MatVec( H, workspace->d, workspace->q );
         tmp = Dot( workspace->d, workspace->q, N );
         //fprintf( stderr, "tmp: %f\n", tmp );
-        alpha = sig_new / tmp;    
+        alpha = sig_new / tmp;
         Vector_Add( x, alpha, workspace->d, N );
         //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
         //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
 
-        Vector_Add( workspace->r, -alpha, workspace->q, N );    
-        for( j = 0; j < N; ++j )
+        Vector_Add( workspace->r, -alpha, workspace->q, N );
+        for ( j = 0; j < N; ++j )
+        {
             workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
 
         sig_old = sig_new;
         sig_new = Dot( workspace->r, workspace->p, N );
@@ -620,7 +1843,8 @@ int CG( static_storage *workspace, sparse_matrix *H,
 
     fprintf( stderr, "CG took %d iterations\n", i );
 
-    if( i >= 300 ) {
+    if ( i >= 300 )
+    {
         fprintf( stderr, "CG convergence failed!\n" );
         return i;
     }
@@ -630,8 +1854,8 @@ int CG( static_storage *workspace, sparse_matrix *H,
 
 
 /* Steepest Descent */
-int SDM( static_storage *workspace, sparse_matrix *H, 
-        real *b, real tol, real *x, FILE *fout )
+int SDM( static_storage *workspace, sparse_matrix *H,
+         real *b, real tol, real *x, FILE *fout )
 {
     int  i, j, N;
     real tmp, alpha, beta, b_norm;
@@ -643,23 +1867,28 @@ int SDM( static_storage *workspace, sparse_matrix *H,
 
     Sparse_MatVec( H, x, workspace->q );
     Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-    for( j = 0; j < N; ++j )
+    for ( j = 0; j < N; ++j )
+    {
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+    }
 
     sig = Dot( workspace->r, workspace->d, N );
     sig0 = sig;
 
-    for( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i ) {
+    for ( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i )
+    {
         Sparse_MatVec( H, workspace->d, workspace->q );
 
         sig = Dot( workspace->r, workspace->d, N );
         tmp = Dot( workspace->d, workspace->q, N );
-        alpha = sig / tmp;    
+        alpha = sig / tmp;
 
         Vector_Add( x, alpha, workspace->d, N );
         Vector_Add( workspace->r, -alpha, workspace->q, N );
-        for( j = 0; j < N; ++j )
+        for ( j = 0; j < N; ++j )
+        {
             workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
 
         //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
         //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
@@ -667,10 +1896,55 @@ int SDM( static_storage *workspace, sparse_matrix *H,
 
     fprintf( stderr, "SDM took %d iterations\n", i );
 
-    if( i >= 300 ) {
+    if ( i >= 300 )
+    {
         fprintf( stderr, "SDM convergence failed!\n" );
         return i;
     }
 
     return i;
 }
+
+
+/* Estimate the stability of a 2-sided preconditioning scheme based on the
+ * factorization A \approx LU: solve LUx = e with e = [1 1 ... 1]^T through
+ * 2 triangular solves (Ly = e, then Ux = y), both done in place, and return
+ * the largest magnitude entry of x, i.e., the infinity-norm of (LU)^{-1}e.
+ *
+ * L, U: lower / upper triangular factors; L->n is used as the system size
+ * returns: max_i |x_i| (0 if n == 0); exits if the work vector cannot be allocated
+ *
+ * Reference: Incomplete LU Preconditioning with the Multilevel Fast Multipole Algorithm
+ *   for Electromagnetic Scattering, SIAM J. Sci. Computing, 2007 */
+real condest( const sparse_matrix * const L, const sparse_matrix * const U )
+{
+    unsigned int i, N;
+    real *e, c;
+
+    N = L->n;
+
+    if ( (e = (real*) malloc(sizeof(real) * N)) == NULL )
+    {
+        fprintf( stderr, "Not enough memory for condest. Terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* memset fills bytes, not reals, so e = [1 ... 1]^T must be set per entry */
+    for ( i = 0; i < N; ++i )
+    {
+        e[i] = 1.0;
+    }
+
+    tri_solve( L, e, e, LOWER );
+    tri_solve( U, e, e, UPPER );
+
+    /* largest magnitude entry of the solution (infinity-norm); guard empty systems */
+    c = ( N > 0 ) ? FABS(e[0]) : 0.0;
+    for ( i = 1; i < N; ++i )
+    {
+        c = ( FABS(e[i]) > c ) ? FABS(e[i]) : c;
+    }
+
+    free( e );
+    return c;
+}
diff --git a/PuReMD-GPU/src/lin_alg.h b/PuReMD-GPU/src/lin_alg.h
index a515a959494a6eca40fe9f338d2a08118ff3e39a..317afbf94cf2a26a4f48be4a0ad9c66bcef42085 100644
--- a/PuReMD-GPU/src/lin_alg.h
+++ b/PuReMD-GPU/src/lin_alg.h
@@ -21,28 +21,31 @@
 #ifndef __LIN_ALG_H_
 #define __LIN_ALG_H_
 
-#define SIGN(x) (x < 0.0 ? -1 : 1);
-
 #include "mytypes.h"
 
 
-int GMRES( static_storage*, sparse_matrix*,
-           real*, real, real*, FILE* , reax_system* );
+void Transpose( const sparse_matrix const *, sparse_matrix const * );
+void Transpose_I( sparse_matrix * const );
 
-int GMRES_HouseHolder( static_storage*, sparse_matrix*,
-                       real*, real, real*, FILE* , reax_system*  );
+sparse_matrix * setup_graph_coloring( sparse_matrix * const );
 
-int PGMRES( static_storage*, sparse_matrix*, real*, real,
-            sparse_matrix*, sparse_matrix*, real*, FILE*, reax_system* );
+int GMRES( const static_storage * const, const control_params * const,
+        simulation_data * const, const sparse_matrix * const,
+        const real * const, const real, real * const,
+        const FILE * const, const int );
 
-int PCG( static_storage*, sparse_matrix*, real*, real,
-         sparse_matrix*, sparse_matrix*, real*, FILE*, reax_system* );
+int GMRES_HouseHolder( const static_storage * const, const control_params * const,
+        simulation_data * const, const sparse_matrix * const,
+        const real * const, const real, real * const,
+        const FILE * const, const int );
 
 int CG( static_storage*, sparse_matrix*,
-        real*, real, real*, FILE*, reax_system* );
+        real*, real, real*, FILE* );
+
+int SDM( static_storage*, sparse_matrix*,
+         real*, real, real*, FILE* );
 
-int uyduruk_GMRES( static_storage*, sparse_matrix*,
-                   real*, real, real*, int, FILE*, reax_system* );
+real condest( const sparse_matrix * const, const sparse_matrix * const );
 
 
 #endif
diff --git a/PuReMD-GPU/src/list.c b/PuReMD-GPU/src/list.c
index c6f0e55ebad4fc59c07f253a1d216d3242115aff..c52a4cc1cf2b2a8c1d32fdda71c8b0aa7808992a 100644
--- a/PuReMD-GPU/src/list.c
+++ b/PuReMD-GPU/src/list.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -21,9 +22,9 @@
 #include "list.h"
 
 
-char Make_List(int n, int num_intrs, int type, list* l)
+int Make_List( int n, int num_intrs, int type, list* l )
 {
-    char success=1;
+    int ret = SUCCESS;
 
     l->n = n;
     l->num_intrs = num_intrs;
@@ -31,116 +32,170 @@ char Make_List(int n, int num_intrs, int type, list* l)
     l->index = (int*) malloc( n * sizeof(int) );
     l->end_index = (int*) malloc( n * sizeof(int) );
 
-    if (l->index == NULL) success = 0;
-    if (l->end_index == NULL) success = 0;
+    if (l->index == NULL)
+    {
+        ret = FAILURE;
+    }
+    if (l->end_index == NULL)
+    {
+        ret = FAILURE;
+    }
 
     l->type = type;
 
-    switch(type)
+    switch (type)
     {
-        case TYP_VOID:
-            l->select.v = (void *) malloc(l->num_intrs*sizeof(void));
-            if (l->select.v == NULL) success = 0;
-            break;
-
-        case TYP_THREE_BODY:
-            l->select.three_body_list = (three_body_interaction_data*) 
-                malloc(l->num_intrs*sizeof(three_body_interaction_data));
-            if (l->select.three_body_list == NULL) success = 0;
-            break;
-
-        case TYP_BOND:
-            l->select.bond_list = (bond_data*) 
+    case TYP_VOID:
+        l->select.v = (void *) malloc(l->num_intrs * sizeof(char)); /* num_intrs raw bytes; sizeof(void) is a GNU-only extension */
+        if (l->select.v == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_THREE_BODY:
+        l->select.three_body_list = (three_body_interaction_data*)
+                malloc(l->num_intrs * sizeof(three_body_interaction_data));
+        if (l->select.three_body_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_BOND:
+        l->select.bond_list = (bond_data*)
                 malloc(l->num_intrs * sizeof(bond_data));
-            if (l->select.bond_list == NULL) success = 0;
-            break;
-
-        case TYP_DBO:
-            l->select.dbo_list = (dbond_data*) 
+        if (l->select.bond_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_DBO:
+        l->select.dbo_list = (dbond_data*)
                 malloc(l->num_intrs * sizeof(dbond_data));
-            if (l->select.dbo_list == NULL) success = 0;
-            break;
-
-        case TYP_DDELTA:
-            l->select.dDelta_list = (dDelta_data*) 
-                malloc(l->num_intrs*sizeof(dDelta_data));
-            if (l->select.dDelta_list == NULL) success = 0;
-            break;
-
-        case TYP_FAR_NEIGHBOR:
-            l->select.far_nbr_list = (far_neighbor_data*) 
-                malloc(l->num_intrs*sizeof(far_neighbor_data));
-            if (l->select.far_nbr_list == NULL) success = 0;
-            break;
-
-        case TYP_NEAR_NEIGHBOR:
-            l->select.near_nbr_list = (near_neighbor_data*) 
-                malloc(l->num_intrs*sizeof(near_neighbor_data));
-            if (l->select.near_nbr_list == NULL) success = 0;
-            break;
-
-        case TYP_HBOND:
-            l->select.hbond_list = (hbond_data*)
+        if (l->select.dbo_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_DDELTA:
+        l->select.dDelta_list = (dDelta_data*)
+                malloc(l->num_intrs * sizeof(dDelta_data));
+        if (l->select.dDelta_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_FAR_NEIGHBOR:
+        l->select.far_nbr_list = (far_neighbor_data*)
+                malloc(l->num_intrs * sizeof(far_neighbor_data));
+        if (l->select.far_nbr_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_NEAR_NEIGHBOR:
+        l->select.near_nbr_list = (near_neighbor_data*)
+                malloc(l->num_intrs * sizeof(near_neighbor_data));
+        if (l->select.near_nbr_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_HBOND:
+        l->select.hbond_list = (hbond_data*)
                 malloc( l->num_intrs * sizeof(hbond_data) );
-            if (l->select.hbond_list == NULL) success = 0;
-            break;            
-
-        default:
-            l->select.v = (void *) malloc(l->num_intrs*sizeof(void));
-            if (l->select.v == NULL) success = 0;
-            l->type = TYP_VOID;
-            break;      
+        if (l->select.hbond_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    default:
+        l->select.v = (void *) malloc(l->num_intrs * sizeof(char)); /* num_intrs raw bytes; sizeof(void) is a GNU-only extension */
+        if (l->select.v == NULL)
+        {
+            ret = FAILURE;
+        }
+        l->type = TYP_VOID; /* unrecognized types are stored as untyped lists */
+        break;
     }
 
-    return success;
+    return ret;
 }
 
 
-void Delete_List(list* l)
+void Delete_List( list* l )
 {
-    if( l->index != NULL )
+    if ( l->index != NULL )
+    {
         free(l->index);
-    if( l->end_index != NULL )
+    }
+    if ( l->end_index != NULL )
+    {
         free(l->end_index);
+    }
 
-    switch(l->type)
+    switch (l->type)
     {
-        case TYP_VOID:
-            if( l->select.v != NULL )
-                free(l->select.v);
-            break;
-        case TYP_THREE_BODY:
-            if( l->select.three_body_list != NULL )
-                free(l->select.three_body_list);
-            break;
-        case TYP_BOND:
-            if( l->select.bond_list != NULL )
-                free(l->select.bond_list);
-            break;
-        case TYP_DBO:
-            if( l->select.dbo_list != NULL )
-                free(l->select.dbo_list);
-            break;
-        case TYP_DDELTA:
-            if( l->select.dDelta_list != NULL )
-                free(l->select.dDelta_list);
-            break;
-        case TYP_FAR_NEIGHBOR:
-            if( l->select.far_nbr_list != NULL )
-                free(l->select.far_nbr_list);
-            break;
-        case TYP_NEAR_NEIGHBOR:
-            if( l->select.near_nbr_list != NULL )
-                free(l->select.near_nbr_list);
-            break;
-        case TYP_HBOND:
-            if( l->select.hbond_list != NULL )
-                free(l->select.hbond_list);
-            break;
-
-        default:
-            // Report fatal error
-            break;
+    case TYP_VOID:
+        if ( l->select.v != NULL )
+        {
+            free(l->select.v);
+        }
+        break;
+    case TYP_THREE_BODY:
+        if ( l->select.three_body_list != NULL )
+        {
+            free(l->select.three_body_list);
+        }
+        break;
+    case TYP_BOND:
+        if ( l->select.bond_list != NULL )
+        {
+            free(l->select.bond_list);
+        }
+        break;
+    case TYP_DBO:
+        if ( l->select.dbo_list != NULL )
+        {
+            free(l->select.dbo_list);
+        }
+        break;
+    case TYP_DDELTA:
+        if ( l->select.dDelta_list != NULL )
+        {
+            free(l->select.dDelta_list);
+        }
+        break;
+    case TYP_FAR_NEIGHBOR:
+        if ( l->select.far_nbr_list != NULL )
+        {
+            free(l->select.far_nbr_list);
+        }
+        break;
+    case TYP_NEAR_NEIGHBOR:
+        if ( l->select.near_nbr_list != NULL )
+        {
+            free(l->select.near_nbr_list);
+        }
+        break;
+    case TYP_HBOND:
+        if ( l->select.hbond_list != NULL )
+        {
+            free(l->select.hbond_list);
+        }
+        break;
+
+    default:
+        fprintf( stderr, "Unrecognized list type. Terminating...\n" );
+        exit( UNKNOWN_OPTION );
+        break;
     }
-}
 
+}
diff --git a/PuReMD-GPU/src/list.h b/PuReMD-GPU/src/list.h
index b90c41419271ca6b859be08ea4005fbe9107c029..5ee4544212218488e6fa84477f8a446f66e73544 100644
--- a/PuReMD-GPU/src/list.h
+++ b/PuReMD-GPU/src/list.h
@@ -24,7 +24,7 @@
 #include "mytypes.h"
 
 
-char Make_List( int, int, int, list* );
+int Make_List( int, int, int, list* );
 void Delete_List( list* );
 
 
diff --git a/PuReMD-GPU/src/lookup.c b/PuReMD-GPU/src/lookup.c
index c439709dc09c77775ed716a39db797fa8c831585..b67bf5b7b96e91562a34ab2af3bbe421a1f5c19c 100644
--- a/PuReMD-GPU/src/lookup.c
+++ b/PuReMD-GPU/src/lookup.c
@@ -1,28 +1,28 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
 #include "lookup.h"
 
-#include "two_body_interactions.h"
-
 #include "index_utils.h"
+#include "two_body_interactions.h"
 
 
 void Make_Lookup_Table(real xmin, real xmax, int n,
@@ -33,44 +33,48 @@ void Make_Lookup_Table(real xmin, real xmax, int n,
     t->xmin = xmin;
     t->xmax = xmax;
     t->n = n;
-    t->dx = (xmax - xmin)/(n-1);
+    t->dx = (xmax - xmin) / (n - 1);
     t->inv_dx = 1.0 / t->dx;
-    t->a = (n-1)/(xmax-xmin);
-    t->y = (real*) malloc(n*sizeof(real));
+    t->a = (n - 1) / (xmax - xmin);
+    t->y = (real*) malloc(n * sizeof(real));
 
-    for(i=0; i < n; i++)
-        t->y[i] = f(i*t->dx + t->xmin);
+    for (i = 0; i < n; i++)
+        t->y[i] = f(i * t->dx + t->xmin);
 
-    // //fprintf(stdout,"dx = %lf\n",t->dx);
+    // fprintf(stdout,"dx = %lf\n",t->dx);
     // for(i=0; i < n; i++)
-    //   //fprintf( stdout,"%d %lf %lf %lf\n", 
+    //   fprintf( stdout,"%d %lf %lf %lf\n",
     //            i, i/t->a+t->xmin, t->y[i], exp(i/t->a+t->xmin) );
 }
 
 
 /* Fills solution into x. Warning: will modify c and d! */
 void Tridiagonal_Solve( const real *a, const real *b,
-        real *c, real *d, real *x, unsigned int n){
+        real *c, real *d, real *x, unsigned int n)
+{
     int i;
     real id;
 
     /* Modify the coefficients. */
-    c[0] /= b[0];    /* Division by zero risk. */
-    d[0] /= b[0];    /* Division by zero would imply a singular matrix. */
-    for(i = 1; i < n; i++){
-        id = (b[i] - c[i-1] * a[i]);  /* Division by zero risk. */
-        c[i] /= id;            /* Last value calculated is redundant. */
-        d[i] = (d[i] - d[i-1] * a[i])/id;
+    c[0] /= b[0]; /* Division by zero risk. */
+    d[0] /= b[0]; /* Division by zero would imply a singular matrix. */
+    for (i = 1; i < n; i++)
+    {
+        id = (b[i] - c[i - 1] * a[i]); /* Division by zero risk. */
+        c[i] /= id;         /* Last value calculated is redundant. */
+        d[i] = (d[i] - d[i - 1] * a[i]) / id;
     }
 
     /* Now back substitute. */
     x[n - 1] = d[n - 1];
-    for(i = n - 2; i >= 0; i--)
+    for (i = n - 2; i >= 0; i--)
+    {
         x[i] = d[i] - c[i] * x[i + 1];
+    }
 }
 
 
-void Natural_Cubic_Spline( const real *h, const real *f, 
+void Natural_Cubic_Spline( const real *h, const real *f,
         cubic_spline_coef *coef, unsigned int n )
 {
     int i;
@@ -84,43 +88,53 @@ void Natural_Cubic_Spline( const real *h, const real *f,
     v = (real*) malloc( n * sizeof(real) );
 
     /* build the linear system */
-    a[0] = a[1] = a[n-1] = 0;
-    for( i = 2; i < n-1; ++i )
-        a[i] = h[i-1];
+    a[0] = a[1] = a[n - 1] = 0;
+    for ( i = 2; i < n - 1; ++i )
+    {
+        a[i] = h[i - 1];
+    }
 
-    b[0] = b[n-1] = 0;
-    for( i = 1; i < n-1; ++i )
-        b[i] = 2 * (h[i-1] + h[i]); 
+    b[0] = b[n - 1] = 0;
+    for ( i = 1; i < n - 1; ++i )
+    {
+        b[i] = 2 * (h[i - 1] + h[i]);
+    }
 
-    c[0] = c[n-2] = c[n-1] = 0;
-    for( i = 1; i < n-2; ++i )
+    c[0] = c[n - 2] = c[n - 1] = 0;
+    for ( i = 1; i < n - 2; ++i )
+    {
         c[i] = h[i];
+    }
 
-    d[0] = d[n-1] = 0;
-    for( i = 1; i < n-1; ++i )
-        d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
+    d[0] = d[n - 1] = 0;
+    for ( i = 1; i < n - 1; ++i )
+    {
+        d[i] = 6 * ((f[i + 1] - f[i]) / h[i] - (f[i] - f[i - 1]) / h[i - 1]);
+    }
 
-    /*//fprintf( stderr, "i  a        b        c        d\n" );
+    /*fprintf( stderr, "i  a        b        c        d\n" );
       for( i = 0; i < n; ++i )
-    //fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
+      fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
     v[0] = 0;
-    v[n-1] = 0;
-    Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 );
-
-    for( i = 1; i < n; ++i ){
-        coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
-        coef[i-1].c = v[i]/2;
-        coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
-        coef[i-1].a = f[i];
+    v[n - 1] = 0;
+    Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n - 2 );
+
+    for ( i = 1; i < n; ++i )
+    {
+        coef[i - 1].d = (v[i] - v[i - 1]) / (6 * h[i - 1]);
+        coef[i - 1].c = v[i] / 2;
+        coef[i - 1].b = (f[i] - f[i - 1]) / h[i - 1] + h[i - 1] * (2 * v[i] + v[i - 1]) / 6;
+        coef[i - 1].a = f[i];
     }
 
-    /*//fprintf( stderr, "i  v  coef\n" );
+    /*fprintf( stderr, "i  v  coef\n" );
       for( i = 0; i < n; ++i )
-    //fprintf( stderr, "%d  %f  %f  %f  %f  %f\n", 
-    i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
+      fprintf( stderr, "%d  %f  %f  %f  %f  %f\n",
+      i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
 }
 
 
+
 void Complete_Cubic_Spline( const real *h, const real *f, real v0, real vlast,
         cubic_spline_coef *coef, unsigned int n )
 {
@@ -136,39 +150,48 @@ void Complete_Cubic_Spline( const real *h, const real *f, real v0, real vlast,
 
     /* build the linear system */
     a[0] = 0;
-    for( i = 1; i < n; ++i )
-        a[i] = h[i-1];
+    for ( i = 1; i < n; ++i )
+    {
+        a[i] = h[i - 1];
+    }
 
-    b[0] = 2*h[0];
-    for( i = 1; i < n; ++i )
-        b[i] = 2 * (h[i-1] + h[i]); 
+    b[0] = 2 * h[0];
+    for ( i = 1; i < n; ++i )
+    {
+        b[i] = 2 * (h[i - 1] + h[i]);
+    }
 
-    c[n-1] = 0;
-    for( i = 0; i < n-1; ++i )
+    c[n - 1] = 0;
+    for ( i = 0; i < n - 1; ++i )
+    {
         c[i] = h[i];
+    }
 
-    d[0] = 6 * (f[1]-f[0])/h[0] - 6 * v0;   
-    d[n-1] = 6 * vlast - 6 * (f[n-1]-f[n-2]/h[n-2]);
-    for( i = 1; i < n-1; ++i )
-        d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
+    d[0] = 6 * (f[1] - f[0]) / h[0] - 6 * v0; /* clamped start: slope of first interval minus prescribed derivative v0 */
+    d[n - 1] = 6 * vlast - 6 * (f[n - 1] - f[n - 2]) / h[n - 2]; /* clamped end: prescribed derivative vlast minus slope of last interval */
+    for ( i = 1; i < n - 1; ++i )
+    {
+        d[i] = 6 * ((f[i + 1] - f[i]) / h[i] - (f[i] - f[i - 1]) / h[i - 1]); /* interior: difference of adjacent interval slopes */
+    }
 
-    /*//fprintf( stderr, "i  a        b        c        d\n" );
+    /*fprintf( stderr, "i  a        b        c        d\n" );
       for( i = 0; i < n; ++i )
-    //fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
+      fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
     Tridiagonal_Solve( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n );
     // Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 );
 
-    for( i = 1; i < n; ++i ){
-        coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
-        coef[i-1].c = v[i]/2;
-        coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
-        coef[i-1].a = f[i];
+    for ( i = 1; i < n; ++i )
+    {
+        coef[i - 1].d = (v[i] - v[i - 1]) / (6 * h[i - 1]);
+        coef[i - 1].c = v[i] / 2;
+        coef[i - 1].b = (f[i] - f[i - 1]) / h[i - 1] + h[i - 1] * (2 * v[i] + v[i - 1]) / 6;
+        coef[i - 1].a = f[i];
     }
 
-    /*//fprintf( stderr, "i  v  coef\n" );
+    /*fprintf( stderr, "i  v  coef\n" );
       for( i = 0; i < n; ++i )
-    //fprintf( stderr, "%d  %f  %f  %f  %f  %f\n", 
-    i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
+      fprintf( stderr, "%d  %f  %f  %f  %f  %f\n",
+      i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
 }
 
 
@@ -178,21 +201,24 @@ void LR_Lookup( LR_lookup_table *t, real r, LR_data *y )
     real base, dif;
 
     i = (int)(r * t->inv_dx);
-    if( i == 0 )  ++i;
-    base = (real)(i+1) * t->dx;
+    if ( i == 0 ) /* Make_LR_Lookup_Table stores spline coefficients from entry 1 on, so entry 0 is skipped */
+    {
+        ++i;
+    }
+    base = (real)(i + 1) * t->dx; /* expansion point: the spline routines store coef[i].a = tabulated value at (i + 1) * dx */
     dif = r - base;
-    ////fprintf( stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif );
+    //fprintf( stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif );
 
-    y->e_vdW = ((t->vdW[i].d*dif + t->vdW[i].c)*dif + t->vdW[i].b)*dif + 
-        t->vdW[i].a;
-    y->CEvd = ((t->CEvd[i].d*dif + t->CEvd[i].c)*dif + 
-            t->CEvd[i].b)*dif + t->CEvd[i].a;
+    y->e_vdW = ((t->vdW[i].d * dif + t->vdW[i].c) * dif + t->vdW[i].b) * dif +
+               t->vdW[i].a;
+    y->CEvd = ((t->CEvd[i].d * dif + t->CEvd[i].c) * dif +
+               t->CEvd[i].b) * dif + t->CEvd[i].a;
     //y->CEvd = (3*t->vdW[i].d*dif + 2*t->vdW[i].c)*dif + t->vdW[i].b;
 
-    y->e_ele = ((t->ele[i].d*dif + t->ele[i].c)*dif + t->ele[i].b)*dif + 
-        t->ele[i].a;
-    y->CEclmb = ((t->CEclmb[i].d*dif + t->CEclmb[i].c)*dif + t->CEclmb[i].b)*dif +
-        t->CEclmb[i].a;
+    y->e_ele = ((t->ele[i].d * dif + t->ele[i].c) * dif + t->ele[i].b) * dif +
+               t->ele[i].a;
+    y->CEclmb = ((t->CEclmb[i].d * dif + t->CEclmb[i].c) * dif + t->CEclmb[i].b) * dif +
+                t->CEclmb[i].a;
 
     y->H = y->e_ele * EV_to_KCALpMOL / C_ele;
     //y->H = ((t->H[i].d*dif + t->H[i].c)*dif + t->H[i].b)*dif + t->H[i].a;
@@ -221,147 +247,158 @@ void Make_LR_Lookup_Table( reax_system *system, control_params *control )
 
     num_atom_types = system->reaxprm.num_atom_types;
     dr = control->r_cut / control->tabulate;
-    h = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fh = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fvdw = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fCEvd = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fele = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fCEclmb = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-
-    /* allocate Long-Range LookUp Table space based on 
+    h = (real*) malloc( (control->tabulate + 1) * sizeof(real) );
+    fh = (real*) malloc( (control->tabulate + 1) * sizeof(real) );
+    fvdw = (real*) malloc( (control->tabulate + 1) * sizeof(real) );
+    fCEvd = (real*) malloc( (control->tabulate + 1) * sizeof(real) );
+    fele = (real*) malloc( (control->tabulate + 1) * sizeof(real) );
+    fCEclmb = (real*) malloc( (control->tabulate + 1) * sizeof(real) );
+
+    /* allocate Long-Range LookUp Table space based on
        number of atom types in the ffield file */
-    //LR = (LR_lookup_table**) malloc( num_atom_types * sizeof(LR_lookup_table*) );
-    //for( i = 0; i < num_atom_types; ++i )
-    // LR[i] = (LR_lookup_table*) malloc(num_atom_types * sizeof(LR_lookup_table));
-
-    LR = (LR_lookup_table*) malloc(num_atom_types * num_atom_types * sizeof(LR_lookup_table));
+    LR = (LR_lookup_table*) malloc( num_atom_types * num_atom_types * sizeof(LR_lookup_table) );
 
     /* most atom types in ffield file will not exist in the current
        simulation. to avoid unnecessary lookup table space, determine
        the atom types that exist in the current simulation */
-    for( i = 0; i < MAX_ATOM_TYPES; ++i )
+    for ( i = 0; i < MAX_ATOM_TYPES; ++i )
+    {
         existing_types[i] = 0;
-    for( i = 0; i < system->N; ++i )
+    }
+    for ( i = 0; i < system->N; ++i )
+    {
         existing_types[ system->atoms[i].type ] = 1;
+    }
 
     /* fill in the lookup table entries for existing atom types.
        only lower half should be enough. */
-    for( i = 0; i < num_atom_types; ++i )
-        if( existing_types[i] )
-            for( j = i; j < num_atom_types; ++j )
-                if( existing_types[j] ) {
-                    LR[ index_lr (i,j,num_atom_types) ].xmin = 0;
-                    LR[ index_lr (i,j,num_atom_types) ].xmax = control->r_cut;
-                    LR[ index_lr (i,j,num_atom_types) ].n = control->tabulate + 1;
-                    LR[ index_lr (i,j,num_atom_types) ].dx = dr;
-                    LR[ index_lr (i,j,num_atom_types) ].inv_dx = control->tabulate / control->r_cut;
-                    LR[ index_lr (i,j,num_atom_types) ].y = (LR_data*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(LR_data));
-                    LR[ index_lr (i,j,num_atom_types) ].H = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-                    LR[ index_lr (i,j,num_atom_types) ].vdW = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-                    LR[ index_lr (i,j,num_atom_types) ].CEvd = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-                    LR[ index_lr (i,j,num_atom_types) ].ele = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-                    LR[ index_lr (i,j,num_atom_types) ].CEclmb = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-
-                    for( r = 1; r <= control->tabulate; ++r ) {
-                        LR_vdW_Coulomb( system, control, i, j, r * dr, &(LR[ index_lr (i,j,num_atom_types) ].y[r]) );
-                        h[r] = LR[ index_lr (i,j,num_atom_types) ].dx;
-                        fh[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].H;
-                        fvdw[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_vdW;
-                        fCEvd[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
-                        fele[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_ele;
-                        fCEclmb[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
-
-                        if( r == 1 ){
-                            v0_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
-                            v0_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
+    for ( i = 0; i < num_atom_types; ++i )
+    {
+        if ( existing_types[i] )
+        {
+            for ( j = i; j < num_atom_types; ++j )
+            {
+                if ( existing_types[j] )
+                {
+                    LR[ index_lr(i,j,num_atom_types) ].xmin = 0;
+                    LR[ index_lr(i,j,num_atom_types) ].xmax = control->r_cut;
+                    LR[ index_lr(i,j,num_atom_types) ].n = control->tabulate + 1;
+                    LR[ index_lr(i,j,num_atom_types) ].dx = dr;
+                    LR[ index_lr(i,j,num_atom_types) ].inv_dx = control->tabulate / control->r_cut;
+                    LR[ index_lr(i,j,num_atom_types) ].y = (LR_data*)
+                            malloc( LR[index_lr(i,j,num_atom_types)].n * sizeof(LR_data) );
+                    LR[ index_lr(i,j,num_atom_types) ].H = (cubic_spline_coef*)
+                            malloc( LR[index_lr(i,j,num_atom_types)].n * sizeof(cubic_spline_coef) );
+                    LR[ index_lr(i,j,num_atom_types) ].vdW = (cubic_spline_coef*)
+                            malloc( LR[index_lr(i,j,num_atom_types)].n * sizeof(cubic_spline_coef) );
+                    LR[ index_lr(i,j,num_atom_types) ].CEvd = (cubic_spline_coef*)
+                            malloc( LR[index_lr(i,j,num_atom_types)].n * sizeof(cubic_spline_coef) );
+                    LR[ index_lr(i,j,num_atom_types) ].ele = (cubic_spline_coef*)
+                            malloc( LR[index_lr(i,j,num_atom_types)].n * sizeof(cubic_spline_coef) );
+                    LR[ index_lr(i,j,num_atom_types) ].CEclmb = (cubic_spline_coef*)
+                            malloc( LR[index_lr(i,j,num_atom_types)].n * sizeof(cubic_spline_coef) );
+
+                    for ( r = 1; r <= control->tabulate; ++r )
+                    {
+                        LR_vdW_Coulomb( system, control, i, j, r * dr,
+                                &(LR[ index_lr(i,j,num_atom_types) ].y[r]) );
+                        h[r] = LR[ index_lr(i,j,num_atom_types) ].dx;
+                        fh[r] = LR[ index_lr(i,j,num_atom_types) ].y[r].H;
+                        fvdw[r] = LR[ index_lr(i,j,num_atom_types) ].y[r].e_vdW;
+                        fCEvd[r] = LR[ index_lr(i,j,num_atom_types) ].y[r].CEvd;
+                        fele[r] = LR[ index_lr(i,j,num_atom_types) ].y[r].e_ele;
+                        fCEclmb[r] = LR[ index_lr(i,j,num_atom_types) ].y[r].CEclmb;
+
+                        if ( r == 1 )
+                        {
+                            v0_vdw = LR[ index_lr(i,j,num_atom_types) ].y[r].CEvd;
+                            v0_ele = LR[ index_lr(i,j,num_atom_types) ].y[r].CEclmb;
                         }
-                        else if( r == control->tabulate ){
-                            vlast_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
-                            vlast_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
+                        else if ( r == control->tabulate )
+                        {
+                            vlast_vdw = LR[ index_lr(i,j,num_atom_types) ].y[r].CEvd;
+                            vlast_ele = LR[ index_lr(i,j,num_atom_types) ].y[r].CEclmb;
                         }
                     }
 
-                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fh" );
+                    /*fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fh" );
                       for( r = 1; r <= control->tabulate; ++r )
-                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fh[r] ); */
-                    Natural_Cubic_Spline( &h[1], &fh[1], 
-                            &(LR[ index_lr (i,j,num_atom_types) ].H[1]), control->tabulate+1 );
+                      fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fh[r] ); */
+                    Natural_Cubic_Spline( &h[1], &fh[1],
+                            &(LR[ index_lr(i,j,num_atom_types) ].H[1]), control->tabulate + 1 );
 
-                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fvdw" );
+                    /*fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fvdw" );
                       for( r = 1; r <= control->tabulate; ++r )
-                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fvdw[r] );
-                    //fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw );
-                     */
-                    Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw, 
-                            &(LR[ index_lr (i,j,num_atom_types) ].vdW[1]), control->tabulate+1 );
-                    Natural_Cubic_Spline( &h[1], &fCEvd[1], 
-                            &(LR[ index_lr (i,j,num_atom_types) ].CEvd[1]), control->tabulate+1 );
-
-                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fele" );
+                      fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fvdw[r] );
+                      fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw );
+                    */
+                    Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw,
+                            &(LR[ index_lr(i,j,num_atom_types) ].vdW[1]), control->tabulate + 1 );
+                    Natural_Cubic_Spline( &h[1], &fCEvd[1],
+                            &(LR[ index_lr(i,j,num_atom_types) ].CEvd[1]), control->tabulate + 1 );
+
+                    /*fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fele" );
                       for( r = 1; r <= control->tabulate; ++r )
-                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fele[r] );
-                    //fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele );
-                     */
-                    Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele, 
-                            &(LR[ index_lr (i,j,num_atom_types) ].ele[1]), control->tabulate+1 );
-                    Natural_Cubic_Spline( &h[1], &fCEclmb[1], 
-                            &(LR[ index_lr (i,j,num_atom_types) ].CEclmb[1]), control->tabulate+1 );
+                      fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fele[r] );
+                      fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele );
+                    */
+                    Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele,
+                            &(LR[ index_lr(i,j,num_atom_types) ].ele[1]), control->tabulate + 1 );
+                    Natural_Cubic_Spline( &h[1], &fCEclmb[1],
+                            &(LR[ index_lr(i,j,num_atom_types) ].CEclmb[1]), control->tabulate + 1 );
                 }
+            }
+        }
+    }
 
     /***** //test LR-Lookup table
-      evdw_maxerr = 0;
-      eele_maxerr = 0;
-      for( i = 0; i < num_atom_types; ++i )
-      if( existing_types[i] )
-      for( j = i; j < num_atom_types; ++j )
-      if( existing_types[j] ) {
-      for( r = 1; r <= 100; ++r ) {
-      rand_dist = (real)rand()/RAND_MAX * control->r_cut;
-      LR_vdW_Coulomb( system, control, i, j, rand_dist, &y );
-      LR_Lookup( &(LR[i][j]), rand_dist, &y_spline );
-
-      evdw_abserr = fabs(y.e_vdW - y_spline.e_vdW);
-      evdw_relerr = fabs(evdw_abserr / y.e_vdW);
-      fvdw_abserr = fabs(y.CEvd - y_spline.CEvd);
-      fvdw_relerr = fabs(fvdw_abserr / y.CEvd);
-      eele_abserr = fabs(y.e_ele - y_spline.e_ele);
-      eele_relerr = fabs(eele_abserr / y.e_ele);
-      fele_abserr = fabs(y.CEclmb - y_spline.CEclmb);
-      fele_relerr = fabs(fele_abserr / y.CEclmb);
-
-      if( evdw_relerr > 1e-10 || eele_relerr > 1e-10 ){
-    //fprintf( stderr, "rand_dist = %24.15e\n", rand_dist );
-    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-    y.H, y_spline.H, 
-    fabs(y.H-y_spline.H), fabs((y.H-y_spline.H)/y.H) );  
-    
-    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-    y.e_vdW, y_spline.e_vdW, evdw_abserr, evdw_relerr ); 
-    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-    y.CEvd, y_spline.CEvd, fvdw_abserr, fvdw_relerr ); 
-    
-    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-    y.e_ele, y_spline.e_ele, eele_abserr, eele_relerr ); 
-    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-    y.CEclmb, y_spline.CEclmb, fele_abserr, fele_relerr ); 
-    }
-    
-    if( evdw_relerr > evdw_maxerr )
-    evdw_maxerr = evdw_relerr;
-    if( eele_relerr > eele_maxerr )
-    eele_maxerr = eele_relerr;
-    }
-    }
-    //fprintf( stderr, "evdw_maxerr: %24.15e\n", evdw_maxerr );
-    //fprintf( stderr, "eele_maxerr: %24.15e\n", eele_maxerr );
-         *******/
-    
+     evdw_maxerr = 0;
+     eele_maxerr = 0;
+     for( i = 0; i < num_atom_types; ++i )
+     if( existing_types[i] )
+     for( j = i; j < num_atom_types; ++j )
+     if( existing_types[j] ) {
+     for( r = 1; r <= 100; ++r ) {
+     rand_dist = (real)rand()/RAND_MAX * control->r_cut;
+     LR_vdW_Coulomb( system, control, i, j, rand_dist, &y );
+     LR_Lookup( &(LR[i][j]), rand_dist, &y_spline );
+
+     evdw_abserr = fabs(y.e_vdW - y_spline.e_vdW);
+     evdw_relerr = fabs(evdw_abserr / y.e_vdW);
+     fvdw_abserr = fabs(y.CEvd - y_spline.CEvd);
+     fvdw_relerr = fabs(fvdw_abserr / y.CEvd);
+     eele_abserr = fabs(y.e_ele - y_spline.e_ele);
+     eele_relerr = fabs(eele_abserr / y.e_ele);
+     fele_abserr = fabs(y.CEclmb - y_spline.CEclmb);
+     fele_relerr = fabs(fele_abserr / y.CEclmb);
+
+     if( evdw_relerr > 1e-10 || eele_relerr > 1e-10 ){
+     fprintf( stderr, "rand_dist = %24.15e\n", rand_dist );
+     fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+     y.H, y_spline.H,
+     fabs(y.H-y_spline.H), fabs((y.H-y_spline.H)/y.H) );
+
+     fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+     y.e_vdW, y_spline.e_vdW, evdw_abserr, evdw_relerr );
+     fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+     y.CEvd, y_spline.CEvd, fvdw_abserr, fvdw_relerr );
+
+     fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+     y.e_ele, y_spline.e_ele, eele_abserr, eele_relerr );
+     fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+             y.CEclmb, y_spline.CEclmb, fele_abserr, fele_relerr );
+             }
+
+             if( evdw_relerr > evdw_maxerr )
+             evdw_maxerr = evdw_relerr;
+             if( eele_relerr > eele_maxerr )
+             eele_maxerr = eele_relerr;
+             }
+             }
+             fprintf( stderr, "evdw_maxerr: %24.15e\n", evdw_maxerr );
+             fprintf( stderr, "eele_maxerr: %24.15e\n", eele_maxerr );
+    *******/
+
     free(h);
     free(fh);
     free(fvdw);
@@ -383,24 +420,26 @@ real Lookup( real x, lookup_table* t )
     real b;
     int i;
 
-    /* if ( x < t->xmin) 
-       {
-    //fprintf(stderr,"Domain check %lf > %lf\n",t->xmin,x);
-    exit(0);
+    /*
+    if ( x < t->xmin)
+    {
+       fprintf(stderr,"Domain check %lf > %lf\n",t->xmin,x);
+       exit(0);
     }
-    if ( x > t->xmax) 
+    if ( x > t->xmax)
     {
-    //fprintf(stderr,"Domain check %lf < %lf\n",t->xmax,x);
-    exit(0);
-    } */
+       fprintf(stderr,"Domain check %lf < %lf\n",t->xmax,x);
+       exit(0);
+    }
+    */
 
     i = Lookup_Index_Of( x, t );
     x1 = i * t->dx + t->xmin;
-    x2 = (i+1) * t->dx + t->xmin;
+    x2 = (i + 1) * t->dx + t->xmin;
 
-    b = ( x2 * t->y[i] - x1 * t->y[i+1] ) * t->inv_dx;
-    // //fprintf( stdout,"SLookup_Entry: %d, %lf, %lf, %lf, %lf: %lf, %lf\n",
+    b = ( x2 * t->y[i] - x1 * t->y[i + 1] ) * t->inv_dx;
+    // fprintf( stdout,"SLookup_Entry: %d, %lf, %lf, %lf, %lf: %lf, %lf\n",
     //          i,x1,x2,x,b,t->one_over_dx*(t->y[i+1]-t->y[i])*x+b,exp(x));
 
-    return t->inv_dx * ( t->y[i+1] - t->y[i] ) * x + b;
+    return t->inv_dx * ( t->y[i + 1] - t->y[i] ) * x + b;
 }
diff --git a/PuReMD-GPU/src/mytypes.h b/PuReMD-GPU/src/mytypes.h
index 0eb1856a578df369aba28d8dbfb55dcf57348e9d..b04a9de39c8dbf487cac6d02a5e098a220862d5a 100644
--- a/PuReMD-GPU/src/mytypes.h
+++ b/PuReMD-GPU/src/mytypes.h
@@ -28,12 +28,6 @@
     #define GLOBAL __global__
     #define HOST_DEVICE __host__ __device__
 
-    #include <cuda_runtime.h>
-    #include <cuda.h>
-    #include <cuda_runtime_api.h>
-
-    #include <cublas_v2.h>
-    #include <cusparse_v2.h>
     #if __CUDA_ARCH__ < 600
       #define MYATOMICADD myAtomicAdd
     #else
@@ -55,14 +49,25 @@
   #include "config.h"
 #endif
 
-#include "math.h"
-//#include "random.h"
-#include "stdio.h"
-#include "stdlib.h"
-#include "string.h"
-#include "sys/time.h"
-#include "time.h"
-#include "zlib.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <time.h>
+
+#ifdef _OPENMP
+  #include <omp.h>
+#endif
+
+#ifdef HAVE_CUDA
+  #include <cuda_runtime.h>
+  #include <cuda.h>
+  #include <cuda_runtime_api.h>
+
+  #include <cublas_v2.h>
+  #include <cusparse_v2.h>
+#endif
 
 //#define DEBUG_FOCUS
 //#define TEST_FORCES
@@ -75,6 +80,7 @@
 #define TRUE  1
 #define FALSE 0
 
+#define LOG    log
 #define EXP    exp
 #define SQRT   sqrt
 #define POW    pow
@@ -82,6 +88,8 @@
 #define COS    cos
 #define SIN    sin
 #define TAN    tan
+#define FABS   fabs
+#define FMOD   fmod
 
 #define SQR(x)        ((x)*(x))
 #define CUBE(x)       ((x)*(x)*(x))
@@ -90,6 +98,15 @@
 #define MAX( x, y )   (((x) > (y)) ? (x) : (y))
 #define MIN( x, y )   (((x) < (y)) ? (x) : (y))
 
+/* NaN test for values of type real, built on the C99 NAN / isnan() macros from math.h.
+ * Note: function choice must match REAL typedef below; the fallback always yields 0. */
+#ifdef NAN
+  #define IS_NAN_REAL(a) (isnan(a))
+#else
+  #warning "No support for NaN"
+  #define IS_NAN_REAL(a) (0)
+#endif
+
 #define PI            3.14159265
 #define C_ele          332.06371
 //#define K_B         503.398008   // kcal/mol/K
@@ -106,7 +123,11 @@
 #define AVOGNR          6.0221367e23
 #define P_CONV          1.0e-24 * AVOGNR * JOULES_to_CAL
 
-#define MAX_STR             100      // MAX STRing length (used for naming)
+#define MAX_STR             1024
+#define MAX_LINE            1024
+#define MAX_TOKENS          1024
+#define MAX_TOKEN_LEN       1024
+
 #define MAX_ATOM_ID         100000
 #define MAX_RESTRICT        15
 #define MAX_MOLECULE_SIZE   20
@@ -124,23 +145,7 @@
 #define MAX_ITR             10
 #define RESTART             50
 
-#define FILE_NOT_FOUND_ERR    10
-#define UNKNOWN_ATOM_TYPE_ERR 11
-#define CANNOT_OPEN_OUTFILE   12
-#define INIT_ERR              13
-#define INSUFFICIENT_SPACE    14
-#define UNKNOWN_OPTION        15
-#define INVALID_INPUT         16
-
-#define C_ATOM  0
-#define H_ATOM  1
-#define O_ATOM  2
-#define N_ATOM  3
-#define S_ATOM  4
-#define SI_ATOM 5
-#define GE_ATOM 6
-#define X_ATOM  7
-
+/* tolerance used for validating GPU results against host */
 #define GPU_TOLERANCE   1e-5
 
 #define ZERO           0.000000000000000e+00
@@ -157,6 +162,7 @@
 #define DANGER_ZONE 0.95
 #define LOOSE_ZONE  0.75
 
+//TODO: make enum
 #define RES_GRID_ATOMS      0x01
 #define RES_GRID_TOP        0x02
 #define RES_GRID_MARK       0x03
@@ -165,17 +171,21 @@
 #define RES_GRID_NBRS       0x06
 #define RES_GRID_NBRS_CP    0x07
 
+//TODO: make enum
 #define RES_SYSTEM_ATOMS            0x10
 #define RES_SYSTEM_SIMULATION_BOX   0x11
 
+//TODO: make enum
 #define RES_REAX_INT_SBP    0x20
 #define RES_REAX_INT_TBP    0x21
 #define RES_REAX_INT_THBP   0x22
 #define RES_REAX_INT_HBP    0x23
 #define RES_REAX_INT_FBP    0x24
 
+//TODO: make enum
 #define RES_SIMULATION_DATA 0x30
 
+//TODO: make enum
 #define RES_STORAGE                    0x401
 #define RES_STORAGE_HBOND_INDEX        0x402
 #define RES_STORAGE_TOTAL_BOND_ORDER   0x403
@@ -229,13 +239,17 @@
 #define RES_STORAGE_RESTRICTED_LIST    0x432
 #define RES_STORAGE_ORIG_ID                0x433
 
+//TODO: make enum
 #define RES_CONTROL_PARAMS  0x50
 
+//TODO: make enum
 #define RES_GLOBAL_PARAMS       0x60
 
+//TODO: make enum
 #define RES_SPARSE_MATRIX_INDEX     0x70
 #define RES_SPARSE_MATRIX_ENTRY     0x71
 
+//TODO: make enum
 #define RES_LR_LOOKUP_Y             0x80
 #define RES_LR_LOOKUP_H             0x81
 #define RES_LR_LOOKUP_VDW               0x82
@@ -244,6 +258,7 @@
 #define RES_LR_LOOKUP_CECLMB            0x85
 #define RES_LR_LOOKUP_TABLE         0x86
 
+//TODO: make enum
 #define RES_SCRATCH                     0x90
 
 #define LIST_INDEX                      0x00
@@ -314,17 +329,78 @@ typedef real rvec[3];
 typedef int  ivec[3];
 typedef real rtensor[3][3];
 
-enum {NVE, NVT, NPT, sNPT, iNPT, ensNR, bNVT};
-enum {FAR_NBRS, NEAR_NBRS, THREE_BODIES, BONDS, OLD_BONDS,
-      HBONDS, DBO, DDELTA, LIST_N
-     };
-enum {TYP_VOID, TYP_THREE_BODY, TYP_BOND, TYP_HBOND, TYP_DBO,
-      TYP_DDELTA, TYP_FAR_NEIGHBOR, TYP_NEAR_NEIGHBOR, TYP_N
-     };
-enum {UNKNOWN, WATER};
-enum {NO_ANALYSIS, FRAGMENTS, REACTIONS, NUM_ANALYSIS};
-enum {WRITE_ASCII, WRITE_BINARY, RF_N};
-enum {XYZ, PDB, BGF, ASCII_RESTART, BINARY_RESTART, GF_N};
+/* config params: named constants for integer-valued settings and table indices */
+enum ensemble /* compared against control->ensemble; NPT/sNPT/iNPT match the 2/3/4 numbering documented on control_params */
+{
+    NVE = 0, NVT = 1, NPT = 2, sNPT = 3, iNPT = 4, ensNR = 5, bNVT = 6,
+};
+
+enum interaction_list_offets /* offsets into the interaction list array, e.g. (*lists) + FAR_NBRS; NOTE(review): tag is misspelled ("offets") */
+{
+    FAR_NBRS = 0, NEAR_NBRS = 1, THREE_BODIES = 2, BONDS = 3, OLD_BONDS = 4,
+    HBONDS = 5, DBO = 6, DDELTA = 7, LIST_N = 8,
+};
+
+enum interaction_type /* presumably tags the kind of entry held by a list -- TODO confirm against list.c */
+{
+    TYP_VOID = 0, TYP_THREE_BODY = 1, TYP_BOND = 2, TYP_HBOND = 3, TYP_DBO = 4,
+    TYP_DDELTA = 5, TYP_FAR_NEIGHBOR = 6, TYP_NEAR_NEIGHBOR = 7, TYP_N = 8,
+};
+
+enum errors /* error codes handed to exit(), e.g. exit( INSUFFICIENT_MEMORY ); NOTE(review): values are negative, a POSIX exit status keeps only the low 8 bits */
+{
+    FILE_NOT_FOUND = -10,
+    UNKNOWN_ATOM_TYPE = -11,
+    CANNOT_OPEN_FILE = -12,
+    CANNOT_INITIALIZE = -13,
+    INSUFFICIENT_MEMORY = -14,
+    UNKNOWN_OPTION = -15,
+    INVALID_INPUT = -16,
+    INVALID_GEO = -17,
+    NUMERIC_BREAKDOWN = -18,
+    RUNTIME_ERROR = -19,
+};
+
+enum atoms /* presumably indices of known element types -- TODO confirm where these are consumed */
+{
+    C_ATOM = 0, H_ATOM = 1, O_ATOM = 2, N_ATOM = 3,
+    S_ATOM = 4, SI_ATOM = 5, GE_ATOM = 6, X_ATOM = 7,
+};
+
+enum molecule_type
+{
+    UNKNOWN = 0, WATER = 1,
+};
+
+enum molecular_analysis_type /* presumably the values of control_params.molec_anal -- TODO confirm */
+{
+    NO_ANALYSIS = 0, FRAGMENTS = 1, REACTIONS = 2, NUM_ANALYSIS = 3,
+};
+
+enum restart_format
+{
+    WRITE_ASCII = 0, WRITE_BINARY = 1, RF_N = 2,
+};
+
+enum geo_formats /* geometry input formats; GF_N looks like the number of formats -- TODO confirm */
+{
+    CUSTOM = 0, PDB = 1, BGF = 2, ASCII_RESTART = 3, BINARY_RESTART = 4, GF_N = 5,
+};
+
+enum solver /* presumably the values of control_params.qeq_solver_type -- TODO confirm */
+{
+    GMRES_S = 0, GMRES_H_S = 1, CG_S = 2, SDM_S = 3,
+};
+
+enum pre_comp /* presumably the values of control_params.pre_comp_type -- TODO confirm */
+{
+    DIAG_PC = 0, ICHOLT_PC = 1, ILU_PAR_PC = 2, ILUT_PAR_PC = 3, ILU_SUPERLU_MT_PC = 4,
+};
+
+enum pre_app /* presumably the values of control_params.pre_app_type -- TODO confirm */
+{
+    NONE_PA = 0, TRI_SOLVE_PA = 1, TRI_SOLVE_LEVEL_SCHED_PA = 2, TRI_SOLVE_GC_PA = 3, JACOBI_ITER_PA = 4,
+};
 
 
 /* Global params mapping */
@@ -502,33 +578,36 @@ typedef struct
 {
     int num_atom_types;
     global_parameters gp;
-    global_parameters d_gp;
-
     single_body_parameters *sbp;
-    single_body_parameters *d_sbp;
-
     two_body_parameters *tbp;
-    two_body_parameters *d_tbp;
-
     three_body_header *thbp;
-    three_body_header *d_thbp;
-
     hbond_parameters *hbp;
-    hbond_parameters *d_hbp;
-
     four_body_header *fbp;
-    four_body_header *d_fbp;
 
+#ifdef HAVE_CUDA
+    global_parameters d_gp;
+    single_body_parameters *d_sbp;
+    two_body_parameters *d_tbp;
+    three_body_header *d_thbp;
+    hbond_parameters *d_hbp;
+    four_body_header *d_fbp;
+#endif
 } reax_interaction;
 
 
 typedef struct
 {
-    rvec x;        /* Position, velocity, force on atom */
+    /* Position, velocity, force on atom */
+    rvec x;
     rvec v;
     rvec f;
-    real q;              /* Charge on the atom */
-    int  type;           /* Type of this atom */
+
+    /* Charge on the atom */
+    real q;
+
+    /* Type of this atom */
+    int type;
+
     char name[5];
     char spare[7];
 } reax_atom;
@@ -561,9 +640,6 @@ typedef struct
     rvec len;
     rvec inv_len;
 
-    //CUDA
-    int    max_cuda_nbrs; //TODO remove this not used anymore
-
     int   *atoms;
     int   *top;
     int   *mark;
@@ -578,7 +654,16 @@ typedef struct
 {
     int N;
 
-    //CUDA
+    reax_atom *atoms;
+    reax_interaction reaxprm;
+    simulation_box box;
+    grid g;
+
+#ifdef HAVE_CUDA
+    reax_atom *d_atoms;
+    simulation_box *d_box;
+    grid d_g;
+
     //int max_thb_intrs;
     int max_sparse_matrix_entries;
     int num_nbrs;
@@ -586,17 +671,7 @@ typedef struct
     int num_hbonds;
     int num_thbodies;
     int init_thblist;
-
-    reax_atom *atoms;
-    reax_atom *d_atoms;
-
-    reax_interaction reaxprm;
-
-    simulation_box box;
-    simulation_box *d_box;
-
-    grid g;
-    grid d_g;
+#endif
 } reax_system;
 
 
@@ -616,23 +691,22 @@ typedef struct
        2 : NPT  (Parrinello-Rehman-Nose-Hoover) Anisotropic
        3 : sNPT (Parrinello-Rehman-Nose-Hoover) semiisotropic
        4 : iNPT (Parrinello-Rehman-Nose-Hoover) isotropic */
-    int  ensemble;
-    int  nsteps;
-    int  periodic_boundaries;
-    int  restrict_bonds;
-    int  tabulate;
+    int ensemble;
+    int nsteps;
+    int periodic_boundaries;
+    int restrict_bonds;
+    int tabulate;
     ivec periodic_images;
     real dt;
 
     int reneighbor;
     real vlist_cut;
     real nbr_cut;
-    real r_cut, r_low; // upper and lower taper
+    real r_cut, r_sp_cut, r_low; // upper, reduced upper, and lower taper
     real bo_cut;
     real thb_cut;
     real hb_cut;
     real Tap7, Tap6, Tap5, Tap4, Tap3, Tap2, Tap1, Tap0;
-    real q_err;
     int  max_far_nbrs;
 
     real T_init, T_final, T;
@@ -656,16 +730,26 @@ typedef struct
     int freq_diffusion_coef;
     int restrict_type;
 
-    int refactor;
-    real droptol;
+    unsigned int qeq_solver_type;
+    real qeq_solver_q_err;
+    real qeq_domain_sparsity;
+    unsigned int qeq_domain_sparsify_enabled;
+    unsigned int pre_comp_type;
+    unsigned int pre_comp_refactor;
+    real pre_comp_droptol;
+    unsigned int pre_comp_sweeps;
+    unsigned int pre_app_type;
+    unsigned int pre_app_jacobi_iters;
 
     int molec_anal;
     int freq_molec_anal;
     real bg_cut;
     int num_ignored;
-    int  ignore[MAX_ATOM_TYPES];
+    int ignore[MAX_ATOM_TYPES];
 
+#ifdef HAVE_CUDA
     void *d_control;
+#endif
 } control_params;
 
 
@@ -720,7 +804,14 @@ typedef struct
     real bonded;
     real nonb;
     real QEq;
-    int  matvecs;
+    real QEq_sort_mat_rows;
+    real pre_comp;
+    real pre_app;
+    int solver_iters;
+    real solver_spmv;
+    real solver_vector_ops;
+    real solver_orthog;
+    real solver_tri_solve;
 } reax_timing;
 
 
@@ -776,9 +867,11 @@ typedef struct
     rvec tot_press;
 
     reax_timing timing;
-    //CUDA
+
+#ifdef HAVE_CUDA
     reax_timing d_timing;
     void *d_simulation_data;
+#endif
 } simulation_data;
 
 
@@ -789,8 +882,9 @@ typedef struct
     real theta, cos_theta;
     rvec dcos_di, dcos_dj, dcos_dk;
 
-    //CUDA
+#ifdef HAVE_CUDA
     int i, j, k;
+#endif
 } three_body_interaction_data;
 
 
@@ -813,9 +907,11 @@ typedef struct
     rvec dvec;
     // real H; //, Tap, inv_dr3gamij_1, inv_dr3gamij_3;
 
-    //CUDA
+#ifdef HAVE_CUDA
     //int sym_index;
     //rvec h_f;
+#endif
+
     char spare[16];
 } far_neighbor_data;
 
@@ -868,6 +964,7 @@ typedef struct
     rvec dvec;
     bond_order_data bo_data;
 
+#ifdef HAVE_CUDA
     //single body -- lone pair
     real scratch;
 
@@ -887,42 +984,47 @@ typedef struct
 
     //compute_total_forces
     rvec t_f;
+#endif
 } bond_data;
 
 
+/* compressed row storage (crs) format
+ * See, e.g.,
+ *   http://netlib.org/linalg/html_templates/node91.html#SECTION00931100000000000000
+ *
+ *   m: number of nonzeros (NNZ) ALLOCATED
+ *   n: number of rows
+ *   start: row pointer (last element contains ACTUAL NNZ)
+ *   j: column index for corresponding matrix entry
+ *   val: matrix entry
+ * */
 typedef struct
 {
-    int j;
-    real val;
-} sparse_matrix_entry;
-
-
-typedef struct
-{
-    int n, m;
-    int *start;
-    //CUDA
-    int *end;
-    sparse_matrix_entry *entries;
-
-    int *j;
+    unsigned int n, m;
+    unsigned int *start;
+#ifdef HAVE_CUDA
+    unsigned int *end;
+#endif
+    unsigned int *j;
     real *val;
-
 } sparse_matrix;
 
 
 typedef struct
 {
-    int estimate_nbrs;
     int num_far;
     int Htop;
     int hbonds;
     int num_hbonds;
     int bonds;
     int num_bonds;
-    int thbody;
     int num_3body;
     int gcell_atoms;
+
+#ifdef HAVE_CUDA
+    int estimate_nbrs;
+    int thbody;
+#endif
 } reallocate_data;
 
 
@@ -937,7 +1039,7 @@ typedef struct
     rvec *dDeltap_self;
 
     /* QEq storage */
-    sparse_matrix H, L, U;
+    sparse_matrix *H, *H_sp, *L, *U;
     real *droptol;
     real *w;
     real *Hdia_inv;
@@ -990,6 +1092,7 @@ typedef struct
 } static_storage;
 
 
+/* interaction lists */
 typedef struct
 {
     int n;
@@ -1127,25 +1230,25 @@ typedef void (*evolve_function)(reax_system*, control_params*,
         list**, output_controls*);
 
 typedef real (*lookup_function)(real);
-extern lookup_table Exp, Sqrt, Cube_Root, Four_Third_Root, Cos, Sin, ACos;
 
+extern lookup_table Exp, Sqrt, Cube_Root, Four_Third_Root, Cos, Sin, ACos;
 extern LR_lookup_table *LR;
 
-
 typedef void (*get_far_neighbors_function)(rvec, rvec, simulation_box*,
-        control_params*, far_neighbor_data*,
-        int*);
+        control_params*, far_neighbor_data*, int*);
+
+extern reax_timing d_timing;
 
-/* CUDA structures */
+#ifdef HAVE_CUDA
 extern list *dev_lists;
 extern static_storage *dev_workspace;
 extern LR_lookup_table *d_LR;
-extern reax_timing d_timing;
 
-//Scratch Pad usage.
+/* scratch Pad usage */
 extern void *scratch;
 extern int BLOCKS, BLOCKS_POW_2, BLOCK_SIZE;
 extern int MATVEC_BLOCKS;
+#endif
 
 
 #endif
diff --git a/PuReMD-GPU/src/neighbors.c b/PuReMD-GPU/src/neighbors.c
index 5f425e672080d2d4a272f7aca1859c45d8dde17d..7a005f081d57e26cc86bde6501e72bd64d6bc2cc 100644
--- a/PuReMD-GPU/src/neighbors.c
+++ b/PuReMD-GPU/src/neighbors.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -26,51 +27,11 @@
 #include "list.h"
 #include "reset_utils.h"
 #include "system_props.h"
+#include "tool_box.h"
 #include "vector.h"
 
 
-int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, 
-        real cutoff, far_neighbor_data *data )
-{
-    real norm_sqr, d, tmp;
-    int i;
-
-    norm_sqr = 0;
-
-    for( i = 0; i < 3; i++ ) { 
-        d = x2[i] - x1[i];
-        tmp = SQR(d);
-
-        if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) {    
-            if( x2[i] > x1[i] ) { 
-                d -= box->box_norms[i];
-                data->rel_box[i] = -1; 
-            }   
-            else {
-                d += box->box_norms[i];
-                data->rel_box[i] = +1; 
-            }   
-
-            data->dvec[i] = d;
-            norm_sqr += SQR(d);
-        }   
-        else {
-            data->dvec[i] = d;
-            norm_sqr += tmp;
-            data->rel_box[i] = 0;
-        }   
-    }
-
-    if( norm_sqr <= SQR(cutoff) ){
-        data->d = sqrt(norm_sqr);
-        return 1;
-    }
-
-    return 0;
-}
-
-
-void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
+void Generate_Neighbor_Lists( reax_system *system, control_params *control,
         simulation_data *data, static_storage *workspace,
         list **lists, output_controls *out_control )
 {
@@ -86,54 +47,61 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     far_neighbor_data *nbr_data;
     real t_start, t_elapsed;
 
+    t_start = Get_Time( );
     // fprintf( stderr, "\n\tentered nbrs - " );
     g = &( system->g );
     far_nbrs = (*lists) + FAR_NBRS;
     Bin_Atoms( system, workspace );
-
-    t_start = Get_Time( );
-
     // fprintf( stderr, "atoms sorted - " );
     num_far = 0;
 
     /* first pick up a cell in the grid */
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
-                nbrs = &g->nbrs[ index_grid_nbrs (i,j,k,0,g) ];
-                nbrs_cp = &g->nbrs_cp[ index_grid_nbrs (i,j,k,0,g) ];
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
+                nbrs = &g->nbrs[ index_grid_nbrs(i,j,k,0,g) ];
+                nbrs_cp = &g->nbrs_cp[ index_grid_nbrs(i,j,k,0,g) ];
                 //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
 
                 /* pick up an atom from the current cell */
-                for(l = 0; l < g->top[ index_grid_3d (i,j,k,g) ]; ++l ){
-                    atom1 = g->atoms[ index_grid_atoms (i,j,k,l,g) ];
+                for(l = 0; l < g->top[ index_grid_3d(i,j,k,g) ]; ++l )
+                {
+                    atom1 = g->atoms[ index_grid_atoms(i,j,k,l,g) ];
                     Set_Start_Index( atom1, num_far, far_nbrs );
                     //fprintf( stderr, "\tatom %d\n", atom1 );
 
                     itr = 0;
-                    while( nbrs[itr][0] >= 0 ){
+                    while ( nbrs[itr][0] >= 0 )
+                    {
                         x = nbrs[itr][0];
                         y = nbrs[itr][1];
                         z = nbrs[itr][2];
                         //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
 
-                        if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-                                SQR(control->vlist_cut) ) {     
-                            nbr_atoms = &g->atoms[ index_grid_atoms (x,y,z,0,g) ];
-                            max = g->top[ index_grid_3d (x,y,z,g) ];
+                        if ( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <=
+                                SQR(control->vlist_cut) )
+                        {
+                            nbr_atoms = &g->atoms[ index_grid_atoms(x,y,z,0,g) ];
+                            max = g->top[ index_grid_3d(x,y,z,g) ];
                             //fprintf( stderr, "\t\tmax: %d\n", max );
 
                             /* pick up another atom from the neighbor cell */
-                            for( m = 0; m < max; ++m ) {
+                            for ( m = 0; m < max; ++m )
+                            {
                                 atom2 = nbr_atoms[m];
-                                if( atom1 > atom2 ) {
+                                if ( atom1 > atom2 )
+                                {
                                     nbr_data = &(far_nbrs->select.far_nbr_list[num_far]);
-                                    if(Are_Far_Neighbors(system->atoms[atom1].x,
-                                                system->atoms[atom2].x, 
-                                                &(system->box), control->vlist_cut, 
-                                                nbr_data)) {
+                                    //fprintf (stderr, " %f %f %f \n", nbr_data->dvec[0], nbr_data->dvec[1], nbr_data->dvec[2]);
+                                    if (Are_Far_Neighbors(system->atoms[atom1].x,
+                                                          system->atoms[atom2].x,
+                                                          &(system->box), control->vlist_cut,
+                                                          nbr_data))
+                                    {
                                         nbr_data->nbr = atom2;
-
                                         ++num_far;
                                     }
                                 }
@@ -144,20 +112,22 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
                     }
 
                     Set_End_Index( atom1, num_far, far_nbrs );
-                    //fprintf(stderr, "i:%d, start: %d, end: %d - itr: %d\n", 
+                    //fprintf(stderr, "i:%d, start: %d, end: %d - itr: %d\n",
                     //  atom1,Start_Index(atom1,far_nbrs),End_Index(atom1,far_nbrs),
-                    //  itr); 
+                    //  itr);
                 }
             }
+        }
+    }
 
-    fprintf (stderr, " TOTAL HOST NEIGHBORS : %d \n", num_far);
-
-    if( num_far > far_nbrs->num_intrs * DANGER_ZONE ) {
+    if ( num_far > far_nbrs->num_intrs * DANGER_ZONE )
+    {
         workspace->realloc.num_far = num_far;
-        if( num_far > far_nbrs->num_intrs ){
+        if ( num_far > far_nbrs->num_intrs )
+        {
             fprintf( stderr, "step%d-ran out of space on far_nbrs: top=%d, max=%d",
-                    data->step, num_far, far_nbrs->num_intrs );
-            exit( INSUFFICIENT_SPACE );
+                     data->step, num_far, far_nbrs->num_intrs );
+            exit( INSUFFICIENT_MEMORY );
         }
     }
 
@@ -165,25 +135,24 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     data->timing.nbrs += t_elapsed;
 
 #if defined(DEBUG)
-    for( i = 0; i < system->N; ++i ) {
-        qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
-                Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
-                compare_far_nbrs ); 
+    for ( i = 0; i < system->N; ++i )
+    {
+        qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]),
+               Num_Entries(i, far_nbrs), sizeof(far_neighbor_data),
+               compare_far_nbrs );
     }
 #endif
-
-#if defined(DEBUG_FOCUS)  
-    //fprintf( stderr, "nbrs - ");
-    //fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nbrs - ");
+    fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
 #endif
-
 #if defined(TEST_ENERGY)
     //Print_Far_Neighbors( system, control, workspace, lists );
 #endif
 }
 
 
-int Estimate_NumNeighbors( reax_system *system, control_params *control, 
+int Estimate_NumNeighbors( reax_system *system, control_params *control,
         static_storage *workspace, list **lists )
 {
     int  i, j, k, l, m, itr;
@@ -195,53 +164,63 @@ int Estimate_NumNeighbors( reax_system *system, control_params *control,
     rvec *nbrs_cp;
     grid *g;
     far_neighbor_data nbr_data;
-
+#ifdef HAVE_CUDA
     int start = 0, finish = 0;
+#endif
 
     // fprintf( stderr, "\n\tentered nbrs - " );
     g = &( system->g );
     Bin_Atoms( system, workspace );
     // fprintf( stderr, "atoms sorted - " );
     num_far = 0;
+#ifdef HAVE_CUDA /* NOTE(review): the grid struct no longer declares max_cuda_nbrs and start is not reset per atom -- confirm this path still builds and yields a per-atom maximum */
     g->max_cuda_nbrs = 0;
+#endif
 
     /* first pick up a cell in the grid */
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
-                nbrs = &g->nbrs[index_grid_nbrs (i,j,k,0,g) ];
-                nbrs_cp = &g->nbrs_cp[index_grid_nbrs (i,j,k,0,g) ];
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
+                nbrs = &g->nbrs[ index_grid_nbrs(i,j,k,0,g) ];
+                nbrs_cp = &g->nbrs_cp[ index_grid_nbrs(i,j,k,0,g) ];
                 //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
 
                 /* pick up an atom from the current cell */
-                for(l = 0; l < g->top[index_grid_3d (i,j,k,g) ]; ++l ){
-                    atom1 = g->atoms[index_grid_atoms (i,j,k,l,g) ];
-                    start = num_far;
+                for(l = 0; l < g->top[ index_grid_3d(i,j,k,g) ]; ++l )
+                {
+                    atom1 = g->atoms[ index_grid_atoms(i,j,k,l,g) ];
 
                     itr = 0;
-                    while( nbrs[itr][0] >= 0 ){
+                    while ( nbrs[itr][0] >= 0 )
+                    {
                         x = nbrs[itr][0];
                         y = nbrs[itr][1];
                         z = nbrs[itr][2];
                         //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
 
-                        if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-                                SQR(control->vlist_cut) ) {     
-                            nbr_atoms = &g->atoms[index_grid_atoms (x,y,z,0,g) ];
-                            max = g->top[index_grid_3d (x,y,z,g) ];
+                        if ( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <=
+                                SQR(control->vlist_cut) )
+                        {
+                            nbr_atoms = &g->atoms[ index_grid_atoms(x,y,z,0,g) ];
+                            max = g->top[ index_grid_3d(x,y,z,g) ];
                             //fprintf( stderr, "\t\tmax: %d\n", max );
 
                             /* pick up another atom from the neighbor cell -
-                               we have to compare atom1 with its own periodic images as well, 
-                               that's why there is also equality in the if stmt below */
-                            for( m = 0; m < max; ++m ) {
+                            we have to compare atom1 with its own periodic images as well,
+                             that's why there is also equality in the if stmt below */
+                            for ( m = 0; m < max; ++m )
+                            {
                                 atom2 = nbr_atoms[m];
                                 //if( nbrs[itr+1][0] >= 0 || atom1 > atom2 ) {
-                                if( atom1 > atom2 ) {
-                                    if(Are_Far_Neighbors(system->atoms[atom1].x,
-                                                system->atoms[atom2].x, 
-                                                &(system->box), control->vlist_cut, 
-                                                &nbr_data))
+                                if ( atom1 > atom2 )
+                                {
+                                    if (Are_Far_Neighbors(system->atoms[atom1].x,
+                                                          system->atoms[atom2].x,
+                                                          &(system->box), control->vlist_cut,
+                                                          &nbr_data))
                                         ++num_far;
                                 }
                             }
@@ -250,38 +229,46 @@ int Estimate_NumNeighbors( reax_system *system, control_params *control,
                         ++itr;
                     }
 
-                    // finish note
+#ifdef HAVE_CUDA
                     finish = num_far;
-                    if (g->max_cuda_nbrs <= (finish - start)){
+                    if (g->max_cuda_nbrs <= (finish - start))
+                    {
                         g->max_cuda_nbrs    = finish - start;
                     }
+#endif
                 }
             }
+        }
+    }
 
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "estimate nbrs done, num_far: %d\n", num_far );
 #endif
-
     return num_far * SAFE_ZONE;
 }
 
 
-//Code not used anymore
 #if defined DONE
-
-void Choose_Neighbor_Finder( reax_system *system, control_params *control, 
+void Choose_Neighbor_Finder( reax_system *system, control_params *control,
         get_far_neighbors_function *Get_Far_Neighbors )
 {
-    if( control->periodic_boundaries )
+    if ( control->periodic_boundaries )
     {
-        if( system->box.box_norms[0] > 2.0 * control->vlist_cut &&
+        if ( system->box.box_norms[0] > 2.0 * control->vlist_cut &&
                 system->box.box_norms[1] > 2.0 * control->vlist_cut &&
                 system->box.box_norms[2] > 2.0 * control->vlist_cut )
+        {
             (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Big_Box;
-        else  (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Small_Box;
+        }
+        else
+        {
+            (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Small_Box;
+        }
     }
     else
+    {
         (*Get_Far_Neighbors) = Get_NonPeriodic_Far_Neighbors;
+    }
 }
 
 
@@ -327,18 +314,28 @@ inline int can_Bond( static_storage *workspace, int atom1, int atom2 )
 
     // fprintf( stderr, "can bond %6d %6d?\n", atom1, atom2 );
 
-    if( !workspace->restricted[ atom1 ] && !workspace->restricted[ atom2 ] )
-        return 1;
+    if ( !workspace->restricted[ atom1 ] && !workspace->restricted[ atom2 ] )
+    {
+        return TRUE; /* neither atom carries bond restrictions */
+    }
 
-    for( i = 0; i < workspace->restricted[ atom1 ]; ++i )
-        if( workspace->restricted_list[ atom1 ][i] == atom2 )
-            return 1;
+    for ( i = 0; i < workspace->restricted[ atom1 ]; ++i )
+    {
+        if ( workspace->restricted_list[ atom1 ][i] == atom2 )
+        {
+            return TRUE; /* atom2 appears in atom1's restricted list */
+        }
+    }
 
-    for( i = 0; i < workspace->restricted[ atom2 ]; ++i )
-        if( workspace->restricted_list[ atom2 ][i] == atom1 )
-            return 1;
+    for ( i = 0; i < workspace->restricted[ atom2 ]; ++i )
+    {
+        if ( workspace->restricted_list[ atom2 ][i] == atom1 )
+        {
+            return TRUE; /* atom1 appears in atom2's restricted list */
+        }
+    }
 
-    return 0;
+    return FALSE; /* restrictions exist and the pair is in neither list */
 }
 
 
@@ -347,17 +344,20 @@ inline int is_Near_Neighbor( list *near_nbrs, int atom1, int atom2 )
 {
     int i;
 
-    for( i=Start_Index(atom1,near_nbrs); i<End_Index(atom1,near_nbrs); ++i )
-        if( near_nbrs->select.near_nbr_list[i].nbr == atom2 )
+    for ( i = Start_Index(atom1, near_nbrs); i < End_Index(atom1, near_nbrs); ++i )
+    {
+        if ( near_nbrs->select.near_nbr_list[i].nbr == atom2 )
         {
             // fprintf( stderr, "near neighbors %6d %6d\n", atom1, atom2 );
-            return 1;
+            return FALSE;
         }
+    }
 
-    return 0;
+    return TRUE;
 }
 
-void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
+
+void Generate_Neighbor_Lists( reax_system *system, control_params *control,
         simulation_data *data, static_storage *workspace,
         list **lists, output_controls *out_control )
 {
@@ -368,21 +368,20 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     int   num_far;
     int   c, count;
     int   grid_top;
-    grid *g = &( system->g );  
+    grid *g = &( system->g );
     list *far_nbrs = (*lists) + FAR_NBRS;
     //int   hb_type1, hb_type2;
     //list *hbonds = (*lists) + HBOND;
     //int   top_hbond1, top_hbond2;
     get_far_neighbors_function Get_Far_Neighbors;
     far_neighbor_data new_nbrs[125];
-#ifndef REORDER_ATOMS
-    int   l, m;
-#endif
 
     // fprintf( stderr, "\n\tentered nbrs - " );
-    if( control->ensemble == iNPT || control->ensemble == sNPT || 
+    if ( control->ensemble == iNPT || control->ensemble == sNPT ||
             control->ensemble == NPT )
+    {
         Update_Grid( system );
+    }
     // fprintf( stderr, "grid updated - " );
 
     Bin_Atoms( system, out_control );
@@ -394,9 +393,9 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
 #endif
 
     Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors );
-    // fprintf( stderr, "function chosen - " );  
+    // fprintf( stderr, "function chosen - " );
 
-    Reset_Neighbor_Lists( system, workspace, lists );  
+    Reset_Neighbor_Lists( system, workspace, lists );
     // fprintf( stderr, "lists cleared - " );
 
     num_far = 0;
@@ -404,9 +403,12 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     c = 0;
 
     /* first pick up a cell in the grid */
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
                 nbrs = g->nbrs[i][j][k];
                 nbrs_cp = g->nbrs_cp[i][j][k];
 
@@ -414,119 +416,137 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
                 //#ifdef REORDER_ATOMS
                 //  for(atom1 = g->start[i][j][k]; atom1 < g->end[i][j][k]; atom1++)
                 //#else
-                for(l = 0; l < g->top[i][j][k]; ++l ){
+                for (l = 0; l < g->top[i][j][k]; ++l )
+                {
                     atom1 = g->atoms[i][j][k][l];
                     Set_End_Index( atom1, num_far, far_nbrs );
                     // fprintf( stderr, "atom %d:\n", atom1 );
 
                     itr = 0;
-                    while( nbrs[itr][0] > 0 ){
+                    while ( nbrs[itr][0] > 0 )
+                    {
                         x = nbrs[itr][0];
                         y = nbrs[itr][1];
                         z = nbrs[itr][2];
 
-                        // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-                        //     SQR(control->r_cut))     
+                        // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <=
+                        //     SQR(control->r_cut))
                         nbr_atoms = g->atoms[x][y][z];
                         max_atoms = g->top[x][y][z];
 
                         /* pick up another atom from the neighbor cell -
-                           we have to compare atom1 with its own periodic images as well, 
+                           we have to compare atom1 with its own periodic images as well,
                            that's why there is also equality in the if stmt below */
                         //#ifdef REORDER_ATOMS
                         //for(atom2=g->start[x][y][z]; atom2<g->end[x][y][z]; atom2++)
                         //#else
-                        for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] )
-                            if( atom1 >= atom2 ) {
+                        for ( m = 0, atom2 = nbr_atoms[m]; m < max; ++m, atom2 = nbr_atoms[m] )
+                        {
+                            if ( atom1 >= atom2 )
+                            {
                                 //fprintf( stderr, "\tatom2 %d", atom2 );
                                 //top_near1 = End_Index( atom1, near_nbrs );
                                 //Set_Start_Index( atom1, num_far, far_nbrs );
                                 //hb_type1=system->reaxprm.sbp[system->atoms[atom1].type].p_hbond;
                                 Get_Far_Neighbors( system->atoms[atom1].x,
-                                        system->atoms[atom2].x, 
-                                        &(system->box), control, new_nbrs, &count );
+                                                   system->atoms[atom2].x,
+                                                   &(system->box), control, new_nbrs, &count );
                                 fprintf( stderr, "\t%d count:%d\n", atom2, count );
 
-                                for( c = 0; c < count; ++c )
-                                    if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){
+                                for ( c = 0; c < count; ++c )
+                                {
+                                    if (atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d >= 0.1))
+                                    {
                                         Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]),
-                                                atom2, new_nbrs[c].d, 1.0, 
-                                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                                                         atom2, new_nbrs[c].d, 1.0,
+                                                         new_nbrs[c].dvec, new_nbrs[c].rel_box );
                                         ++num_far;
 
                                         /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n",
-                                          atom1, atom2, new_nbrs[c].d, 
-                                          new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], 
+                                          atom1, atom2, new_nbrs[c].d,
+                                          new_nbrs[c].dvec[0], new_nbrs[c].dvec[1],
                                           new_nbrs[c].dvec[2] ); */
 
 
-                                        /* hydrogen bond lists */ 
-                                        /*if( control->hb_cut > 0.1 && 
+                                        /* hydrogen bond lists */
+                                        /*if( control->hb_cut > 0.1 &&
                                           new_nbrs[c].d <= control->hb_cut ) {
-                                        // fprintf( stderr, "%d %d\n", atom1, atom2 );
-                                        hb_type2=system->reaxprm.sbp[system->atoms[atom2].type].p_hbond;
-                                        if( hb_type1 == 1 && hb_type2 == 2 ) {
-                                        top_hbond1=End_Index(workspace->hbond_index[atom1],hbonds);
-                                        Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond1]),
-                                        atom2, new_nbrs[c].d, 1.0, new_nbrs[c].dvec,
-                                        new_nbrs[c].rel_box );
-                                        Set_End_Index( workspace->hbond_index[atom1], 
-                                        top_hbond1 + 1, hbonds );
-                                        }
-                                        else if( hb_type1 == 2 && hb_type2 == 1 ) {
-                                        top_hbond2 = End_Index( workspace->hbond_index[atom2], hbonds );
-                                        Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond2]),
-                                        atom1, new_nbrs[c].d, -1.0, new_nbrs[c].dvec, 
-                                        new_nbrs[c].rel_box );
-                                        Set_End_Index( workspace->hbond_index[atom2], 
-                                        top_hbond2 + 1, hbonds );
-                                        }*/
+                                          // fprintf( stderr, "%d %d\n", atom1, atom2 );
+                                          hb_type2=system->reaxprm.sbp[system->atoms[atom2].type].p_hbond;
+                                          if( hb_type1 == 1 && hb_type2 == 2 ) {
+                                          top_hbond1=End_Index(workspace->hbond_index[atom1],hbonds);
+                                          Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond1]),
+                                          atom2, new_nbrs[c].d, 1.0, new_nbrs[c].dvec,
+                                          new_nbrs[c].rel_box );
+                                          Set_End_Index( workspace->hbond_index[atom1],
+                                          top_hbond1 + 1, hbonds );
+                                          }
+                                          else if( hb_type1 == 2 && hb_type2 == 1 ) {
+                                          top_hbond2 = End_Index( workspace->hbond_index[atom2], hbonds );
+                                          Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond2]),
+                                          atom1, new_nbrs[c].d, -1.0, new_nbrs[c].dvec,
+                                          new_nbrs[c].rel_box );
+                                          Set_End_Index( workspace->hbond_index[atom2],
+                                          top_hbond2 + 1, hbonds );
+                                          }*/
                                     }
                                 }
+                            }
                         }
+                    }
 
                     Set_End_Index( atom1, top_far1, far_nbrs );
                 }
             }
+        }
+    }
 
     fprintf( stderr, "nbrs done-" );
 
+
     /* apply restrictions on near neighbors only */
-    if( (data->step - data->prev_steps) < control->restrict_bonds ) {
-        for( atom1 = 0; atom1 < system->N; ++atom1 )
-            if( workspace->restricted[ atom1 ] ) {
+    if ( (data->step - data->prev_steps) < control->restrict_bonds )
+    {
+        for ( atom1 = 0; atom1 < system->N; ++atom1 )
+        {
+            if ( workspace->restricted[ atom1 ] )
+            {
                 // fprintf( stderr, "atom1: %d\n", atom1 );
 
                 top_near1 = End_Index( atom1, near_nbrs );
 
-                for( j = 0; j < workspace->restricted[ atom1 ]; ++j )
-                    if(!is_Near_Neighbor(near_nbrs, atom1, 
-                                atom2 = workspace->restricted_list[atom1][j])) {
+                for ( j = 0; j < workspace->restricted[ atom1 ]; ++j )
+                {
+                    if (is_Near_Neighbor(near_nbrs, atom1,
+                          atom2 = workspace->restricted_list[atom1][j]) == FALSE)
+                    {
                         fprintf( stderr, "%3d-%3d: added bond by applying restrictions!\n",
-                                atom1, atom2 );
+                                 atom1, atom2 );
 
-                        top_near2 = End_Index( atom2, near_nbrs );          
+                        top_near2 = End_Index( atom2, near_nbrs );
 
-                        /* we just would like to get the nearest image, so a call to 
+                        /* we just would like to get the nearest image, so a call to
                            Get_Periodic_Far_Neighbors_Big_Box is good enough. */
-                        Get_Periodic_Far_Neighbors_Big_Box( system->atoms[ atom1 ].x, 
-                                system->atoms[ atom2 ].x, 
-                                &(system->box), control, 
-                                new_nbrs, &count );
+                        Get_Periodic_Far_Neighbors_Big_Box( system->atoms[ atom1 ].x,
+                                                            system->atoms[ atom2 ].x,
+                                                            &(system->box), control,
+                                                            new_nbrs, &count );
 
                         Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near1 ]),
-                                atom2, new_nbrs[c].d, 1.0, 
-                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                                           atom2, new_nbrs[c].d, 1.0,
+                                           new_nbrs[c].dvec, new_nbrs[c].rel_box );
                         ++top_near1;
 
                         Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near2 ]),
-                                atom1, new_nbrs[c].d, -1.0, 
-                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
-                        Set_End_Index( atom2, top_near2+1, near_nbrs );
+                                           atom1, new_nbrs[c].d, -1.0,
+                                           new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                        Set_End_Index( atom2, top_near2 + 1, near_nbrs );
                     }
+                }
 
                 Set_End_Index( atom1, top_near1, near_nbrs );
             }
+        }
     }
     // fprintf( stderr, "restrictions applied-" );
 
@@ -534,56 +554,61 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     /* verify nbrlists, count num_intrs, sort nearnbrs */
     near_nbrs->num_intrs = 0;
     far_nbrs->num_intrs = 0;
-    for( i = 0; i < system->N-1; ++i ) {
-        if( End_Index(i, near_nbrs) > Start_Index(i+1, near_nbrs) ) {
-            fprintf( stderr, 
-                    "step%3d: nearnbr list of atom%d is overwritten by atom%d\n",
-                    data->step, i+1, i );
-            exit( 1 );
+    for ( i = 0; i < system->N - 1; ++i )
+    {
+        if ( End_Index(i, near_nbrs) > Start_Index(i + 1, near_nbrs) )
+        {
+            fprintf( stderr,
+                     "step%3d: nearnbr list of atom%d is overwritten by atom%d\n",
+                     data->step, i + 1, i );
+            exit( RUNTIME_ERROR );
         }
 
         near_nbrs->num_intrs += Num_Entries(i, near_nbrs);
 
-        if( End_Index(i, far_nbrs) > Start_Index(i+1, far_nbrs) ) {
-            fprintf( stderr, 
-                    "step%3d: farnbr list of atom%d is overwritten by atom%d\n", 
-                    data->step, i+1, i );
-            exit( 1 );
+        if ( End_Index(i, far_nbrs) > Start_Index(i + 1, far_nbrs) )
+        {
+            fprintf( stderr,
+                     "step%3d: farnbr list of atom%d is overwritten by atom%d\n",
+                     data->step, i + 1, i );
+            exit( RUNTIME_ERROR );
         }
 
         far_nbrs->num_intrs += Num_Entries(i, far_nbrs);
     }
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         qsort( &(near_nbrs->select.near_nbr_list[ Start_Index(i, near_nbrs) ]),
-                Num_Entries(i, near_nbrs), sizeof(near_neighbor_data), 
-                compare_near_nbrs );
+               Num_Entries(i, near_nbrs), sizeof(near_neighbor_data),
+               compare_near_nbrs );
     }
     // fprintf( stderr, "near nbrs sorted\n" );
 
+
 #ifdef TEST_ENERGY
     /* for( i = 0; i < system->N; ++i ) {
-       qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
-       Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
-       compare_far_nbrs ); 
+       qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]),
+       Num_Entries(i, far_nbrs), sizeof(far_neighbor_data),
+       compare_far_nbrs );
        } */
 
-    fprintf( stderr, "Near neighbors/atom: %d (compare to 150)\n", 
-            num_near / system->N );
-    fprintf( stderr, "Far neighbors per atom: %d (compare to %d)\n", 
-            num_far / system->N, control->max_far_nbrs );
+    fprintf( stderr, "Near neighbors/atom: %d (compare to 150)\n",
+             num_near / system->N );
+    fprintf( stderr, "Far neighbors per atom: %d (compare to %d)\n",
+             num_far / system->N, control->max_far_nbrs );
 #endif
 
     //fprintf( stderr, "step%d: num of nearnbrs = %6d   num of farnbrs: %6d\n",
     //       data->step, num_near, num_far );
 
-    //fprintf( stderr, "\talloc nearnbrs = %6d   alloc farnbrs: %6d\n", 
-    //   system->N * near_nbrs->intrs_per_unit, 
+    //fprintf( stderr, "\talloc nearnbrs = %6d   alloc farnbrs: %6d\n",
+    //   system->N * near_nbrs->intrs_per_unit,
     //   system->N * far_nbrs->intrs_per_unit );
 }
 
 
-void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
+void Generate_Neighbor_Lists( reax_system *system, control_params *control,
         simulation_data *data, static_storage *workspace,
         list **lists, output_controls *out_control )
 {
@@ -603,73 +628,84 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     far_nbrs = (*lists) + FAR_NBRS;
 
     // fprintf( stderr, "\n\tentered nbrs - " );
-    if( control->ensemble == iNPT || 
-            control->ensemble == sNPT || 
+    if ( control->ensemble == iNPT ||
+            control->ensemble == sNPT ||
             control->ensemble == NPT )
+    {
         Update_Grid( system );
+    }
     // fprintf( stderr, "grid updated - " );
 
     Bin_Atoms( system, out_control );
     // fprintf( stderr, "atoms sorted - " );
     Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors );
-    // fprintf( stderr, "function chosen - " );  
-    Reset_Neighbor_Lists( system, workspace, lists );  
+    // fprintf( stderr, "function chosen - " );
+    Reset_Neighbor_Lists( system, workspace, lists );
     // fprintf( stderr, "lists cleared - " );
 
     num_far = 0;
     c = 0;
 
     /* first pick up a cell in the grid */
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
                 nbrs = g->nbrs[i][j][k];
                 nbrs_cp = g->nbrs_cp[i][j][k];
                 fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
 
                 /* pick up an atom from the current cell */
-                for(l = 0; l < g->top[i][j][k]; ++l ){
+                for (l = 0; l < g->top[i][j][k]; ++l )
+                {
                     atom1 = g->atoms[i][j][k][l];
                     Set_Start_Index( atom1, num_far, far_nbrs );
                     fprintf( stderr, "\tatom %d\n", atom1 );
 
                     itr = 0;
-                    while( nbrs[itr][0] > 0 ){
+                    while ( nbrs[itr][0] > 0 )
+                    {
                         x = nbrs[itr][0];
                         y = nbrs[itr][1];
                         z = nbrs[itr][2];
                         fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
 
-                        // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-                        //     SQR(control->r_cut))     
+                        // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <=
+                        //     SQR(control->r_cut))
                         nbr_atoms = g->atoms[x][y][z];
                         max = g->top[x][y][z];
                         fprintf( stderr, "\t\tmax: %d\n", max );
 
 
                         /* pick up another atom from the neighbor cell -
-                           we have to compare atom1 with its own periodic images as well, 
+                           we have to compare atom1 with its own periodic images as well,
                            that's why there is also equality in the if stmt below */
-                        for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] )
-                            if( atom1 >= atom2 ) {
+                        for ( m = 0, atom2 = nbr_atoms[m]; m < max; ++m, atom2 = nbr_atoms[m] )
+                        {
+                            if ( atom1 >= atom2 )
+                            {
                                 Get_Far_Neighbors( system->atoms[atom1].x,
-                                        system->atoms[atom2].x, 
-                                        &(system->box), control, new_nbrs, &count );
+                                                   system->atoms[atom2].x,
+                                                   &(system->box), control, new_nbrs, &count );
                                 fprintf( stderr, "\t\t\t%d count:%d\n", atom2, count );
 
-                                for( c = 0; c < count; ++c )
-                                    if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){
+                                for ( c = 0; c < count; ++c )
+                                    if (atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d >= 0.1))
+                                    {
                                         Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]),
-                                                atom2, new_nbrs[c].d, 1.0, 
-                                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                                                         atom2, new_nbrs[c].d, 1.0,
+                                                         new_nbrs[c].dvec, new_nbrs[c].rel_box );
                                         ++num_far;
 
                                         /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n",
-                                          atom1, atom2, new_nbrs[c].d, 
-                                          new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], 
+                                          atom1, atom2, new_nbrs[c].d,
+                                          new_nbrs[c].dvec[0], new_nbrs[c].dvec[1],
                                           new_nbrs[c].dvec[2] ); */
                                     }
                             }
+                        }
 
                         ++itr;
                     }
@@ -677,22 +713,26 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
                     Set_End_Index( atom1, num_far, far_nbrs );
                 }
             }
+        }
+    }
 
-    far_nbrs->num_intrs = num_far;  
+    far_nbrs->num_intrs = num_far;
     fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
 
 #if defined(DEBUG)
-    for( i = 0; i < system->N; ++i ) {
-        qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
-                Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
-                compare_far_nbrs ); 
+    for ( i = 0; i < system->N; ++i )
+    {
+        qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]),
+               Num_Entries(i, far_nbrs), sizeof(far_neighbor_data),
+               compare_far_nbrs );
     }
 
     fprintf( stderr, "step%d: num of farnbrs=%6d\n", data->step, num_far );
-    fprintf( stderr, "\tallocated farnbrs: %6d\n", 
-            system->N * far_nbrs->intrs_per_unit );
+    fprintf( stderr, "\tallocated farnbrs: %6d\n",
+             system->N * far_nbrs->intrs_per_unit );
 #endif
 }
 
 
+
 #endif
diff --git a/PuReMD-GPU/src/neighbors.h b/PuReMD-GPU/src/neighbors.h
index 64c14ad29d5194006aacb057a7d80ef54aeee8e4..8eb5cfc2696f4d354edcf3751dedfd315c6762a3 100644
--- a/PuReMD-GPU/src/neighbors.h
+++ b/PuReMD-GPU/src/neighbors.h
@@ -30,10 +30,8 @@ void Generate_Neighbor_Lists( reax_system*, control_params*, simulation_data*,
 int Estimate_NumNeighbors( reax_system*, control_params*,
    static_storage*, list** );
 
-int Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* );
 
-
-static inline HOST_DEVICE int index_grid_debug (int x, int y, int z, int blocksize)
+static inline HOST_DEVICE int index_grid_debug( int x, int y, int z, int blocksize )
 {
     return x * 8 * 8 * blocksize +  
         y * 8 * blocksize +  
diff --git a/PuReMD-GPU/src/pdb_tools.c b/PuReMD-GPU/src/pdb_tools.c
deleted file mode 100644
index a7102da2cf8d3023956539960f93f4e61c116a81..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/pdb_tools.c
+++ /dev/null
@@ -1,628 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
-
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "pdb_tools.h"
-#include "box.h"
-#include "list.h"
-#include "param.h"
-#include "restart.h"
-#include "ctype.h"
-
-
-int is_Valid_Serial( static_storage *workspace, int serial )
-{
-    if ( workspace->map_serials[ serial ] < 0 )
-    {
-        fprintf( stderr, "CONECT line includes invalid pdb serial number %d.\n",
-                 serial );
-        fprintf( stderr, "Please correct the input file.Terminating...\n" );
-        exit( INVALID_INPUT );
-    }
-
-    return 1;
-}
-
-
-int Check_Input_Range( int val, int lo, int hi, char *message )
-{
-    if ( val < lo || val > hi )
-    {
-        fprintf( stderr, "%s\nInput %d - Out of range %d-%d. Terminating...\n",
-                 message, val, lo, hi );
-        exit( INVALID_INPUT );
-    }
-
-    return 1;
-}
-
-
-void Trim_Spaces( char *element )
-{
-    int i, j;
-
-    for ( i = 0; element[i] == ' '; ++i ); // skip initial space chars
-
-    for ( j = i; j < strlen(element) && element[j] != ' '; ++j )
-        element[j - i] = toupper( element[j] ); // make uppercase, move to beginning
-    element[j - i] = 0; // finalize the string
-}
-
-
-char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
-               simulation_data *data, static_storage *workspace )
-{
-
-    FILE *pdb;
-    char **tmp;
-    char *s, *s1;
-    char descriptor[9], serial[9];
-    char atom_name[9], res_name[9], res_seq[9];
-    char s_x[9], s_y[9], s_z[9];
-    char occupancy[9], temp_factor[9];
-    char seg_id[9], element[9], charge[9];
-    char alt_loc, chain_id, icode;
-    char s_a[10], s_b[10], s_c[10], s_alpha[9], s_beta[9], s_gamma[9];
-    char s_group[12], s_zValue[9];
-    char *endptr = NULL;
-    int  i, c, c1, pdb_serial, ratom = 0;
-    /* open pdb file */
-    if ( (pdb = fopen(pdb_file, "r")) == NULL )
-    {
-        fprintf( stderr, "Error opening the pdb file!\n" );
-        exit( FILE_NOT_FOUND_ERR );
-    }
-
-
-    /* allocate memory for tokenizing pdb lines */
-    s =   (char*)  malloc( sizeof(char)  * MAX_LINE );
-    s1 =  (char*)  malloc( sizeof(char)  * MAX_LINE );
-    tmp = (char**) malloc( sizeof(char*) * MAX_TOKENS );
-    for ( i = 0; i < MAX_TOKENS; i++ )
-        tmp[i] = (char*) malloc( sizeof(char) * MAX_TOKEN_LEN );
-
-
-    /* count number of atoms in the pdb file */
-    system->N = 0;
-    while (!feof(pdb))
-    {
-        s[0] = 0;
-        fgets( s, MAX_LINE, pdb );
-
-        tmp[0][0] = 0;
-        c = Tokenize( s, &tmp );
-
-        if ( strncmp( tmp[0], "ATOM", 4 ) == 0 ||
-                strncmp( tmp[0], "HETATM", 6 ) == 0 )
-            (system->N)++;
-    }
-    fclose(pdb);
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "system->N: %d\n", system->N );
-#endif
-
-    /* memory allocations for atoms, atom maps, bond restrictions */
-    system->atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
-
-    workspace->map_serials = (int*) calloc( MAX_ATOM_ID, sizeof(int) );
-    for ( i = 0; i < MAX_ATOM_ID; ++i )
-        workspace->map_serials[i] = -1;
-
-    workspace->orig_id = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted  = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted_list = (int*) calloc( system->N * MAX_RESTRICT, sizeof(int) );
-
-    //for( i = 0; i < system->N; ++i )
-    // workspace->restricted_list[i] = (int*) calloc( MAX_RESTRICT, sizeof(int) );
-
-
-    /* start reading and processing pdb file */
-    pdb = fopen(pdb_file, "r");
-    c = 0;
-    c1 = 0;
-
-    while (!feof(pdb))
-    {
-        /* clear previous input line */
-        s[0] = 0;
-        for ( i = 0; i < c1; ++i )
-            tmp[i][0] = 0;
-
-        /* read new line and tokenize it */
-        fgets( s, MAX_LINE, pdb );
-        strncpy( s1, s, MAX_LINE - 1 );
-        c1 = Tokenize( s, &tmp );
-
-        /* process new line */
-        if ( strncmp(tmp[0], "ATOM", 4) == 0 || strncmp(tmp[0], "HETATM", 6) == 0 )
-        {
-            if ( strncmp(tmp[0], "ATOM", 4) == 0 )
-            {
-                strncpy( &descriptor[0], s1, 6 );
-                descriptor[6] = 0;
-                strncpy( &serial[0], s1 + 6, 5 );
-                serial[5] = 0;
-                strncpy( &atom_name[0], s1 + 12, 4 );
-                atom_name[4] = 0;
-                alt_loc = s1[16];
-                strncpy( &res_name[0], s1 + 17, 3 );
-                res_name[3] = 0;
-                chain_id = s1[21];
-                strncpy( &res_seq[0], s1 + 22, 4 );
-                res_seq[4] = 0;
-                icode = s1[26];
-                strncpy( &s_x[0], s1 + 30, 8 );
-                s_x[8] = 0;
-                strncpy( &s_y[0], s1 + 38, 8 );
-                s_y[8] = 0;
-                strncpy( &s_z[0], s1 + 46, 8 );
-                s_z[8] = 0;
-                strncpy( &occupancy[0], s1 + 54, 6 );
-                occupancy[6] = 0;
-                strncpy( &temp_factor[0], s1 + 60, 6 );
-                temp_factor[6] = 0;
-                strncpy( &seg_id[0], s1 + 72, 4 );
-                seg_id[4] = 0;
-                strncpy( &element[0], s1 + 76, 2 );
-                element[2] = 0;
-                strncpy( &charge[0], s1 + 78, 2 );
-                charge[2] = 0;
-            }
-            else if (strncmp(tmp[0], "HETATM", 6) == 0)
-            {
-                strncpy( &descriptor[0], s1, 6 );
-                descriptor[6] = 0;
-                strncpy( &serial[0], s1 + 6, 5 );
-                serial[5] = 0;
-                strncpy( &atom_name[0], s1 + 12, 4 );
-                atom_name[4] = 0;
-                alt_loc = s1[16];
-                strncpy( &res_name[0], s1 + 17, 3 );
-                res_name[3] = 0;
-                chain_id = s1[21];
-                strncpy( &res_seq[0], s1 + 22, 4 );
-                res_seq[4] = 0;
-                icode = s1[26];
-                strncpy( &s_x[0], s1 + 30, 8 );
-                s_x[8] = 0;
-                strncpy( &s_y[0], s1 + 38, 8 );
-                s_y[8] = 0;
-                strncpy( &s_z[0], s1 + 46, 8 );
-                s_z[8] = 0;
-                strncpy( &occupancy[0], s1 + 54, 6 );
-                occupancy[6] = 0;
-                strncpy( &temp_factor[0], s1 + 60, 6 );
-                temp_factor[6] = 0;
-                //strncpy( &seg_id[0], s1+72, 4 );      seg_id[4] = 0;
-                strncpy( &element[0], s1 + 76, 2 );
-                element[2] = 0;
-                strncpy( &charge[0], s1 + 78, 2 );
-                charge[2] = 0;
-            }
-
-
-            /* add to mapping */
-            pdb_serial = strtod( &serial[0], &endptr );
-            Check_Input_Range( pdb_serial, 0, MAX_ATOM_ID, "Invalid pdb_serial" );
-            workspace->map_serials[ pdb_serial ] = c;
-            workspace->orig_id[ c ] = pdb_serial;
-            // fprintf( stderr, "map %d --> %d\n", pdb_serial, c );
-
-
-            /* copy atomic positions */
-            system->atoms[c].x[0] = strtod( &s_x[0], &endptr );
-            system->atoms[c].x[1] = strtod( &s_y[0], &endptr );
-            system->atoms[c].x[2] = strtod( &s_z[0], &endptr );
-
-            /* atom name and type */
-            strcpy( system->atoms[c].name, atom_name );
-            Trim_Spaces( element );
-            system->atoms[c].type = Get_Atom_Type( &(system->reaxprm), element );
-
-            /* fprintf( stderr,
-            "%d%8.3f%8.3f%8.3fq:%8.3f occ:%s temp:%s seg_id:%s element:%s\n",
-             system->atoms[c].type,
-             system->atoms[c].x[0], system->atoms[c].x[1], system->atoms[c].x[2],
-             system->atoms[c].q, occupancy, temp_factor, seg_id, element ); */
-            c++;
-        }
-        else if (!strncmp( tmp[0], "CRYST1", 6 ))
-        {
-            sscanf( s1, PDB_CRYST1_FORMAT,
-                    &descriptor[0],
-                    &s_a[0],
-                    &s_b[0],
-                    &s_c[0],
-                    &s_alpha[0],
-                    &s_beta[0],
-                    &s_gamma[0],
-                    &s_group[0],
-                    &s_zValue[0] );
-
-            /* Compute full volume tensor from the angles */
-            Init_Box_From_CRYST( atof(s_a),  atof(s_b), atof(s_c),
-                                 atof(s_alpha), atof(s_beta), atof(s_gamma),
-                                 &(system->box) );
-        }
-
-        /* IMPORTANT: We do not check for the soundness of restrictions here.
-           When atom2 is on atom1's restricted list, and there is a restriction on
-           atom2, then atom1 has to be on atom2's restricted list, too. However,
-           we do not check if this is the case in the input file,
-           this is upto the user. */
-        else if (!strncmp( tmp[0], "CONECT", 6 ))
-        {
-            /* error check */
-            //fprintf(stderr, "CONECT: %d\n", c1 );
-            Check_Input_Range( c1 - 2, 0, MAX_RESTRICT,
-                               "CONECT line exceeds max restrictions allowed.\n" );
-
-            /* read bond restrictions */
-            if ( is_Valid_Serial( workspace, pdb_serial = atoi(tmp[1]) ) )
-                ratom = workspace->map_serials[ pdb_serial ];
-
-            workspace->restricted[ ratom ] = c1 - 2;
-            for ( i = 2; i < c1; ++i )
-            {
-                if ( is_Valid_Serial( workspace, pdb_serial = atoi(tmp[i]) ) )
-                    workspace->restricted_list[ (ratom * MAX_RESTRICT) +  (i - 2) ] =
-                        workspace->map_serials[ pdb_serial ];
-            }
-
-            /* fprintf( stderr, "restriction on %d:", ratom );
-            for( i = 0; i < workspace->restricted[ ratom ]; ++i )
-             fprintf( stderr, "  %d", workspace->restricted_list[ratom][i] );
-             fprintf( stderr, "\n" ); */
-        }
-    }
-
-    fclose(pdb);
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "pdb file read\n" );
-#endif
-
-    return 1;
-}
-
-
-char Write_PDB( reax_system* system, control_params *control,
-                simulation_data *data, static_storage *workspace,
-                list* bonds, output_controls *out_control )
-{
-    int  i, j, k, count;
-    int  connect[4];
-    char temp[MAX_STR], name[10];
-    real bo;
-    real alpha, beta, gamma;
-
-
-    /* open output pdb file */
-    sprintf( temp, "%s%d.pdb", control->sim_name, data->step );
-    out_control->pdb = fopen( temp, "w" );
-
-
-    /* Writing Box information */
-    /* Write full volume tensor from the angles (as soon as possible) TODO_SOON */
-    gamma = acos( (system->box.box[0][0] * system->box.box[1][0] +
-                   system->box.box[0][1] * system->box.box[1][1] +
-                   system->box.box[0][2] * system->box.box[1][2]) /
-                  (system->box.box_norms[0] * system->box.box_norms[1]));
-    beta  = acos( (system->box.box[0][0] * system->box.box[2][0] +
-                   system->box.box[0][1] * system->box.box[2][1] +
-                   system->box.box[0][2] * system->box.box[2][2]) /
-                  (system->box.box_norms[0] * system->box.box_norms[2]));
-    alpha = acos( (system->box.box[2][0] * system->box.box[1][0] +
-                   system->box.box[2][1] * system->box.box[1][1] +
-                   system->box.box[2][2] * system->box.box[1][2]) /
-                  (system->box.box_norms[2] * system->box.box_norms[1]));
-
-    fprintf(out_control->pdb, PDB_CRYST1_FORMAT_O,
-            "CRYST1",
-            system->box.box_norms[0],
-            system->box.box_norms[1],
-            system->box.box_norms[2],
-            RAD2DEG(alpha),
-            RAD2DEG(beta),
-            RAD2DEG(gamma),
-            " ",
-            0);
-    fprintf( out_control->log, "Box written\n" );
-    fflush( out_control->log );
-
-    /* Writing atom information */
-    for (i = 0; i < system->N; i++)
-    {
-        strncpy( name, system->reaxprm.sbp[system->atoms[i].type].name, 2 );
-        name[2] = '\0';
-        fprintf( out_control->pdb, PDB_ATOM_FORMAT_O,
-                 "ATOM  ",
-                 workspace->orig_id[i],
-                 name,
-                 ' ',
-                 "REX",
-                 ' ',
-                 1,
-                 ' ',
-                 system->atoms[i].x[0],
-                 system->atoms[i].x[1],
-                 system->atoms[i].x[2],
-                 1.0,
-                 0.0,
-                 "0",
-                 name,
-                 "  " );
-    }
-
-    fprintf( out_control->log, "ATOM written\n" );
-    fflush( out_control->log );
-
-    /* Writing connect information */
-    for (i = 0; i < system->N; i++)
-    {
-        count = 0;
-
-        for (j = Start_Index(i, bonds); j < End_Index(i, bonds); ++j)
-        {
-            bo = bonds->select.bond_list[j].bo_data.BO;
-            if (bo > 0.3)
-            {
-                connect[count] = workspace->orig_id[bonds->select.bond_list[j].nbr];
-                count++;
-            }
-        }
-
-        fprintf( out_control->pdb, "%6s%6d", "CONECT", workspace->orig_id[i] );
-        for ( k = 0; k < count; k++ )
-            fprintf( out_control->pdb, "%6d", connect[k] );
-        fprintf( out_control->pdb, "\n" );
-    }
-
-    fprintf( out_control->pdb, "END\n" );
-
-    fclose( out_control->pdb );
-
-    return 1;
-}
-
-
-char Read_BGF( char* bgf_file, reax_system* system, control_params *control,
-               simulation_data *data, static_storage *workspace )
-{
-    FILE *bgf;
-    char **tokens;
-    char *line, *backup;
-    char descriptor[10], serial[10];
-    char atom_name[10], res_name[10], res_seq[10];
-    char s_x[12], s_y[12], s_z[12];
-    char occupancy[10], temp_factor[10];
-    char element[10], charge[10];
-    char chain_id;
-    char s_a[12], s_b[12], s_c[12], s_alpha[12], s_beta[12], s_gamma[12];
-    char *endptr = NULL;
-    int  i, atom_cnt, token_cnt, bgf_serial, ratom = 0;
-
-    /* open biograf file */
-    if ( (bgf = fopen( bgf_file, "r" )) == NULL )
-    {
-        fprintf( stderr, "Error opening the bgf file!\n" );
-        exit( FILE_NOT_FOUND_ERR );
-    }
-
-
-    /* allocate memory for tokenizing biograf file lines */
-    line   = (char*)  malloc( sizeof(char)  * MAX_LINE );
-    backup = (char*)  malloc( sizeof(char)  * MAX_LINE );
-    tokens = (char**) malloc( sizeof(char*) * MAX_TOKENS );
-    for ( i = 0; i < MAX_TOKENS; i++ )
-        tokens[i] = (char*) malloc( sizeof(char) * MAX_TOKEN_LEN );
-
-
-    /* count number of atoms in the pdb file */
-    system->N = 0;
-    while ( !feof( bgf ) )
-    {
-        line[0] = 0;
-        fgets( line, MAX_LINE, bgf );
-
-        tokens[0][0] = 0;
-        token_cnt = Tokenize( line, &tokens );
-
-        if ( !strcmp( tokens[0], "ATOM" ) || !strcmp( tokens[0], "HETATM" ) )
-            (system->N)++;
-    }
-    //fprintf( stderr, "system->N: %d\n", system->N );
-    fclose( bgf );
-
-
-    /* memory allocations for atoms, atom maps, bond restrictions */
-    system->atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
-
-    workspace->map_serials = (int*) calloc( MAX_ATOM_ID, sizeof(int) );
-    for ( i = 0; i < MAX_ATOM_ID; ++i )
-        workspace->map_serials[i] = -1;
-
-    workspace->orig_id = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted  = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted_list = (int*) calloc( system->N * MAX_RESTRICT, sizeof(int) );
-    //for( i = 0; i < system->N; ++i )
-    // workspace->restricted_list[i] = (int*) calloc( MAX_RESTRICT, sizeof(int) );
-
-
-    /* start reading and processing pdb file */
-    bgf = fopen( bgf_file, "r" );
-    atom_cnt = 0;
-    token_cnt = 0;
-
-    while ( !feof( bgf ) )
-    {
-        /* clear previous input line */
-        line[0] = 0;
-        for ( i = 0; i < token_cnt; ++i )
-            tokens[i][0] = 0;
-
-        /* read new line and tokenize it */
-        fgets( line, MAX_LINE, bgf );
-        strncpy( backup, line, MAX_LINE - 1 );
-        token_cnt = Tokenize( line, &tokens );
-
-        /* process new line */
-        if ( !strncmp(tokens[0], "ATOM", 4) || !strncmp(tokens[0], "HETATM", 6) )
-        {
-            if ( !strncmp(tokens[0], "ATOM", 4) )
-            {
-                strncpy( &descriptor[0], backup, 6 );
-                descriptor[6] = 0;
-                strncpy( &serial[0], backup + 7, 5 );
-                serial[5] = 0;
-                strncpy( &atom_name[0], backup + 13, 5 );
-                atom_name[5] = 0;
-                strncpy( &res_name[0], backup + 19, 3 );
-                res_name[3] = 0;
-                chain_id = backup[23];
-                strncpy( &res_seq[0], backup + 25, 5 );
-                res_seq[5] = 0;
-                strncpy( &s_x[0], backup + 30, 10 );
-                s_x[10] = 0;
-                strncpy( &s_y[0], backup + 40, 10 );
-                s_y[10] = 0;
-                strncpy( &s_z[0], backup + 50, 10 );
-                s_z[10] = 0;
-                strncpy( &element[0], backup + 61, 5 );
-                element[5] = 0;
-                strncpy( &occupancy[0], backup + 66, 3 );
-                occupancy[3] = 0;
-                strncpy( &temp_factor[0], backup + 69, 2 );
-                temp_factor[2] = 0;
-                strncpy( &charge[0], backup + 72, 8 );
-                charge[8] = 0;
-            }
-            else if ( !strncmp(tokens[0], "HETATM", 6) )
-            {
-                /* bgf hetatm:
-                   (7x,i5,1x,a5,1x,a3,1x,a1,1x,a5,3f10.5,1x,a5,i3,i2,1x,f8.5) */
-                strncpy( &descriptor[0], backup, 6 );
-                descriptor[6] = 0;
-                strncpy( &serial[0], backup + 7, 5 );
-                serial[5] = 0;
-                strncpy( &atom_name[0], backup + 13, 5 );
-                atom_name[5] = 0;
-                strncpy( &res_name[0], backup + 19, 3 );
-                res_name[3] = 0;
-                chain_id = backup[23];
-                strncpy( &res_seq[0], backup + 25, 5 );
-                res_seq[5] = 0;
-                strncpy( &s_x[0], backup + 30, 10 );
-                s_x[10] = 0;
-                strncpy( &s_y[0], backup + 40, 10 );
-                s_y[10] = 0;
-                strncpy( &s_z[0], backup + 50, 10 );
-                s_z[10] = 0;
-                strncpy( &element[0], backup + 61, 5 );
-                element[5] = 0;
-                strncpy( &occupancy[0], backup + 66, 3 );
-                occupancy[3] = 0;
-                strncpy( &temp_factor[0], backup + 69, 2 );
-                temp_factor[2] = 0;
-                strncpy( &charge[0], backup + 72, 8 );
-                charge[8] = 0;
-            }
-
-
-            /* add to mapping */
-            bgf_serial = strtod( &serial[0], &endptr );
-            Check_Input_Range( bgf_serial, 0, MAX_ATOM_ID, "Invalid bgf serial" );
-            workspace->map_serials[ bgf_serial ] = atom_cnt;
-            workspace->orig_id[ atom_cnt ] = bgf_serial;
-            // fprintf( stderr, "map %d --> %d\n", bgf_serial, atom_cnt );
-
-
-            /* copy atomic positions */
-            system->atoms[atom_cnt].x[0] = strtod( &s_x[0], &endptr );
-            system->atoms[atom_cnt].x[1] = strtod( &s_y[0], &endptr );
-            system->atoms[atom_cnt].x[2] = strtod( &s_z[0], &endptr );
-
-
-            /* atom name and type */
-            //BGF_FIX
-            atom_name[4] = 0;
-            //BGF_FIX
-
-            strcpy( system->atoms[atom_cnt].name, atom_name );
-            Trim_Spaces( element );
-            system->atoms[atom_cnt].type =
-                Get_Atom_Type( &(system->reaxprm), element );
-
-            /* fprintf( stderr,
-            "a:%3d(%1d) c:%10.5f%10.5f%10.5f q:%10.5f occ:%s temp:%s seg_id:%s element:%s\n",
-             atom_cnt, system->atoms[ atom_cnt ].type,
-             system->atoms[ atom_cnt ].x[0],
-             system->atoms[ atom_cnt ].x[1], system->atoms[ atom_cnt ].x[2],
-             system->atoms[ atom_cnt ].q, occupancy, temp_factor,
-             seg_id, element ); */
-
-            atom_cnt++;
-        }
-        else if (!strncmp( tokens[0], "CRYSTX", 6 ))
-        {
-            sscanf( backup, BGF_CRYSTX_FORMAT,
-                    &descriptor[0],
-                    &s_a[0],
-                    &s_b[0],
-                    &s_c[0],
-                    &s_alpha[0],
-                    &s_beta[0],
-                    &s_gamma[0] );
-
-            /* Compute full volume tensor from the angles */
-            Init_Box_From_CRYST( atof(s_a),  atof(s_b), atof(s_c),
-                                 atof(s_alpha), atof(s_beta), atof(s_gamma),
-                                 &(system->box) );
-        }
-        else if (!strncmp( tokens[0], "CONECT", 6 ))
-        {
-            /* check number of restrictions */
-            Check_Input_Range( token_cnt - 2, 0, MAX_RESTRICT,
-                               "CONECT line exceeds max restrictions allowed.\n" );
-
-            /* read bond restrictions */
-            if ( is_Valid_Serial( workspace, bgf_serial = atoi(tokens[1]) ) )
-                ratom = workspace->map_serials[ bgf_serial ];
-
-            workspace->restricted[ ratom ] = token_cnt - 2;
-            for ( i = 2; i < token_cnt; ++i )
-                if ( is_Valid_Serial( workspace, bgf_serial = atoi(tokens[i]) ) )
-                    workspace->restricted_list[ (ratom * MAX_RESTRICT) + (i - 2) ] =
-                        workspace->map_serials[ bgf_serial ];
-
-            /* fprintf( stderr, "restriction on %d:", ratom );
-            for( i = 0; i < workspace->restricted[ ratom ]; ++i )
-             fprintf( stderr, "  %d", workspace->restricted_list[ratom][i] );
-             fprintf( stderr, "\n" ); */
-        }
-    }
-
-    fclose( bgf );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "bgf file read\n" );
-#endif
-
-    return 1;
-}
diff --git a/PuReMD-GPU/src/print_utils.c b/PuReMD-GPU/src/print_utils.c
index 913ff617a23f9f395a300a0985dec7ad36c33fab..d0f0e1bad12720a793074fbbc1e4d194ebb8fd8e 100644
--- a/PuReMD-GPU/src/print_utils.c
+++ b/PuReMD-GPU/src/print_utils.c
@@ -19,9 +19,11 @@
   ----------------------------------------------------------------------*/
 
 #include "print_utils.h"
+
+#include "geo_tools.h"
 #include "list.h"
-#include "pdb_tools.h"
 #include "system_props.h"
+#include "tool_box.h"
 #include "vector.h"
 
 
@@ -374,18 +376,6 @@ void Init_Force_Test_Functions( )
 #endif
 
 
-char *Get_Element( reax_system *system, int i )
-{
-    return &( system->reaxprm.sbp[system->atoms[i].type].name[0] );
-}
-
-
-char *Get_Atom_Name( reax_system *system, int i )
-{
-    return &(system->atoms[i].name[0]);
-}
-
-
 /* near nbrs contain both i-j, j-i nbrhood info */
 void Print_Near_Neighbors( reax_system *system, control_params *control,
                            static_storage *workspace, list **lists )
@@ -625,35 +615,48 @@ void Output_Results( reax_system *system, control_params *control,
                  data->E_vdW, data->E_Ele, data->E_Pol );
 #endif
 
-#ifdef __PRINT_CPU_RESULTS__
+#ifndef HAVE_CUDA
         t_elapsed = Get_Timing_Info( data->timing.total );
         if ( data->step == data->prev_steps )
             f_update = 1;
         else f_update = out_control->energy_update_freq;
 
-        fprintf( out_control->log, "%6d%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f\n",
+        fprintf( out_control->log, "%6d %10.2f %10.2f %10.2f %10.2f %10.2f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n",
                  data->step, t_elapsed / f_update,
                  data->timing.nbrs / f_update,
                  data->timing.init_forces / f_update,
                  data->timing.bonded / f_update,
                  data->timing.nonb / f_update,
                  data->timing.QEq / f_update,
-                 (double)data->timing.matvecs / f_update );
+                 data->timing.QEq_sort_mat_rows / f_update,
+                 (double)data->timing.solver_iters / f_update,
+                 data->timing.pre_comp / f_update,
+                 data->timing.pre_app / f_update,
+                 data->timing.solver_spmv / f_update,
+                 data->timing.solver_vector_ops / f_update,
+                 data->timing.solver_orthog / f_update,
+                 data->timing.solver_tri_solve / f_update );
 #else
         t_elapsed = Get_Timing_Info( d_timing.total );
         if ( data->step == data->prev_steps )
             f_update = 1;
         else f_update = out_control->energy_update_freq;
 
-        fprintf( out_control->log, "%6d%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f\n",
+        fprintf( out_control->log, "%6d %10.2f %10.2f %10.2f %10.2f %10.2f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n",
                  data->step, t_elapsed / f_update,
-                 d_timing.nbrs / f_update,
-                 d_timing.init_forces / f_update,
-                 d_timing.bonded / f_update,
-                 d_timing.nonb / f_update,
-                 d_timing.QEq / f_update,
-                 (double)d_timing.matvecs / f_update );
-
+                 d_timing.nbrs / f_update, /* d_timing is used as an object (cf. d_timing.total above), so members are reached with '.' */
+                 d_timing.init_forces / f_update,
+                 d_timing.bonded / f_update,
+                 d_timing.nonb / f_update,
+                 d_timing.QEq / f_update,
+                 d_timing.QEq_sort_mat_rows / f_update,
+                 (double)d_timing.solver_iters / f_update, /* cast before dividing so the %10.2f conversion receives a double */
+                 d_timing.pre_comp / f_update,
+                 d_timing.pre_app / f_update,
+                 d_timing.solver_spmv / f_update,
+                 d_timing.solver_vector_ops / f_update,
+                 d_timing.solver_orthog / f_update,
+                 d_timing.solver_tri_solve / f_update );
 #endif
 
         //fprintf (stderr, " total %10.5f \n", t_elapsed);
@@ -673,16 +676,32 @@ void Output_Results( reax_system *system, control_params *control,
         data->timing.init_forces = 0;
         data->timing.bonded = 0;
         data->timing.nonb = 0;
-        data->timing.QEq = 0;
-        data->timing.matvecs = 0;
-
+        data->timing.QEq = ZERO;
+        data->timing.QEq_sort_mat_rows = ZERO;
+        data->timing.pre_comp = ZERO;
+        data->timing.pre_app = ZERO;
+        data->timing.solver_iters = 0;
+        data->timing.solver_spmv = ZERO;
+        data->timing.solver_vector_ops = ZERO;
+        data->timing.solver_orthog = ZERO;
+        data->timing.solver_tri_solve = ZERO;
+
+#ifdef HAVE_CUDA
         d_timing.total = Get_Time( );
         d_timing.nbrs = 0;
         d_timing.init_forces = 0;
         d_timing.bonded = 0;
         d_timing.nonb = 0;
-        d_timing.QEq = 0;
-        d_timing.matvecs = 0;
+        d_timing.QEq = ZERO; /* same object as d_timing.total / d_timing.nbrs above: member access with '.', not '->timing.' */
+        d_timing.QEq_sort_mat_rows = ZERO;
+        d_timing.pre_comp = ZERO;
+        d_timing.pre_app = ZERO;
+        d_timing.solver_iters = 0; /* reset with integer 0, unlike the ZERO-initialised timers around it */
+        d_timing.solver_spmv = ZERO;
+        d_timing.solver_vector_ops = ZERO;
+        d_timing.solver_orthog = ZERO;
+        d_timing.solver_tri_solve = ZERO;
+#endif
 
         fflush( out_control->out );
         fflush( out_control->pot );
@@ -716,16 +735,16 @@ void Output_Results( reax_system *system, control_params *control,
     if ( out_control->write_steps > 0 &&
             data->step % out_control->write_steps == 0 )
     {
-        // t_start = Get_Time( );
+        //t_start = Get_Time( );
         out_control->append_traj_frame( system, control, data,
                                         workspace, lists, out_control );
 
-        //Write_PDB( system, control, data, workspace, *lists+BONDS, out_control );
-        // t_elapsed = Get_Timing_Info( t_start );
-        // fprintf(stdout, "append_frame took %.6f seconds\n", t_elapsed );
+        //Write_PDB( system, *lists+BONDS, data, control, workspace, out_control );
+        //t_elapsed = Get_Timing_Info( t_start );
+        //fprintf(stdout, "append_frame took %.6f seconds\n", t_elapsed );
     }
 
-    // fprintf( stderr, "output_results... done\n" );
+    //fprintf( stderr, "output_results... done\n" );
 }
 
 
@@ -759,23 +778,46 @@ void Print_Linear_System( reax_system *system, control_params *control,
 
     sprintf( fname, "%s.H%d.out", control->sim_name, step );
     out = fopen( fname, "w" );
-    H = &workspace->H;
+    H = workspace->H;
 
     for ( i = 0; i < system->N; ++i )
     {
         for ( j = H->start[i]; j < H->start[i + 1] - 1; ++j )
         {
             fprintf( out, "%6d%6d %24.15e\n",
-                     workspace->orig_id[i], workspace->orig_id[H->entries[j].j],
-                     H->entries[j].val );
+                     workspace->orig_id[i], workspace->orig_id[H->j[j]],
+                     H->val[j] );
 
             fprintf( out, "%6d%6d %24.15e\n",
-                     workspace->orig_id[H->entries[j].j], workspace->orig_id[i],
-                     H->entries[j].val );
+                     workspace->orig_id[H->j[j]], workspace->orig_id[i],
+                     H->val[j] );
         }
         // the diagonal entry
         fprintf( out, "%6d%6d %24.15e\n",
-                 workspace->orig_id[i], workspace->orig_id[i], H->entries[j].val );
+                 workspace->orig_id[i], workspace->orig_id[i], H->val[j] );
+    }
+
+    fclose( out );
+
+    sprintf( fname, "%s.H_sp%d.out", control->sim_name, step ); // second output file: <sim_name>.H_sp<step>.out
+    out = fopen( fname, "w" ); // NOTE(review): result is not checked before the fprintf calls below
+    H = workspace->H_sp; // same dump as for workspace->H above, now reading workspace->H_sp
+
+    for ( i = 0; i < system->N; ++i ) // one pass per row i
+    {
+        for ( j = H->start[i]; j < H->start[i + 1] - 1; ++j ) // every stored entry of row i except the last
+        {
+            fprintf( out, "%6d%6d %24.15e\n", // entry written with original ids in (row, column) order
+                     workspace->orig_id[i], workspace->orig_id[H->j[j]],
+                     H->val[j] );
+
+            fprintf( out, "%6d%6d %24.15e\n", // same value again with the two ids swapped
+                     workspace->orig_id[H->j[j]], workspace->orig_id[i],
+                     H->val[j] );
+        }
+        // the diagonal entry: for a non-empty row j is now H->start[i + 1] - 1, the last stored entry, written once
+        fprintf( out, "%6d%6d %24.15e\n",
+                 workspace->orig_id[i], workspace->orig_id[i], H->val[j] );
     }
 
     fclose( out );
@@ -834,11 +876,11 @@ void Print_Sparse_Matrix( sparse_matrix *A )
 {
     int i, j;
 
-    for ( i = 0; i < 10; ++i )
+    for ( i = 0; i < A->n; ++i )
     {
         fprintf( stderr, "i:%d  j(val):", i );
         for ( j = A->start[i]; j < A->start[i + 1]; ++j )
-            fprintf( stderr, "%d(%.4f) ", A->entries[j].j, A->entries[j].val );
+            fprintf( stderr, "%d(%.4f) ", A->j[j], A->val[j] );
         fprintf( stderr, "\n" );
     }
 }
@@ -850,8 +892,14 @@ void Print_Sparse_Matrix2( sparse_matrix *A, char *fname )
     FILE *f = fopen( fname, "w" );
 
     for ( i = 0; i < A->n; ++i )
+    {
         for ( j = A->start[i]; j < A->start[i + 1]; ++j )
-            fprintf( f, "%d%d %.15e\n", A->entries[j].j, i, A->entries[j].val );
+        {
+            //fprintf( f, "%d%d %.15e\n", A->entries[j].j, i, A->entries[j].val );
+            //Convert 0-based to 1-based (for Matlab)
+            fprintf( f, "%6d%6d %24.15e\n", i+1, A->j[j]+1, A->val[j] );
+        }
+    }
 
     fclose(f);
 }
diff --git a/PuReMD-GPU/src/print_utils.h b/PuReMD-GPU/src/print_utils.h
index 5f479bdc99fa30c518f69b5a23fa88b19af1a306..46d08516e00b002792d507b5effd7bd1ee5d551d 100644
--- a/PuReMD-GPU/src/print_utils.h
+++ b/PuReMD-GPU/src/print_utils.h
@@ -23,28 +23,25 @@
 
 #include "mytypes.h"
 
+
 typedef void (*print_interaction)(reax_system*, control_params*, simulation_data*,
-                                  static_storage*, list**, output_controls*);
-extern print_interaction Print_Interactions[NO_OF_INTERACTIONS];
+        static_storage*, list**, output_controls*);
 
-char *Get_Element( reax_system*, int );
+extern print_interaction Print_Interactions[NO_OF_INTERACTIONS];
 
-char *Get_Atom_Name( reax_system*, int );
 
-void Print_Near_Neighbors( reax_system*, control_params*, static_storage*,
-                           list** );
+void Print_Near_Neighbors( reax_system*, control_params*, static_storage*, list** );
 
-void Print_Far_Neighbors( reax_system*, control_params*, static_storage*,
-                          list** );
+void Print_Far_Neighbors( reax_system*, control_params*, static_storage*, list** );
 
 void Print_Total_Force( reax_system*, control_params*, simulation_data*,
-                        static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 
 void Output_Results( reax_system*, control_params*, simulation_data*,
-                     static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 
 void Print_Bond_Orders( reax_system*, control_params*, simulation_data*,
-                        static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 
 void Print_Linear_System( reax_system*, control_params*, static_storage*, int );
 
@@ -61,23 +58,23 @@ void Print_Bond_List2( reax_system*, list*, char* );
 
 #ifdef TEST_FORCES
 void Dummy_Printer( reax_system*, control_params*, simulation_data*,
-                    static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 void Print_Bond_Forces( reax_system*, control_params*, simulation_data*,
-                        static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 void Print_LonePair_Forces( reax_system*, control_params*, simulation_data*,
-                            static_storage*, list**, output_controls* );
-void Print_OverUnderCoor_Forces(reax_system*, control_params*, simulation_data*,
-                                static_storage*, list**, output_controls*);
+        static_storage*, list**, output_controls* );
+void Print_OverUnderCoor_Forces(reax_system*, control_params*,
+        simulation_data*, static_storage*, list**, output_controls*);
 void Print_Three_Body_Forces( reax_system*, control_params*, simulation_data*,
-                              static_storage*, list**, output_controls* );
-void Print_Hydrogen_Bond_Forces(reax_system*, control_params*, simulation_data*,
-                                static_storage*, list**, output_controls*);
+        static_storage*, list**, output_controls* );
+void Print_Hydrogen_Bond_Forces(reax_system*, control_params*,
+        simulation_data*, static_storage*, list**, output_controls*);
 void Print_Four_Body_Forces( reax_system*, control_params*, simulation_data*,
-                             static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 void Print_vdW_Coulomb_Forces( reax_system*, control_params*, simulation_data*,
-                               static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 void Compare_Total_Forces( reax_system*, control_params*, simulation_data*,
-                           static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 void Init_Force_Test_Functions( );
 #endif
 
diff --git a/PuReMD-GPU/src/qeq.c b/PuReMD-GPU/src/qeq.c
new file mode 100644
index 0000000000000000000000000000000000000000..a319e89b872d34e3e3da7ca9121a8a8b63a4d83c
--- /dev/null
+++ b/PuReMD-GPU/src/qeq.c
@@ -0,0 +1,1667 @@
+/*----------------------------------------------------------------------
+  SerialReax - Reax Force Field Simulator
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "qeq.h"
+
+#include "allocate.h"
+#include "index_utils.h"
+#include "list.h"
+#include "lin_alg.h"
+#include "print_utils.h"
+#include "tool_box.h"
+#if defined(HAVE_SUPERLU_MT)
+#include "slu_mt_ddefs.h"
+#endif
+
+
+#if defined(TEST_MAT)
+/* Build a small hard-coded matrix for testing purposes.
+ *
+ * The matrix is 3x3 and SPD; only its lower half (diagonal included) is
+ * stored, row by row:
+ *
+ *      4
+ *     12   37
+ *    -16  -43   98
+ *
+ * Terminates the program with INSUFFICIENT_MEMORY if allocation fails.
+ * returns: pointer to the newly allocated matrix */
+static sparse_matrix * create_test_mat( void )
+{
+    /* column indices and values of the stored nonzeros, in storage order */
+    static const unsigned int cols[6] = { 0, 0, 1, 0, 1, 2 };
+    static const real vals[6] = { 4., 12., 37., -16., -43., 98. };
+    unsigned int row, nz, top;
+    sparse_matrix *mat;
+
+    if ( Allocate_Matrix( &mat, 3, 6 ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for test matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    top = 0;
+    for ( row = 0; row < 3; ++row )
+    {
+        /* row number `row` holds row + 1 entries: columns 0..row */
+        mat->start[row] = top;
+        for ( nz = 0; nz <= row; ++nz )
+        {
+            mat->j[top] = cols[top];
+            mat->val[top] = vals[top];
+            ++top;
+        }
+    }
+    /* closing row pointer: one past the last stored nonzero */
+    mat->start[row] = top;
+
+    return mat;
+}
+#endif
+
+
+/* qsort comparator ordering sparse matrix nonzeros by ascending column index.
+ * v1/v2: pointers to the column indices (unsigned int) being compared.
+ * returns: -1, 0 or 1; computed by comparison rather than subtraction, since an
+ *  unsigned difference wraps and can take the wrong sign once converted to int. */
+static int compare_matrix_entry(const void *v1, const void *v2)
+{
+    const unsigned int c1 = *(const unsigned int *)v1, c2 = *(const unsigned int *)v2;
+    return (c1 > c2) - (c1 < c2);
+}
+
+
+/* Put the nonzeros of every row of a sparse matrix into ascending column order;
+ *  the column indices of a row are sorted with qsort on a scratch copy, then
+ *  each value is moved to the slot where its column index ended up
+ *  (when multithreading, rows are mapped to threads, each with own scratch space)
+ * A: sparse matrix whose rows are sorted in place, stored in CSR format */
+static void Sort_Matrix_Rows( sparse_matrix * const A )
+{
+    unsigned int row, src, dst, beg, len, *cols;
+    real *vals;
+
+    #pragma omp parallel default(none) private(row, src, dst, beg, len, cols, vals) shared(stderr)
+    {
+        /* scratch buffers holding a copy of one row (A->n entries each) */
+        cols = (unsigned int*) malloc( A->n * sizeof(unsigned int) );
+        vals = (real*) malloc( A->n * sizeof(real) );
+        if ( cols == NULL || vals == NULL )
+        {
+            fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+
+        #pragma omp for schedule(guided)
+        for ( row = 0; row < A->n; ++row )
+        {
+            beg = A->start[row];
+            len = A->start[row + 1] - beg;
+            memcpy( cols, A->j + beg, sizeof(unsigned int) * len );
+            memcpy( vals, A->val + beg, sizeof(real) * len );
+
+            //TODO: consider implementing single custom one-pass sort instead of using qsort + manual sort
+            /* order the copied column indices */
+            qsort( cols, len, sizeof(unsigned int), compare_matrix_entry );
+
+            /* A->j still has the old order: look up where each column went */
+            for ( src = 0; src < len; ++src )
+            {
+                dst = 0;
+                while ( dst < len && A->j[beg + src] != cols[dst] )
+                {
+                    ++dst;
+                }
+                if ( dst < len )
+                {
+                    A->val[beg + dst] = vals[src];
+                }
+            }
+
+            /* finally store the sorted column indices */
+            memcpy( A->j + beg, cols, sizeof(unsigned int) * len );
+        }
+
+        free( vals );
+        free( cols );
+    }
+}
+
+
+/* Compute per-row drop tolerances: droptol[i] = dtol * sqrt( s_i ), where s_i
+ *  sums the squares of the entries counted for row i; a stored off-diagonal
+ *  A(i,j) counts for rows i and j, the last entry of a row (diagonal) once */
+static void Calculate_Droptol( const sparse_matrix * const A, real * const droptol,
+        const real dtol )
+{
+    int i, j, k;
+    real val;
+#ifdef _OPENMP
+    static real *droptol_local;
+    unsigned int tid;
+#endif
+
+    #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr)
+    {
+#ifdef _OPENMP
+        tid = omp_get_thread_num();
+
+        #pragma omp master
+        {
+            /* keep droptol_local for program duration to avoid allocate/free
+             * overhead per call; NOTE(review): never resized if A->n grows */
+            if ( droptol_local == NULL )
+            {
+                if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL )
+                {
+                    fprintf( stderr, "Not enough space for droptol. Terminating...\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+        }
+
+        #pragma omp barrier
+#endif
+
+        /* init droptol to 0 (with OpenMP, each thread clears its own slice) */
+        for ( i = 0; i < A->n; ++i )
+        {
+#ifdef _OPENMP
+            droptol_local[tid * A->n + i] = 0.0;
+#else
+            droptol[i] = 0.0;
+#endif
+        }
+
+        #pragma omp barrier
+
+        /* calculate square of the norm of each row */
+        #pragma omp for schedule(static)
+        for ( i = 0; i < A->n; ++i )
+        {
+            for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
+            {
+                j = A->j[k];
+                val = A->val[k];
+
+#ifdef _OPENMP
+                droptol_local[tid * A->n + i] += val * val;
+                droptol_local[tid * A->n + j] += val * val;
+#else
+                droptol[i] += val * val;
+                droptol[j] += val * val;
+#endif
+            }
+
+            val = A->val[k]; // diagonal entry
+#ifdef _OPENMP
+            droptol_local[tid * A->n + i] += val * val;
+#else
+            droptol[i] += val * val;
+#endif
+        }
+
+        #pragma omp barrier
+
+#ifdef _OPENMP
+        /* sum the per-thread partial results into droptol */
+        #pragma omp for schedule(static)
+        for ( i = 0; i < A->n; ++i )
+        {
+            droptol[i] = 0.0;
+            for ( k = 0; k < omp_get_num_threads(); ++k )
+            {
+                droptol[i] += droptol_local[k * A->n + i];
+            }
+        }
+#endif
+
+        #pragma omp barrier
+
+        /* calculate local droptol for each row */
+        #pragma omp for schedule(static)
+        for ( i = 0; i < A->n; ++i )
+        {
+            droptol[i] = SQRT( droptol[i] ) * dtol;
+        }
+    }
+}
+
+
+/* Estimate the number of nonzeros of a factor computed with threshold dropping:
+ *  the off-diagonals of A larger in magnitude than their row's tolerance plus
+ *  one diagonal per row (the last entry of each row is assumed to be the diagonal)
+ * A: matrix in CSR format; droptol: per-row tolerances; returns: the estimate */
+static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol )
+{
+    int i, pj, fillin;
+
+    fillin = 0;
+
+    #pragma omp parallel for schedule(static) \
+    default(none) private(i, pj) reduction(+: fillin)
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* last entry of the row is excluded here and accounted for by "+ A->n" */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            if ( FABS(A->val[pj]) > droptol[i] )
+            {
+                ++fillin;
+            }
+        }
+    }
+
+    return fillin + A->n;
+}
+
+
+#if defined(HAVE_SUPERLU_MT)
+/* Compute a complete LU factorization of A using SuperLU_MT.
+ * A: symmetric matrix in CSR format with only the lower half stored
+ * L / U: factors in CSR format; NOTE(review): only U is converted from the
+ *  SuperLU storage so far, the conversion of L is still commented out below */
+static real SuperLU_Factorize( const sparse_matrix * const A,
+        sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, pj, count, *Ltop, *Utop, r;
+    sparse_matrix *A_t;
+    SuperMatrix A_S, AC_S, L_S, U_S;
+    NCformat *A_S_store;
+    SCPformat *L_S_store;
+    NCPformat *U_S_store;
+    superlumt_options_t superlumt_options;
+    int_t nprocs;
+    fact_t fact;
+    trans_t trans;
+    yes_no_t refact, usepr;
+    real u, drop_tol;
+    real *a, *at;
+    int_t *asub, *atsub, *xa, *xat;
+    int_t *perm_c; /* column permutation vector */
+    int_t *perm_r; /* row permutations from partial pivoting */
+    void *work;
+    int_t info, lwork;
+    int_t permc_spec, panel_size, relax;
+    Gstat_t Gstat;
+    flops_t flopcnt;
+
+    /* Default parameters to control factorization. */
+#ifdef _OPENMP
+    //TODO: set as global parameter and use
+    #pragma omp parallel \
+    default(none) shared(nprocs)
+    {
+        #pragma omp master
+        {
+            /* SuperLU_MT spawns threads internally, so set and pass parameter */
+            nprocs = omp_get_num_threads();
+        }
+    }
+#else
+    nprocs = 1;
+#endif
+
+//    fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */
+    fact = DOFACT; /* factor from scratch */
+    trans = NOTRANS;
+    refact = NO; /* first time factorization */
+    //TODO: add to control file and use the value there to set these
+    panel_size = sp_ienv(1); /* # consec. cols treated as unit task */
+    relax = sp_ienv(2); /* # cols grouped as relaxed supernode */
+    u = 1.0; /* diagonal pivoting threshold */
+    usepr = NO;
+    drop_tol = 0.0;
+    work = NULL;
+    lwork = 0;
+
+//#if defined(DEBUG)
+    fprintf( stderr, "nprocs = %d\n", nprocs );
+    fprintf( stderr, "Panel size = %d\n", panel_size );
+    fprintf( stderr, "Relax = %d\n", relax );
+//#endif
+
+    if ( !(perm_r = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for perm_r[].");
+    }
+    if ( !(perm_c = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for perm_c[].");
+    }
+    if ( !(superlumt_options.etree = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for etree[].");
+    }
+    if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for colcnt_h[].");
+    }
+    if ( !(superlumt_options.part_super_h = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for part_super__h[].");
+    }
+    if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL )
+            || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL )
+            || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL )
+            || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL )
+            || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) )
+    {
+        fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* Set up the sparse matrix data structure for A. */
+    Transpose( A, A_t );
+
+    count = 0;
+    for ( i = 0; i < A->n; ++i )
+    {
+        xa[i] = count;
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            a[count] = A->val[pj];
+            asub[count] = A->j[pj];
+            ++count;
+        }
+        for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
+        {
+            a[count] = A_t->val[pj];
+            asub[count] = A_t->j[pj];
+            ++count;
+        }
+    }
+    xa[i] = count;
+
+    dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa,
+                         &at, &atsub, &xat );
+
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6d", asub[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6.1f", a[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i <= A->n; ++i )
+        fprintf( stderr, "%6d", xa[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6d", atsub[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6.1f", at[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i <= A->n; ++i )
+        fprintf( stderr, "%6d", xat[i] );
+    fprintf( stderr, "\n" );
+
+    A_S.Stype = SLU_NC; /* column-wise, no supernode */
+    A_S.Dtype = SLU_D; /* double-precision */
+    A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */
+    A_S.nrow = A->n;
+    A_S.ncol = A->n;
+    A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
+    A_S_store = (NCformat *) A_S.Store;
+    A_S_store->nnz = 2 * A->start[A->n] - A->n;
+    A_S_store->nzval = at;
+    A_S_store->rowind = atsub;
+    A_S_store->colptr = xat;
+
+    /* ------------------------------------------------------------
+       Allocate storage and initialize statistics variables.
+       ------------------------------------------------------------*/
+    StatAlloc( A->n, nprocs, panel_size, relax, &Gstat );
+    StatInit( A->n, nprocs, &Gstat );
+
+    /* ------------------------------------------------------------
+       Get column permutation vector perm_c[], according to permc_spec:
+       permc_spec = 0: natural ordering
+       permc_spec = 1: minimum degree ordering on structure of A'*A
+       permc_spec = 2: minimum degree ordering on structure of A'+A
+       permc_spec = 3: approximate minimum degree for unsymmetric matrices
+       ------------------------------------------------------------*/
+    permc_spec = 0;
+    get_perm_c( permc_spec, &A_S, perm_c );
+
+    /* ------------------------------------------------------------
+       Initialize the option structure superlumt_options using the
+       user-input parameters;
+       Apply perm_c to the columns of original A to form AC.
+       ------------------------------------------------------------*/
+    pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax,
+                  u, usepr, drop_tol, perm_c, perm_r,
+                  work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat );
+
+    for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] );
+    fprintf( stderr, "\n" );
+
+    /* ------------------------------------------------------------
+       Compute the LU factorization of A.
+       The following routine will create nprocs threads.
+       ------------------------------------------------------------*/
+    pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info );
+
+    fprintf( stderr, "INFO: %d\n", info );
+
+    flopcnt = 0;
+    for (i = 0; i < nprocs; ++i)
+    {
+        flopcnt += Gstat.procstat[i].fcops;
+    }
+    Gstat.ops[FACT] = flopcnt;
+
+//#if defined(DEBUG)
+    printf("\n** Result of sparse LU **\n");
+    L_S_store = (SCPformat *) L_S.Store;
+    U_S_store = (NCPformat *) U_S.Store;
+    printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz );
+    printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz );
+    fflush( stdout );
+//#endif
+
+    /* convert L and R from SuperLU formats to CSR */
+    memset( Ltop, 0, (A->n + 1) * sizeof(int) );
+    memset( Utop, 0, (A->n + 1) * sizeof(int) );
+    memset( L->start, 0, (A->n + 1) * sizeof(int) );
+    memset( U->start, 0, (A->n + 1) * sizeof(int) );
+
+    for ( i = 0; i < 2 * L_S_store->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < 2 * U_S_store->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] );
+    fprintf( stderr, "\n" );
+
+    printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper );
+    for ( i = 0; i < A->n; ++i )
+    {
+        fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] );
+        fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] );
+        //TODO: correct for SCPformat for L?
+        //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
+//        for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj )
+//        {
+//            ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1];
+//        }
+        fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] );
+        fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] );
+        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
+        {
+            ++Utop[U_S_store->rowind[pj] + 1];
+            fprintf( stderr, "Utop[%5d]     = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] );
+        }
+    }
+    for ( i = 1; i <= A->n; ++i )
+    {
+//        Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
+        Utop[i] = U->start[i] = Utop[i] + Utop[i - 1];
+    }
+    for ( i = 0; i < A->n; ++i )
+    {
+//        for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj )
+//        {
+//            r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj];
+//            L->j[Ltop[r]] = r;
+//            L->val[Ltop[r]] = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj];
+//            ++Ltop[r];
+//        }
+        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
+        {
+            r = U_S_store->rowind[pj];
+            U->j[Utop[r]] = i;
+            U->val[Utop[r]] = ((real*)U_S_store->nzval)[pj];
+            ++Utop[r];
+        }
+    }
+
+    /* ------------------------------------------------------------
+      Deallocate storage after factorization.
+      ------------------------------------------------------------*/
+    pxgstrf_finalize( &superlumt_options, &AC_S );
+    Deallocate_Matrix( A_t );
+    free( xa );
+    free( asub );
+    free( a );
+    SUPERLU_FREE( perm_r );
+    SUPERLU_FREE( perm_c );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval );
+    SUPERLU_FREE( A_S.Store );
+    if ( lwork == 0 )
+    {
+        Destroy_SuperNode_SCP(&L_S);
+        Destroy_CompCol_NCP(&U_S);
+    }
+    else if ( lwork > 0 )
+    {
+        SUPERLU_FREE(work);
+    }
+    StatFree(&Gstat);
+
+    free( Utop );
+    free( Ltop );
+
+    //TODO: return iters
+    return 0.;
+}
+#endif
+
+
+/* Jacobi (diagonal) preconditioner setup: Hdia_inv[i] is set to the reciprocal
+ *  of the eta parameter stored for the type of atom i
+ *
+ * returns: result of Get_Timing_Info for the time taken at function entry */
+static real diag_pre_comp( const reax_system * const system, real * const Hdia_inv )
+{
+    unsigned int atom;
+    real t_begin = Get_Time( );
+
+    #pragma omp parallel for schedule(static) default(none) private(atom)
+    for ( atom = 0; atom < system->N; ++atom )
+    {
+        Hdia_inv[atom] = 1.0 / system->reaxprm.sbp[system->atoms[atom].type].eta;
+    }
+
+    return Get_Timing_Info( t_begin );
+}
+
+
+/* Incomplete Cholesky factorization with dual thresholding
+ *
+ * A: matrix to factor, CSR format, read with the diagonal as the last entry
+ *  of each row (the routine exits with NUMERIC_BREAKDOWN otherwise)
+ * droptol: per-row drop tolerances
+ * L / U: computed factor and its transpose; returns: Get_Timing_Info( start ) */
+static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
+        sparse_matrix * const L, sparse_matrix * const U )
+{
+    int *tmp_j;
+    real *tmp_val;
+    int i, j, pj, k1, k2, tmptop, Ltop;
+    real val, start;
+    int *Utop;
+
+    start = Get_Time( );
+
+    if ( ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL ||
+            ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL ||
+            ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "not enough memory for ICHOLT preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    // clear variables
+    Ltop = 0;
+    tmptop = 0;
+    memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) );
+    memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
+    memset( Utop, 0, A->n * sizeof(unsigned int) );
+
+    //fprintf( stderr, "n: %d\n", A->n );
+    for ( i = 0; i < A->n; ++i )
+    {
+        L->start[i] = Ltop;
+        tmptop = 0;
+
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            j = A->j[pj];
+            val = A->val[pj];
+            /* first threshold: skip entries of A not above the row's tolerance */
+
+            if ( FABS(val) > droptol[i] )
+            {
+                /* subtract the products of entries that the partial row i (tmp)
+                 * and row j of L have in the same column */
+                k1 = 0;
+                k2 = L->start[j];
+                while ( k1 < tmptop && k2 < L->start[j + 1] )
+                {
+                    if ( tmp_j[k1] < L->j[k2] )
+                    {
+                        ++k1;
+                    }
+                    else if ( tmp_j[k1] > L->j[k2] )
+                    {
+                        ++k2;
+                    }
+                    else
+                    {
+                        val -= (tmp_val[k1++] * L->val[k2++]);
+                    }
+                }
+
+                // L matrix is lower triangular,
+                // so right before the start of next row comes jth diagonal
+                val /= L->val[L->start[j + 1] - 1];
+
+                tmp_j[tmptop] = j;
+                tmp_val[tmptop] = val;
+                ++tmptop;
+            }
+        }
+
+        // sanity check
+        if ( A->j[pj] != i )
+        {
+            fprintf( stderr, "i=%d, badly built A matrix!\n", i );
+            exit( NUMERIC_BREAKDOWN );
+        }
+
+        // compute the ith diagonal in L
+        val = A->val[pj];
+        for ( k1 = 0; k1 < tmptop; ++k1 )
+        {
+            val -= (tmp_val[k1] * tmp_val[k1]);
+        }
+
+        tmp_j[tmptop] = i;
+        tmp_val[tmptop] = SQRT(val);
+
+        // apply the dropping rule once again
+        for ( k1 = 0; k1 < tmptop; ++k1 )
+        {
+            if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] )
+            {
+                L->j[Ltop] = tmp_j[k1];
+                L->val[Ltop] = tmp_val[k1];
+                U->start[tmp_j[k1] + 1]++;
+                ++Ltop;
+                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
+            }
+        }
+        // keep the diagonal in any case
+        L->j[Ltop] = tmp_j[k1];
+        L->val[Ltop] = tmp_val[k1];
+        ++Ltop;
+        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
+    }
+
+    L->start[i] = Ltop;
+//    fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
+
+    /* U = L^T (Cholesky factorization) */
+    Transpose( L, U );
+//    for ( i = 1; i <= U->n; ++i )
+//    {
+//        Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
+//    }
+//    for ( i = 0; i < L->n; ++i )
+//    {
+//        for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
+//        {
+//            j = L->j[pj];
+//            U->j[Utop[j]] = i;
+//            U->val[Utop[j]] = L->val[pj];
+//            Utop[j]++;
+//        }
+//    }
+
+//    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
+
+    free( tmp_val );
+    free( tmp_j );
+    free( Utop );
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Fine-grained (parallel) incomplete Cholesky factorization
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp.
+ *
+ * A: symmetric, half-stored (lower triangular), CSR format
+ * sweeps: number of loops over non-zeros for computation
+ * U_t / U: factorized triangular matrices (A \approx U^{T}U), CSR format */
+static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
+        sparse_matrix * const U_t, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
+            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(D_inv, D) private(i)
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
+        D[i] = 1. / D_inv[i];
+    }
+
+    memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) );
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * transformation DAD, where D = D(1./sqrt(D(A))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+    #pragma omp parallel for schedule(guided) \
+        default(none) shared(DAD, D_inv, D) private(i, pj)
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.;
+    }
+
+    /* initial guesses for U^T,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) );
+    memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) );
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero (values are updated in place, no synchronization) */
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
+        for ( j = 0; j < A->start[A->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 0; k <= A->n; ++k )
+            {
+                if ( U_t->start[k] > j )
+                {
+                    x = U_t->start[k - 1];
+                    ei_x = U_t->start[k];
+                    break;
+                }
+            }
+            /* column bounds of current nonzero */
+            y = U_t->start[U_t->j[j]];
+            ei_y = U_t->start[U_t->j[j] + 1];
+
+            /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
+            while ( x < ei_x && y < ei_y &&
+                    U_t->j[x] < U_t->j[j] &&
+                    U_t->j[y] < U_t->j[j] )
+            {
+                if ( U_t->j[x] == U_t->j[y] )
+                {
+                    sum += (U_t->val[x] * U_t->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U_t->j[x] < U_t->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            sum = DAD->val[j] - sum;
+
+            /* diagonal entries */
+            if ( (k - 1) == U_t->j[j] )
+            {
+                /* sanity check */
+                if ( sum < ZERO )
+                {
+                    fprintf( stderr, "Numeric breakdown in ICHOL Terminating.\n");
+#if defined(DEBUG_FOCUS)
+                    fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
+                             k - 1, A->j[j], A->val[j] );
+                    fprintf( stderr, "sum = %10.3f\n", sum);
+#endif
+                    exit(NUMERIC_BREAKDOWN);
+                }
+
+                U_t->val[j] = SQRT( sum );
+            }
+            /* non-diagonal entries */
+            else
+            {
+                U_t->val[j] = sum / U_t->val[ei_y - 1];
+            }
+        }
+    }
+
+    /* apply inverse transformation D^{-1}U^{T},
+     * since DAD \approx U^{T}U, so
+     * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */
+    #pragma omp parallel for schedule(guided) \
+        default(none) shared(D_inv) private(i, pj)
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            U_t->val[pj] *= D_inv[i];
+        }
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
+#endif
+
+    /* transpose U^{T} and copy into U */
+    Transpose( U_t, U );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(U): %d, max: %d\n", U->start[U->n], U->n * 50 );
+#endif
+
+    Deallocate_Matrix( DAD );
+    free(D_inv);
+    free(D);
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Fine-grained (parallel) incomplete LU factorization
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp.
+ *
+ * A: symmetric, half-stored (lower triangular), CSR format
+ * sweeps: number of loops over non-zeros for computation
+ * L / U: factorized triangular matrices (A \approx LU), CSR format */
+static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
+                     sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x, y, ei_x, ei_y;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( DAD, A->n, A->m ) == FAILURE ||
+            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "not enough memory for ILU_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(D, D_inv) private(i)
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
+        D[i] = 1.0 / D_inv[i];
+    }
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * transformation DAD, where D = D(1./sqrt(D(A))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, D) private(i, pj)
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.0;
+    }
+
+    /* initial guesses for L and U,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+    /* store U^T in CSR for row-wise access and tranpose later */
+    memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+
+    /* L has unit diagonal, by convention */
+    #pragma omp parallel for schedule(static) default(none) private(i)
+    for ( i = 0; i < A->n; ++i )
+    {
+        L->val[L->start[i + 1] - 1] = 1.0;
+    }
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero in L */
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
+            while ( L->j[x] < L->j[j] &&
+                    L->j[y] < L->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( L->j[x] == L->j[y] )
+                {
+                    sum += (L->val[x] * U->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( L->j[x] < L->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            if ( j != ei_x - 1 )
+            {
+                L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1];
+            }
+        }
+
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
+            while ( U->j[x] < U->j[j] &&
+                    U->j[y] < U->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U->j[x] == U->j[y] )
+                {
+                    sum += (L->val[y] * U->val[x]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U->j[x] < U->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            U->val[j] = DAD->val[j] - sum;
+        }
+    }
+
+    /* apply inverse transformation:
+     * since DAD \approx LU, then
+     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, D_inv) private(i, pj)
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
+        {
+            L->val[pj] = D_inv[i] * L->val[pj];
+            /* currently storing U^T, so use row index instead of column index */
+            U->val[pj] = U->val[pj] * D_inv[i];
+        }
+    }
+
+    Transpose_I( U );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 );
+    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
+#endif
+
+    Deallocate_Matrix( DAD );
+    free( D_inv );
+    free( D );
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Fine-grained (parallel) incomplete LU factorization with thresholding
+ *
+ * Reference: Edmond Chow and Aftab Patel,
+ * Fine-Grained Parallel Incomplete LU Factorization, SIAM J. Sci. Comp.
+ *
+ * A: symmetric, half-stored (lower triangular), CSR format;
+ *    the last stored entry of each row is read as the diagonal
+ * droptol: row-wise tolerances used for dropping
+ * sweeps: number of loops over non-zeros for computation
+ * L / U: factorized triangular matrices (A \approx LU), CSR format
+ * returns: Get_Timing_Info( ) of the time stamp taken on entry */
+static real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
+        const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD, *L_temp, *U_temp;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( DAD, A->n, A->m ) == FAILURE ||
+            Allocate_Matrix( L_temp, A->n, A->m ) == FAILURE ||
+            Allocate_Matrix( U_temp, A->n, A->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(D, D_inv) private(i)
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] ); /* sqrt of row i's diagonal (last entry of the row) */
+        D[i] = 1.0 / D_inv[i];
+    }
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * transformation DAD, where D = D(1./sqrt(D(A))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, D) private(i, pj)
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.0;
+    }
+
+    /* initial guesses for L and U,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+    /* store U^T in CSR for row-wise access and transpose later */
+    memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+
+    /* L has unit diagonal, by convention */
+    #pragma omp parallel for schedule(static) \
+        default(none) private(i) shared(L_temp)
+    for ( i = 0; i < A->n; ++i )
+    {
+        L_temp->val[L_temp->start[i + 1] - 1] = 1.0;
+    }
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero in L; iterations are not synchronized, so other entries read below may come from this or the previous sweep */
+        #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero (linear scan of the row pointers) */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero: bounds of the row whose index is the column of nonzero j */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
+            while ( L_temp->j[x] < L_temp->j[j] &&
+                    L_temp->j[y] < L_temp->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( L_temp->j[x] == L_temp->j[y] )
+                {
+                    sum += (L_temp->val[x] * U_temp->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( L_temp->j[x] < L_temp->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            if ( j != ei_x - 1 ) /* skip the diagonal of L (last entry of the row) */
+            {
+                L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1];
+            }
+        }
+
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
+        for ( j = 0; j < DAD->start[DAD->n]; ++j ) /* for each nonzero in U (held as U^T) */
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero (linear scan of the row pointers) */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero: bounds of the row whose index is the column of nonzero j */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
+            while ( U_temp->j[x] < U_temp->j[j] &&
+                    U_temp->j[y] < U_temp->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U_temp->j[x] == U_temp->j[y] )
+                {
+                    sum += (L_temp->val[y] * U_temp->val[x]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U_temp->j[x] < U_temp->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            U_temp->val[j] = DAD->val[j] - sum;
+        }
+    }
+
+    /* apply inverse transformation:
+     * since DAD \approx LU, then
+     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
+    #pragma omp parallel for schedule(static) \
+    default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj)
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
+        {
+            L_temp->val[pj] = D_inv[i] * L_temp->val[pj];
+            /* currently storing U^T, so use row index instead of column index */
+            U_temp->val[pj] = U_temp->val[pj] * D_inv[i];
+        }
+    }
+
+    /* apply the dropping rule: copy rows into L / U, keeping an off-diagonal only if its magnitude exceeds |droptol[i] / diagonal of the row| */
+    Ltop = 0;
+    Utop = 0;
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        L->start[i] = Ltop;
+        U->start[i] = Utop;
+
+        for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj )
+        {
+            if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) )
+            {
+                L->j[Ltop] = L_temp->j[pj];
+                L->val[Ltop] = L_temp->val[pj];
+                ++Ltop;
+            }
+        }
+
+        /* diagonal (always kept); pj is at the last entry of the row here */
+        L->j[Ltop] = L_temp->j[pj];
+        L->val[Ltop] = L_temp->val[pj];
+        ++Ltop;
+
+        for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj )
+        {
+            if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) )
+            {
+                U->j[Utop] = U_temp->j[pj];
+                U->val[Utop] = U_temp->val[pj];
+                ++Utop;
+            }
+        }
+
+        /* diagonal (always kept); pj is at the last entry of the row here */
+        U->j[Utop] = U_temp->j[pj];
+        U->val[Utop] = U_temp->val[pj];
+        ++Utop;
+    }
+
+    L->start[i] = Ltop; /* i == DAD->n here: end pointer of the last row */
+    U->start[i] = Utop;
+
+    Transpose_I( U ); /* presumably converts the stored U^T into U in place -- confirm against Transpose_I */
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d\n", L->start[L->n] );
+    fprintf( stderr, "nnz(U): %d\n", U->start[U->n] );
+#endif
+
+    Deallocate_Matrix( U_temp );
+    Deallocate_Matrix( L_temp );
+    Deallocate_Matrix( DAD );
+    free( D_inv );
+    free( D );
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Setup routine for the QEq solve:
+ *  1) select the charge matrix (sparsified H_sp if enabled, otherwise H)
+ *  2) when control->pre_comp_refactor > 0 and either the step count since data->prev_steps is a
+ *     multiple of it or workspace->L is still NULL: sort matrix rows and (re)compute the preconditioner
+ *  3) extrapolate initial guesses for the fictitious charges s and t from their stored history */
+static void Init_MatVec( const reax_system * const system, const control_params * const control,
+        simulation_data * const data, static_storage * const workspace, const list * const far_nbrs )
+{
+    int i, fillin;
+    real s_tmp, t_tmp;
+    real time;
+    sparse_matrix *Hptr;
+
+    if (control->qeq_domain_sparsify_enabled)
+    {
+        Hptr = workspace->H_sp;
+    }
+    else
+    {
+        Hptr = workspace->H;
+    }
+
+#if defined(TEST_MAT)
+    Hptr = create_test_mat( );
+#endif
+
+    if (control->pre_comp_refactor > 0 &&
+            ((data->step - data->prev_steps) % control->pre_comp_refactor == 0 || workspace->L == NULL))
+    {
+        //Print_Linear_System( system, control, workspace, data->step );
+
+        time = Get_Time( );
+        if ( control->pre_comp_type != DIAG_PC )
+        {
+            Sort_Matrix_Rows( workspace->H );
+            if ( control->qeq_domain_sparsify_enabled == TRUE )
+            {
+                Sort_Matrix_Rows( workspace->H_sp );
+            }
+
+            if ( control->pre_app_type == TRI_SOLVE_GC_PA )
+            {
+                if ( control->qeq_domain_sparsify_enabled == TRUE )
+                {
+                    Hptr = setup_graph_coloring( workspace->H_sp );
+                }
+                else
+                {
+                    Hptr = setup_graph_coloring( workspace->H );
+                }
+
+                Sort_Matrix_Rows( Hptr );
+            }
+        }
+        data->timing.QEq_sort_mat_rows += Get_Timing_Info( time );
+
+#if defined(DEBUG)
+        fprintf( stderr, "H matrix sorted\n" );
+#endif
+
+        switch ( control->pre_comp_type )
+        {
+        case DIAG_PC:
+            if ( workspace->Hdia_inv == NULL )
+            {
+                if ( ( workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) ) ) == NULL )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+            data->timing.pre_comp += diag_pre_comp( system, workspace->Hdia_inv );
+            break;
+
+        case ICHOLT_PC:
+            Calculate_Droptol( Hptr, workspace->droptol, control->pre_comp_droptol );
+
+#if defined(DEBUG_FOCUS)
+            fprintf( stderr, "drop tolerances calculated\n" );
+#endif
+
+            if ( workspace->L == NULL )
+            {
+                fillin = Estimate_LU_Fill( Hptr, workspace->droptol );
+                if ( Allocate_Matrix( workspace->L, far_nbrs->n, fillin ) == FAILURE ||
+                        Allocate_Matrix( workspace->U, far_nbrs->n, fillin ) == FAILURE )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+
+#if defined(DEBUG)
+                fprintf( stderr, "fillin = %d\n", fillin );
+                fprintf( stderr, "allocated memory: L = U = %ldMB\n",
+                         (long) (fillin * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
+#endif
+            }
+
+            data->timing.pre_comp += ICHOLT( Hptr, workspace->droptol, workspace->L, workspace->U );
+            break;
+
+        case ILU_PAR_PC:
+            if ( workspace->L == NULL )
+            {
+                /* factors have sparsity pattern as H */
+                if ( Allocate_Matrix( workspace->L, Hptr->n, Hptr->m ) == FAILURE ||
+                        Allocate_Matrix( workspace->U, Hptr->n, Hptr->m ) == FAILURE )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+
+            data->timing.pre_comp += ILU_PAR( Hptr, control->pre_comp_sweeps, workspace->L, workspace->U );
+            break;
+
+        case ILUT_PAR_PC:
+            Calculate_Droptol( Hptr, workspace->droptol, control->pre_comp_droptol );
+#if defined(DEBUG_FOCUS)
+            fprintf( stderr, "drop tolerances calculated\n" );
+#endif
+
+            if ( workspace->L == NULL )
+            {
+                /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */
+                if ( Allocate_Matrix( workspace->L, Hptr->n, Hptr->m ) == FAILURE ||
+                        Allocate_Matrix( workspace->U, Hptr->n, Hptr->m ) == FAILURE )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+
+            data->timing.pre_comp += ILUT_PAR( Hptr, workspace->droptol, control->pre_comp_sweeps,
+                    workspace->L, workspace->U );
+            break;
+
+        case ILU_SUPERLU_MT_PC:
+            if ( workspace->L == NULL )
+            {
+                /* factors have sparsity pattern as H */
+                if ( Allocate_Matrix( workspace->L, Hptr->n, Hptr->m ) == FAILURE ||
+                        Allocate_Matrix( workspace->U, Hptr->n, Hptr->m ) == FAILURE )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+
+#if defined(HAVE_SUPERLU_MT)
+            data->timing.pre_comp += SuperLU_Factorize( Hptr, workspace->L, workspace->U );
+#else
+            fprintf( stderr, "SuperLU MT support disabled. Re-compile before enabling. Terminating...\n" );
+            exit( INVALID_INPUT );
+#endif
+            break;
+
+        default:
+            fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+        }
+
+#if defined(DEBUG)
+        fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) );
+#endif
+
+#if defined(DEBUG_FOCUS)
+        {
+            /* file name buffer lives in the debug dump only, so other builds carry no unused variable */
+            char fname[100];
+            snprintf( fname, sizeof(fname), "%s.L%d.out", control->sim_name, data->step );
+            Print_Sparse_Matrix2( workspace->L, fname );
+            snprintf( fname, sizeof(fname), "%s.U%d.out", control->sim_name, data->step );
+            Print_Sparse_Matrix2( workspace->U, fname );
+        }
+        fprintf( stderr, "icholt-" );
+#endif
+    }
+
+    /* extrapolation for s & t */
+    //TODO: good candidate for vectorization, avoid moving data with head pointer and circular buffer
+    #pragma omp parallel for schedule(static) \
+        default(none) private(i, s_tmp, t_tmp)
+    for ( i = 0; i < system->N; ++i )
+    {
+        // no extrapolation
+        //s_tmp = workspace->s[index_wkspace_sys(0,i,system->N)];
+        //t_tmp = workspace->t[index_wkspace_sys(0,i,system->N)];
+
+        // linear
+        //s_tmp = 2 * workspace->s[index_wkspace_sys(0,i,system->N)] - workspace->s[index_wkspace_sys(1,i,system->N)];
+        //t_tmp = 2 * workspace->t[index_wkspace_sys(0,i,system->N)] - workspace->t[index_wkspace_sys(1,i,system->N)];
+
+        // quadratic (active for t; the s variant is disabled)
+//        s_tmp = workspace->s[index_wkspace_sys(2,i,system->N)] +
+//            3 * (workspace->s[index_wkspace_sys(0,i,system->N)]-workspace->s[index_wkspace_sys(1,i,system->N)]);
+        t_tmp = workspace->t[index_wkspace_sys(2,i,system->N)] +
+            3 * (workspace->t[index_wkspace_sys(0,i,system->N)] -workspace->t[index_wkspace_sys(1,i,system->N)]);
+
+        // cubic (active for s; the t variant is disabled)
+        s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,system->N)] + workspace->s[index_wkspace_sys(2,i,system->N)]) -
+            (6 * workspace->s[index_wkspace_sys(1,i,system->N)] + workspace->s[index_wkspace_sys(3,i,system->N)]);
+        //t_tmp = 4 * (workspace->t[index_wkspace_sys(0,i,system->N)] + workspace->t[index_wkspace_sys(2,i,system->N)]) -
+        //  (6 * workspace->t[index_wkspace_sys(1,i,system->N)] + workspace->t[index_wkspace_sys(3,i,system->N)] );
+
+        // 4th order
+//        s_tmp = 5 * (workspace->s[index_wkspace_sys(0,i,system->N)] -
+//                workspace->s[index_wkspace_sys(3,i,system->N)]) + 10 *
+//            (-workspace->s[index_wkspace_sys(1,i,system->N)] +
+//             workspace->s[index_wkspace_sys(2,i,system->N)] ) +
+//            workspace->s[index_wkspace_sys(4,i,system->N)];
+//        t_tmp = 5 * (workspace->t[index_wkspace_sys(0,i,system->N)] -
+//                workspace->t[index_wkspace_sys(3,i,system->N)]) + 10 *
+//            (-workspace->t[index_wkspace_sys(1,i,system->N)] +
+//             workspace->t[index_wkspace_sys(2,i,system->N)] ) +
+//            workspace->t[index_wkspace_sys(4,i,system->N)];
+
+        workspace->s[index_wkspace_sys(4,i,system->N)] = workspace->s[index_wkspace_sys(3,i,system->N)]; /* shift history: slot k takes slot k-1 */
+        workspace->s[index_wkspace_sys(3,i,system->N)] = workspace->s[index_wkspace_sys(2,i,system->N)];
+        workspace->s[index_wkspace_sys(2,i,system->N)] = workspace->s[index_wkspace_sys(1,i,system->N)];
+        workspace->s[index_wkspace_sys(1,i,system->N)] = workspace->s[index_wkspace_sys(0,i,system->N)];
+        workspace->s[index_wkspace_sys(0,i,system->N)] = s_tmp; /* slot 0 receives the extrapolated guess */
+
+        workspace->t[index_wkspace_sys(4,i,system->N)] = workspace->t[index_wkspace_sys(3,i,system->N)];
+        workspace->t[index_wkspace_sys(3,i,system->N)] = workspace->t[index_wkspace_sys(2,i,system->N)];
+        workspace->t[index_wkspace_sys(2,i,system->N)] = workspace->t[index_wkspace_sys(1,i,system->N)];
+        workspace->t[index_wkspace_sys(1,i,system->N)] = workspace->t[index_wkspace_sys(0,i,system->N)];
+        workspace->t[index_wkspace_sys(0,i,system->N)] = t_tmp;
+    }
+}
+
+
+/* Form the atomic charges from the fictitious charges: q_i = s_i - u * t_i with
+ * u = sum(s) / sum(t), reading slot 0 of the s and t histories */
+static void Calculate_Charges( const reax_system * const system, static_storage * const workspace )
+{
+    int atom;
+    real total_s = 0., total_t = 0., ratio;
+
+    for ( atom = 0; atom < system->N; atom++ )
+    {
+        total_s = total_s + workspace->s[index_wkspace_sys(0,atom,system->N)];
+        total_t = total_t + workspace->t[index_wkspace_sys(0,atom,system->N)];
+    }
+    ratio = total_s / total_t;
+
+    for ( atom = system->N - 1; atom >= 0; atom-- )
+    {
+        system->atoms[atom].q = workspace->s[index_wkspace_sys(0,atom,system->N)]
+            - ratio * workspace->t[index_wkspace_sys(0,atom,system->N)];
+    }
+}
+
+
+/* Main driver method for the QEq kernel:
+ *  1) init / setup routines (Init_MatVec)
+ *  2) perform 2 linear solves (for s and t) with the solver chosen by control->qeq_solver_type
+ *  3) compute atomic charges based on output of 2) (Calculate_Charges)
+ * The summed solver iteration count is added to data->timing.solver_iters; an unrecognized
+ * solver type prints an error and exits with INVALID_INPUT. */
+void QEq( reax_system * const system, control_params * const control, simulation_data * const data,
+          static_storage * const workspace, const list * const far_nbrs,
+          const output_controls * const out_control )
+{
+    int iters, refactor;
+
+    Init_MatVec( system, control, data, workspace, far_nbrs );
+    /* a refactor period <= 0 disables refactoring in Init_MatVec; test it first so the modulo never divides by zero */
+    refactor = ( control->pre_comp_refactor > 0 && (data->step - data->prev_steps) % control->pre_comp_refactor == 0 );
+
+    switch ( control->qeq_solver_type )
+    {
+    case GMRES_S:
+        iters = GMRES( workspace, control, data, workspace->H, workspace->b_s, control->qeq_solver_q_err,
+                &workspace->s[index_wkspace_sys(0,0,system->N)], out_control->log, refactor ? TRUE : FALSE );
+        iters += GMRES( workspace, control, data, workspace->H, workspace->b_t, control->qeq_solver_q_err,
+                &workspace->t[index_wkspace_sys(0,0,system->N)], out_control->log, FALSE );
+        break;
+    case GMRES_H_S:
+        iters = GMRES_HouseHolder( workspace, control, data, workspace->H, workspace->b_s, control->qeq_solver_q_err,
+                &workspace->s[index_wkspace_sys(0,0,system->N)], out_control->log, refactor );
+        iters += GMRES_HouseHolder( workspace, control, data, workspace->H, workspace->b_t, control->qeq_solver_q_err,
+                &workspace->t[index_wkspace_sys(0,0,system->N)], out_control->log, 0 );
+        break;
+    case CG_S:
+        iters = CG( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
+                &workspace->s[index_wkspace_sys(0,0,system->N)], out_control->log ) + 1;
+        iters += CG( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
+                &workspace->t[index_wkspace_sys(0,0,system->N)], out_control->log ) + 1;
+        break;
+    case SDM_S:
+        iters = SDM( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
+                &workspace->s[index_wkspace_sys(0,0,system->N)], out_control->log ) + 1;
+        iters += SDM( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
+                &workspace->t[index_wkspace_sys(0,0,system->N)], out_control->log ) + 1;
+        break;
+    default:
+        fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+
+    data->timing.solver_iters += iters;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "linsolve-" );
+#endif
+
+    Calculate_Charges( system, workspace );
+}
diff --git a/PuReMD-GPU/src/qeq.h b/PuReMD-GPU/src/qeq.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c6c7ea2ce396f5f3cd9a538b801f2658199b7d9
--- /dev/null
+++ b/PuReMD-GPU/src/qeq.h
@@ -0,0 +1,73 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __QEq_H_
+#define __QEq_H_
+
+#include "mytypes.h"
+
+/* QEq driver; arguments in order: system, control, data, workspace, far_nbrs, out_control */
+void QEq( reax_system* const, control_params* const, simulation_data* const,
+          static_storage* const, const list* const,
+          const output_controls* const );
+
+
+//static inline HOST_DEVICE void swap( sparse_matrix_entry *array,
+//        int index1, int index2 ) 
+//{
+//    sparse_matrix_entry temp = array[index1];
+//    array[index1] = array[index2];
+//    array[index2] = temp;
+//}
+//
+//
+//static inline HOST_DEVICE void quick_sort( sparse_matrix_entry *array,
+//        int start, int end )
+//{
+//    int i = start;
+//    int k = end; 
+//
+//    if (end - start >= 1)  
+//    {  
+//        int pivot = array[start].j;
+//
+//        while (k > i) 
+//        {  
+//            while ((array[i].j <= pivot) && (i <= end) && (k > i))
+//            {
+//                i++;
+//            }
+//            while ((array[k].j > pivot) && (k >= start) && (k >= i))
+//            {
+//                k--;
+//            }
+//            if (k > i)
+//            {
+//                swap( array, i, k );
+//            }
+//        }  
+//        swap( array, start, k );
+//        quick_sort( array, start, k - 1 );
+//        quick_sort( array, k + 1, end );
+//    }  
+//}
+
+
+#endif
diff --git a/PuReMD-GPU/src/random.h b/PuReMD-GPU/src/random.h
index b19bc58e3dcef04a324b108be718bfbff3e5c06c..a936477278d06a989d50ab0faeafb8a737a4e5fd 100644
--- a/PuReMD-GPU/src/random.h
+++ b/PuReMD-GPU/src/random.h
@@ -58,7 +58,7 @@ static inline HOST_DEVICE double GRandom(double mean, double sigma)
         rsq = v1 * v1 + v2 * v2;
     }
 
-    return mean + v1 * sigma * sqrt(-2.0 * log(rsq) / rsq);
+    return mean + v1 * sigma * SQRT(-2.0 * LOG(rsq) / rsq);
 }
 
 
diff --git a/PuReMD-GPU/src/reset_utils.c b/PuReMD-GPU/src/reset_utils.c
index f79596aa9d29a65f673448d18a28c73c00444e43..ecb921bb00255081bec5470baaab070df8cb80ef 100644
--- a/PuReMD-GPU/src/reset_utils.c
+++ b/PuReMD-GPU/src/reset_utils.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -28,18 +29,20 @@ void Reset_Atoms( reax_system* system )
 {
     int i;
 
-    for( i = 0; i < system->N; ++i )
-        memset( system->atoms[i].f, 0.0, RVEC_SIZE );
+    for ( i = 0; i < system->N; ++i )
+    {
+        memset( system->atoms[i].f, 0.0, sizeof(rvec) );
+    }
 }
 
 
 void Reset_Pressures( simulation_data *data )
 {
-    rtensor_MakeZero( data->flex_bar.P );  
+    rtensor_MakeZero( data->flex_bar.P );
     data->iso_bar.P = 0;
     rvec_MakeZero( data->int_press );
     rvec_MakeZero( data->ext_press );
-    /* fprintf( stderr, "reset: ext_press (%12.6f %12.6f %12.6f)\n", 
+    /* fprintf( stderr, "reset: ext_press (%12.6f %12.6f %12.6f)\n",
        data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
 }
 
@@ -97,49 +100,57 @@ void Reset_Workspace( reax_system *system, static_storage *workspace )
 }
 
 
-void Reset_Neighbor_Lists( reax_system *system, control_params *control, 
+void Reset_Neighbor_Lists( reax_system *system, control_params *control,
         static_storage *workspace, list **lists )
 {
     int i, tmp;
     list *bonds = (*lists) + BONDS;
     list *hbonds = (*lists) + HBONDS;
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         tmp = Start_Index( i, bonds );
         Set_End_Index( i, tmp, bonds );
     }
 
-    //TODO check if this is needed
-    memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs );
+    //TODO: the memset below was added for GPU; verify that it is needed and correct
+    memset( bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs );
 
-    if( control->hb_cut > 0 )
-        for( i = 0; i < system->N; ++i )
-            if( system->reaxprm.sbp[system->atoms[i].type].p_hbond == 1) {
+    if ( control->hb_cut > 0 )
+    {
+        for ( i = 0; i < system->N; ++i )
+        {
+            if ( system->reaxprm.sbp[system->atoms[i].type].p_hbond == 1)
+            {
                 tmp = Start_Index( workspace->hbond_index[i], hbonds );
                 Set_End_Index( workspace->hbond_index[i], tmp, hbonds );
-                /* fprintf( stderr, "i:%d, hbond: %d-%d\n", 
-                   i, Start_Index( workspace->hbond_index[i], hbonds ), 
+                /* fprintf( stderr, "i:%d, hbond: %d-%d\n",
+                   i, Start_Index( workspace->hbond_index[i], hbonds ),
                    End_Index( workspace->hbond_index[i], hbonds ) );*/
             }
+        }
+    }
 }
 
 
-void Reset( reax_system *system, control_params *control,  
+void Reset( reax_system *system, control_params *control,
         simulation_data *data, static_storage *workspace, list **lists  )
 {
     Reset_Atoms( system );
 
     Reset_Simulation_Data( data );
 
-    if( control->ensemble == NPT || control->ensemble == sNPT || 
+    if ( control->ensemble == NPT || control->ensemble == sNPT ||
             control->ensemble == iNPT )
+    {
         Reset_Pressures( data );
+    }
 
-    Reset_Workspace( system, workspace );  
+    Reset_Workspace( system, workspace );
 
     Reset_Neighbor_Lists( system, control, workspace, lists );
 
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "reset - ");
 #endif
 }
@@ -147,16 +158,18 @@ void Reset( reax_system *system, control_params *control,
 
 void Reset_Grid( grid *g )
 {
-    memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2]);
+    memset( g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2] );
 }
 
 
+
 void Reset_Marks( grid *g, ivec *grid_stack, int grid_top )
 {
     int i;
 
-    for( i = 0; i < grid_top; ++i )
-        g->mark[grid_stack[i][0] * g->ncell[1]*g->ncell[2] + 
-            grid_stack[i][1] * g->ncell[2] + 
-            grid_stack[i][2]] = 0;
+    for ( i = 0; i < grid_top; ++i )
+    {
+        g->mark[grid_stack[i][0] * g->ncell[1]*g->ncell[2]
+            + grid_stack[i][1] * g->ncell[2] + grid_stack[i][2]] = 0;
+    }
 }
diff --git a/PuReMD-GPU/src/restart.c b/PuReMD-GPU/src/restart.c
index b6ccb014d91ad33cd337d7688345d06811b2c681..13abdecc8142c5f40b942b79c2e886246372576c 100644
--- a/PuReMD-GPU/src/restart.c
+++ b/PuReMD-GPU/src/restart.c
@@ -1,9 +1,10 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
@@ -19,11 +20,12 @@
   ----------------------------------------------------------------------*/
 
 #include "restart.h"
+
 #include "box.h"
 #include "vector.h"
 
 void Write_Binary_Restart( reax_system *system, control_params *control,
-                           simulation_data *data, static_storage *workspace )
+        simulation_data *data, static_storage *workspace )
 {
     int  i;
     char fname[MAX_STR];
@@ -65,8 +67,8 @@ void Write_Binary_Restart( reax_system *system, control_params *control,
 
 
 void Read_Binary_Restart( char *fname, reax_system *system,
-                          control_params *control, simulation_data *data,
-                          static_storage *workspace )
+        control_params *control, simulation_data *data,
+        static_storage *workspace )
 {
     int i;
     FILE *fres;
@@ -103,14 +105,13 @@ void Read_Binary_Restart( char *fname, reax_system *system,
 
     workspace->map_serials = (int*) calloc( MAX_ATOM_ID, sizeof(int) );
     for ( i = 0; i < MAX_ATOM_ID; ++i )
+    {
         workspace->map_serials[i] = -1;
+    }
 
     workspace->orig_id = (int*) calloc( system->N, sizeof(int) );
     workspace->restricted  = (int*) calloc( system->N, sizeof(int) );
     workspace->restricted_list = (int*) calloc( system->N * MAX_RESTRICT, sizeof(int) );
-    //CHANGE
-    //for( i = 0; i < system->N; ++i )
-    // workspace->restricted_list[i] = (int*) calloc( MAX_RESTRICT, sizeof(int) );
 
     for ( i = 0; i < system->N; ++i )
     {
@@ -175,8 +176,7 @@ void Write_ASCII_Restart( reax_system *system, control_params *control,
 
 
 void Read_ASCII_Restart( char *fname, reax_system *system,
-                         control_params *control, simulation_data *data,
-                         static_storage *workspace )
+        control_params *control, simulation_data *data, static_storage *workspace )
 {
     int i;
     FILE *fres;
@@ -185,8 +185,7 @@ void Read_ASCII_Restart( char *fname, reax_system *system,
     fres = fopen( fname, "r" );
 
     /* header */
-    //fscanf( fres, READ_RESTART_HEADER,
-    fscanf( fres, RESTART_HEADER,
+    fscanf( fres, READ_RESTART_HEADER,
             &data->prev_steps, &system->N, &data->therm.T, &data->therm.xi,
             &data->therm.v_xi, &data->therm.v_xi_old, &data->therm.G_xi,
             &system->box.box[0][0], &system->box.box[0][1], &system->box.box[0][2],
@@ -194,7 +193,7 @@ void Read_ASCII_Restart( char *fname, reax_system *system,
             &system->box.box[2][0], &system->box.box[2][1], &system->box.box[2][2]);
     Make_Consistent( &(system->box) );
 
-//#if defined(DEBUG_FOCUS)
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "restart step: %d\n", data->prev_steps );
     fprintf( stderr, "restart thermostat: %10.6f %10.6f %10.6f %10.6f %10.6f\n",
              data->therm.T, data->therm.xi,
@@ -204,22 +203,20 @@ void Read_ASCII_Restart( char *fname, reax_system *system,
              system->box.box[0][0], system->box.box[0][1], system->box.box[0][2],
              system->box.box[1][0], system->box.box[1][1], system->box.box[1][2],
              system->box.box[2][0], system->box.box[2][1], system->box.box[2][2] );
-    fprintf ( stderr, "Total Atoms read: %d \n", system->N);
-//#endif
+#endif
 
     /* memory allocations for atoms, atom maps, bond restrictions */
     system->atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
 
     workspace->map_serials = (int*) calloc( MAX_ATOM_ID, sizeof(int) );
     for ( i = 0; i < MAX_ATOM_ID; ++i )
+    {
         workspace->map_serials[i] = -1;
+    }
 
     workspace->orig_id = (int*) calloc( system->N, sizeof(int) );
     workspace->restricted  = (int*) calloc( system->N, sizeof(int) );
     workspace->restricted_list = (int*) calloc( system->N * MAX_RESTRICT, sizeof(int) );
-    //CHANGE
-    //for( i = 0; i < system->N; ++i )
-    // workspace->restricted_list[i] = (int*) calloc( MAX_RESTRICT, sizeof(int) );
 
     for ( i = 0; i < system->N; ++i )
     {
@@ -240,11 +237,15 @@ void Read_ASCII_Restart( char *fname, reax_system *system,
 
 
 void Write_Restart( reax_system *system, control_params *control,
-                    simulation_data *data, static_storage *workspace,
-                    output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        output_controls *out_control ) /* out_control->restart_format selects the ASCII or binary writer */
 {
     if ( out_control->restart_format == WRITE_ASCII )
+    {
         Write_ASCII_Restart( system, control, data, workspace );
+    }
     else if ( out_control->restart_format == WRITE_BINARY )
+    {
         Write_Binary_Restart( system, control, data, workspace );
+    }
 }
diff --git a/PuReMD-GPU/src/single_body_interactions.c b/PuReMD-GPU/src/single_body_interactions.c
index b26f493e703819f066389991a4845acab113b326..4c5824dd9862770863aa3e3299ca4f1f691c561e 100644
--- a/PuReMD-GPU/src/single_body_interactions.c
+++ b/PuReMD-GPU/src/single_body_interactions.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -27,11 +28,8 @@
 #include "vector.h"
 
 
-void LonePair_OverUnder_Coordination_Energy( reax_system *system, 
-        control_params *control, 
-        simulation_data *data,
-        static_storage *workspace, 
-        list **lists, 
+void LonePair_OverUnder_Coordination_Energy( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
         output_controls *out_control )
 {
     int i, j, pj, type_i, type_j;
@@ -49,7 +47,7 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
     bond_data *pbond;
-    bond_order_data *bo_ij; 
+    bond_order_data *bo_ij;
     list *bonds = (*lists) + BONDS;
 
     /* Initialize parameters */
@@ -61,64 +59,71 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
     p_ovun7 = system->reaxprm.gp.l[8];
     p_ovun8 = system->reaxprm.gp.l[9];
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         /* set the parameter pointer */
         type_i = system->atoms[i].type;
         sbp_i = &(system->reaxprm.sbp[ type_i ]);
 
         /* lone-pair Energy */
-        p_lp2 = sbp_i->p_lp2;      
+        p_lp2 = sbp_i->p_lp2;
         expvd2 = EXP( -75 * workspace->Delta_lp[i] );
         inv_expvd2 = 1. / (1. + expvd2 );
 
         /* calculate the energy */
-        data->E_Lp += e_lp = 
-            p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
+        data->E_Lp += e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
 
-        dElp = p_lp2 * inv_expvd2 + 
-            75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
+        dElp = p_lp2 * inv_expvd2 +
+               75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
         CElp = dElp * workspace->dDelta_lp[i];
 
         workspace->CdDelta[i] += CElp;      // lp - 1st term
 
 #ifdef TEST_ENERGY
-        fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", 
-                p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp );
+        fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n",
+                 p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp );
         fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n",
-                workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp );
+                 workspace->orig_id[i] + 1, workspace->nlp[i], e_lp, data->E_Lp );
 #endif
+
 #ifdef TEST_FORCES
         Add_dDelta( system, lists, i, CElp, workspace->f_lp );  // lp - 1st term
 #endif
 
         /* correction for C2 */
-        if( system->reaxprm.gp.l[5] > 0.001 && 
+        if ( system->reaxprm.gp.l[5] > 0.001 &&
                 !strcmp( system->reaxprm.sbp[type_i].name, "C" ) )
-            for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
-                if( i < bonds->select.bond_list[pj].nbr ) {
+        {
+            for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+            {
+                if ( i < bonds->select.bond_list[pj].nbr )
+                {
                     j = bonds->select.bond_list[pj].nbr;
                     type_j = system->atoms[j].type;
 
-                    if( !strcmp( system->reaxprm.sbp[type_j].name, "C" ) ) {
+                    if ( !strcmp( system->reaxprm.sbp[type_j].name, "C" ) )
+                    {
                         twbp = &( system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]);
                         bo_ij = &( bonds->select.bond_list[pj].bo_data );
                         Di = workspace->Delta[i];
-                        vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
+                        vov3 = bo_ij->BO - Di - 0.040 * POW(Di, 4.);
 
-                        if( vov3 > 3. ) {
-                            data->E_Lp += e_lph = p_lp3 * SQR(vov3-3.0);
+                        if ( vov3 > 3. )
+                        {
+                            data->E_Lp += e_lph = p_lp3 * SQR(vov3 - 3.0);
                             //estrain(i) += e_lph;
 
-                            deahu2dbo = 2.*p_lp3*(vov3 - 3.);
-                            deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
+                            deahu2dbo = 2.*p_lp3 * (vov3 - 3.);
+                            deahu2dsbo = 2.*p_lp3 * (vov3 - 3.) * (-1. - 0.16 * POW(Di, 3.));
 
                             bo_ij->Cdbo += deahu2dbo;
                             workspace->CdDelta[i] += deahu2dsbo;
 #ifdef TEST_ENERGY
-                            fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n",
+                            fprintf(out_control->elp, "C2cor%6d%6d%23.15e%23.15e%23.15e\n",
                                     // workspace->orig_id[i], workspace->orig_id[j],
-                                    i+1, j+1, e_lph, deahu2dbo, deahu2dsbo );
+                                    i + 1, j + 1, e_lph, deahu2dbo, deahu2dsbo );
 #endif
+
 #ifdef TEST_FORCES
                             Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp);
                             Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp);
@@ -127,44 +132,52 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
                     }
 
                 }
+            }
+        }
     }
 
-
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         type_i = system->atoms[i].type;
         sbp_i = &(system->reaxprm.sbp[ type_i ]);
 
         /* over-coordination energy */
-        if( sbp_i->mass > 21.0 ) 
+        if ( sbp_i->mass > 21.0 )
+        {
             dfvl = 0.0;
-        else dfvl = 1.0; // only for 1st-row elements
+        }
+        else
+        {
+            dfvl = 1.0; // only for 1st-row elements
+        }
 
         p_ovun2 = sbp_i->p_ovun2;
         sum_ovun1 = 0;
         sum_ovun2 = 0;
 
-        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) {
+        for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+        {
             j = bonds->select.bond_list[pj].nbr;
-            type_j = system->atoms[j].type;      
+            type_j = system->atoms[j].type;
             bo_ij = &(bonds->select.bond_list[pj].bo_data);
             sbp_j = &(system->reaxprm.sbp[ type_j ]);
             twbp = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]);
 
             sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
-            sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
-                ( bo_ij->BO_pi + bo_ij->BO_pi2 );
-
-            /*fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", 
-              i+1, j+1, 
-              dfvl * workspace->Delta_lp_temp[j],
-              sbp_j->nlp_opt,
-              workspace->nlp_temp[j] );*/
+            sum_ovun2 += (workspace->Delta[j] - dfvl * workspace->Delta_lp_temp[j]) *
+                         ( bo_ij->BO_pi + bo_ij->BO_pi2 );
+
+            /*fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n",
+            i+1, j+1,
+            dfvl * workspace->Delta_lp_temp[j],
+            sbp_j->nlp_opt,
+            workspace->nlp_temp[j] );*/
         }
 
         exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
         inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
-        Delta_lpcorr  = workspace->Delta[i] - 
-            (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
+        Delta_lpcorr  = workspace->Delta[i] -
+            (dfvl * workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
 
         exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr );
         inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
@@ -175,11 +188,11 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
         data->E_Ov += e_ov = sum_ovun1 * CEover1;
 
         CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
-            ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
+            ( 1.0 - Delta_lpcorr * ( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
 
-        CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 );
+        CEover3 = CEover2 * (1.0 - dfvl * workspace->dDelta_lp[i] * inv_exp_ovun1 );
 
-        CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * 
+        CEover4 = CEover2 * (dfvl * workspace->Delta_lp_temp[i]) *
             p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
 
 
@@ -193,14 +206,13 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
         inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
         inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
 
-        data->E_Un += e_un =
-            -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
+        data->E_Un += e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
 
-        CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
+        CEunder1 = inv_exp_ovun2n * ( p_ovun5 * p_ovun6 * exp_ovun6 * inv_exp_ovun8 +
                 p_ovun2 * e_un * exp_ovun2n);
         CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
-        CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1);
-        CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * 
+        CEunder3 = CEunder1 * (1.0 - dfvl * workspace->dDelta_lp[i] * inv_exp_ovun1);
+        CEunder4 = CEunder1 * (dfvl * workspace->Delta_lp_temp[i]) *
             p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
 
         //fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n",
@@ -215,100 +227,98 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
         Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st
 #endif
 
-
-        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
+        for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+        {
             pbond = &(bonds->select.bond_list[pj]);
             j = pbond->nbr;
             type_j = system->atoms[j].type;
             bo_ij = &(pbond->bo_data);
             twbp  = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]);
 
-
-            bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st  
-            workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])*
-                (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a
-            bo_ij->Cdbopi += CEover4 * 
-                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
-            bo_ij->Cdbopi2 += CEover4 * 
-                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
+            bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st
+            workspace->CdDelta[j] += CEover4 * (1.0 - dfvl * workspace->dDelta_lp[j]) *
+                                     (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a
+            bo_ij->Cdbopi += CEover4 *
+                (workspace->Delta[j] - dfvl * workspace->Delta_lp_temp[j]); //OvCoor-3b
+            bo_ij->Cdbopi2 += CEover4 *
+                (workspace->Delta[j] - dfvl * workspace->Delta_lp_temp[j]); //OvCoor-3b
 
 
-            workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) *
+            workspace->CdDelta[j] += CEunder4 * (1.0 - dfvl * workspace->dDelta_lp[j]) *
                 (bo_ij->BO_pi + bo_ij->BO_pi2);   // UnCoor - 2a
-            bo_ij->Cdbopi += CEunder4 * 
-                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
-            bo_ij->Cdbopi2 += CEunder4 * 
-                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
-
+            bo_ij->Cdbopi += CEunder4 *
+                (workspace->Delta[j] - dfvl * workspace->Delta_lp_temp[j]); //UnCoor-2b
+            bo_ij->Cdbopi2 += CEunder4 *
+                (workspace->Delta[j] - dfvl * workspace->Delta_lp_temp[j]); //UnCoor-2b
 
 #ifdef TEST_ENERGY
             /* fprintf( out_control->eov, "%6d%23.15e%23.15e"
-               workspace->orig_id[j]+1,
-            //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2,
-            CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); */
-
-            /*fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
-              workspace->orig_id[j]+1, 
-              CEover4,
-              CEover4*
-              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-              CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), 
-              (1.0 - dfvl*workspace->dDelta_lp[j]),
-              CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-              (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
+            workspace->orig_id[j]+1,
+             //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2,
+             CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); */
+
+            /*fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+            workspace->orig_id[j]+1,
+            CEover4,
+            CEover4*
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+            CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2),
+            (1.0 - dfvl*workspace->dDelta_lp[j]),
+            CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) *
+            (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
 
             /* fprintf( out_control->eun, "%6d%23.15e\n",
-               workspace->orig_id[j]+1, CEunder3 ); */
+            workspace->orig_id[j]+1, CEunder3 ); */
 
             /*fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n",
-              workspace->orig_id[j]+1,
-              CEunder4,
-              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-              CEunder4*
-              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-              CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])*
-              (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
+            workspace->orig_id[j]+1,
+            CEunder4,
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+            CEunder4*
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+            CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])*
+            (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
 #endif
 
 #ifdef TEST_FORCES
-            Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, 
-                    workspace->f_ov ); // OvCoor - 1st term
+            Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s,
+                     workspace->f_ov ); // OvCoor - 1st term
 
             Add_dDelta( system, lists, j,
-                    CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-                    (bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a
+                        CEover4 * (1.0 - dfvl * workspace->dDelta_lp[j]) *
+                        (bo_ij->BO_pi + bo_ij->BO_pi2), workspace->f_ov ); //OvCoor3a
 
-            Add_dBOpinpi2( system, lists, i, pj, 
-                    CEover4 * (workspace->Delta[j] - 
-                        dfvl * workspace->Delta_lp_temp[j]),
-                    CEover4 * (workspace->Delta[j] - 
-                        dfvl * workspace->Delta_lp_temp[j]),
-                    workspace->f_ov, workspace->f_ov ); // OvCoor - 3b
+            Add_dBOpinpi2( system, lists, i, pj,
+                           CEover4 * (workspace->Delta[j] -
+                                      dfvl * workspace->Delta_lp_temp[j]),
+                           CEover4 * (workspace->Delta[j] -
+                                      dfvl * workspace->Delta_lp_temp[j]),
+                           workspace->f_ov, workspace->f_ov ); // OvCoor - 3b
 
             Add_dDelta( system, lists, j,
-                    CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-                    (bo_ij->BO_pi + bo_ij->BO_pi2),
-                    workspace->f_un ); // UnCoor - 2a
-
-            Add_dBOpinpi2( system, lists, i, pj, 
-                    CEunder4 * (workspace->Delta[j] - 
-                        dfvl * workspace->Delta_lp_temp[j]),
-                    CEunder4 * (workspace->Delta[j] - 
-                        dfvl * workspace->Delta_lp_temp[j]),
-                    workspace->f_un, workspace->f_un ); // UnCoor - 2b
+                        CEunder4 * (1.0 - dfvl * workspace->dDelta_lp[j]) *
+                        (bo_ij->BO_pi + bo_ij->BO_pi2),
+                        workspace->f_un ); // UnCoor - 2a
+
+            Add_dBOpinpi2( system, lists, i, pj,
+                           CEunder4 * (workspace->Delta[j] -
+                                       dfvl * workspace->Delta_lp_temp[j]),
+                           CEunder4 * (workspace->Delta[j] -
+                                       dfvl * workspace->Delta_lp_temp[j]),
+                           workspace->f_un, workspace->f_un ); // UnCoor - 2b
 #endif
         }
 
-#ifdef TEST_ENERGY      
+#ifdef TEST_ENERGY
 
-        fprintf( out_control->eov, "%6d%15.8f%15.8f%15.8f\n", 
-                i+1, DlpVi, Delta_lpcorr, sbp_i->valency ); 
+        fprintf( out_control->eov, "%6d%15.8f%15.8f%15.8f\n",
+                 i + 1, DlpVi, Delta_lpcorr, sbp_i->valency );
 
-        fprintf( out_control->eov, "%6d%15.8f%15.8f\n", 
-                i+1/*workspace->orig_id[i]+1*/, e_ov, data->E_Ov + data->E_Un );
+        fprintf( out_control->eov, "%6d%15.8f%15.8f\n",
+                 i + 1/*workspace->orig_id[i]+1*/, e_ov, data->E_Ov + data->E_Un );
 
-        fprintf( out_control->eov, "%6d%15.8f%15.8f\n", 
-                i+1/*workspace->orig_id[i]+1*/, e_un, data->E_Ov + data->E_Un );
+        fprintf( out_control->eov, "%6d%15.8f%15.8f\n",
+                 i + 1/*workspace->orig_id[i]+1*/, e_un, data->E_Ov + data->E_Un );
 #endif
     }
 }
diff --git a/PuReMD-GPU/src/system_props.c b/PuReMD-GPU/src/system_props.c
index 0126b86b776dce8fd30aea0c228731b95104b216..fc93a474cf378f1a382d0ae017cf15a9b23eb17a 100644
--- a/PuReMD-GPU/src/system_props.c
+++ b/PuReMD-GPU/src/system_props.c
@@ -1,67 +1,50 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
 #include "system_props.h"
-
-#include "box.h"
+#include "tool_box.h"
 #include "vector.h"
 
 
-HOST real Get_Time( )
-{
-    struct timeval tim;
-
-    gettimeofday(&tim, NULL );
-    return( tim.tv_sec + (tim.tv_usec / 1000000.0) );
-}
-
-
-HOST real Get_Timing_Info( real t_start )
-{
-    struct timeval tim;
-    real t_end;
-
-    gettimeofday(&tim, NULL );
-    t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
-    return (t_end - t_start);
-}
-
-
-void Temperature_Control( control_params *control, simulation_data *data, 
-        output_controls *out_control )
+void Temperature_Control( control_params *control, simulation_data *data,
+                          output_controls *out_control )
 {
     real tmp;
 
-    if( control->T_mode == 1 ) { // step-wise temperature control
-        if( (data->step - data->prev_steps) % 
-                ((int)(control->T_freq / control->dt)) == 0 ) {
-            if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) )
+    if ( control->T_mode == 1 )  // step-wise temperature control
+    {
+        if ( (data->step - data->prev_steps) % /* true every (int)(T_freq / dt) steps, counted from prev_steps */
+                ((int)(control->T_freq / control->dt)) == 0 ) /* NOTE(review): divisor truncates to 0 when T_freq / dt < 1, making % undefined -- confirm inputs are validated */
+        {
+            if ( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) )
                 control->T += control->T_rate;
-            else control->T = control->T_final;     
+            else control->T = control->T_final;
         }
     }
-    else if( control->T_mode == 2 ) { // constant slope control
+    else if ( control->T_mode == 2 )  // constant slope control
+    {
         tmp = control->T_rate * control->dt / control->T_freq;
 
-        if( fabs( control->T - control->T_final ) >= fabs( tmp ) )
-            control->T += tmp;       
+        if ( fabs( control->T - control->T_final ) >= fabs( tmp ) )
+            control->T += tmp;
     }
 }
 
@@ -69,39 +52,34 @@ void Temperature_Control( control_params *control, simulation_data *data,
 void Compute_Total_Mass( reax_system *system, simulation_data *data )
 {
     int i;
-    int blocks;
-    int block_size;
-    real    *partial_sums = 0;
 
     data->M = 0;
 
-    for( i = 0; i < system->N; i++ ) 
-        data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass;  
+    for ( i = 0; i < system->N; i++ )
+        data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass;
 
-    data->inv_M = 1. / data->M;    
+    /* NOTE(review): data->M stays 0 when system->N is 0, so this divides by zero -- confirm N > 0 is guaranteed */
+    data->inv_M = 1. / data->M;
 }
 
 
-void Compute_Center_of_Mass( reax_system *system, simulation_data *data, 
-        FILE *fout )
+void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
+                             FILE *fout )
 {
     int i;
     real m, xx, xy, xz, yy, yz, zz, det;
     rvec tvec, diff;
     rtensor mat, inv;
 
-    int blocks;
-    int block_size;
-    rvec *l_xcm, *l_vcm, *l_amcm;
-    real t_start, t_end;
-
     rvec_MakeZero( data->xcm );  // position of CoM
     rvec_MakeZero( data->vcm );  // velocity of CoM
     rvec_MakeZero( data->amcm ); // angular momentum of CoM
     rvec_MakeZero( data->avcm ); // angular velocity of CoM
 
+
     /* Compute the position, velocity and angular momentum about the CoM */
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
 
         rvec_ScaledAdd( data->xcm, m, system->atoms[i].x );
@@ -111,11 +89,13 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
         rvec_ScaledAdd( data->amcm, m, tvec );
 
         /*fprintf( fout,"%3d  %g %g %g\n",
-          i+1, 
+          i+1,
           system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2]  );
-          fprintf( fout, "vcm:  %g %g %g\n", 
-          data->vcm[0], data->vcm[1], data->vcm[2] );  
-         */
+          fprintf( fout, "vcm:  %g %g %g\n",
+          data->vcm[0], data->vcm[1], data->vcm[2] );
+        */
+        /* fprintf( stderr, "amcm: %12.6f %12.6f %12.6f\n",
+           data->amcm[0], data->amcm[1], data->amcm[2] ); */
     }
 
     rvec_Scale( data->xcm, data->inv_M, data->xcm );
@@ -129,7 +109,8 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
     /* Calculate and then invert the inertial tensor */
     xx = xy = xz = yy = yz = zz = 0;
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
 
         rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm );
@@ -138,19 +119,10 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
         xz += diff[0] * diff[2] * m;
         yy += diff[1] * diff[1] * m;
         yz += diff[1] * diff[2] * m;
-        zz += diff[2] * diff[2] * m;      
+        zz += diff[2] * diff[2] * m;
     }
 
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " xx: %f \n", xx);
-    fprintf (stderr, " xy: %f \n", xy);
-    fprintf (stderr, " xz: %f \n", xz);
-    fprintf (stderr, " yy: %f \n", yy);
-    fprintf (stderr, " yz: %f \n", yz);
-    fprintf (stderr, " zz: %f \n", zz);
-#endif
-
-    mat[0][0] = yy + zz;     
+    mat[0][0] = yy + zz;
     mat[0][1] = mat[1][0] = -xy;
     mat[0][2] = mat[2][0] = -xz;
     mat[1][1] = xx + zz;
@@ -158,12 +130,12 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
     mat[2][2] = xx + yy;
 
     /* invert the inertial tensor */
-    det = ( mat[0][0] * mat[1][1] * mat[2][2] + 
-            mat[0][1] * mat[1][2] * mat[2][0] + 
+    det = ( mat[0][0] * mat[1][1] * mat[2][2] +
+            mat[0][1] * mat[1][2] * mat[2][0] +
             mat[0][2] * mat[1][0] * mat[2][1] ) -
-        ( mat[0][0] * mat[1][2] * mat[2][1] + 
-          mat[0][1] * mat[1][0] * mat[2][2] + 
-          mat[0][2] * mat[1][1] * mat[2][0] );
+          ( mat[0][0] * mat[1][2] * mat[2][1] +
+            mat[0][1] * mat[1][0] * mat[2][2] +
+            mat[0][2] * mat[1][1] * mat[2][0] );
 
     inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
     inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
@@ -175,33 +147,33 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
     inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
     inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
 
-    if( fabs(det) > ALMOST_ZERO )
-        rtensor_Scale( inv, 1./det, inv );
-    else 
+    if ( fabs(det) > ALMOST_ZERO )
+        rtensor_Scale( inv, 1. / det, inv );
+    else
         rtensor_MakeZero( inv );
 
     /* Compute the angular velocity about the centre of mass */
-    rtensor_MatVec( data->avcm, inv, data->amcm );  
+    rtensor_MatVec( data->avcm, inv, data->amcm );
     data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
 
 #if defined(DEBUG)
-    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",  
-            data->xcm[0], data->xcm[1], data->xcm[2] );
-    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n", 
-            data->vcm[0], data->vcm[1], data->vcm[2] );
-    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", 
-            data->amcm[0], data->amcm[1], data->amcm[2] );
+    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",
+             data->xcm[0], data->xcm[1], data->xcm[2] );
+    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n",
+             data->vcm[0], data->vcm[1], data->vcm[2] );
+    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n",
+             data->amcm[0], data->amcm[1], data->amcm[2] );
     /* fprintf( fout, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
-       mat[0][0], mat[0][1], mat[0][2], 
-       mat[1][0], mat[1][1], mat[1][2], 
+       mat[0][0], mat[0][1], mat[0][2],
+       mat[1][0], mat[1][1], mat[1][2],
        mat[2][0], mat[2][1], mat[2][2] );
        fprintf( fout, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
-       inv[0][0], inv[0][1], inv[0][2], 
-       inv[1][0], inv[1][1], inv[1][2], 
+       inv[0][0], inv[0][1], inv[0][2],
+       inv[1][0], inv[1][1], inv[1][2],
        inv[2][0], inv[2][1], inv[2][2] );
        fflush( fout ); */
-    fprintf( stderr, "avcm:  %24.15e %24.15e %24.15e\n", 
-            data->avcm[0], data->avcm[1], data->avcm[2] );
+    fprintf( stderr, "avcm:  %24.15e %24.15e %24.15e\n",
+             data->avcm[0], data->avcm[1], data->avcm[2] );
 #endif
 }
 
@@ -214,7 +186,8 @@ void Compute_Kinetic_Energy( reax_system* system, simulation_data* data )
 
     data->E_Kin = 0.0;
 
-    for (i=0; i < system->N; i++) {
+    for (i = 0; i < system->N; i++)
+    {
         m = system->reaxprm.sbp[system->atoms[i].type].mass;
 
         rvec_Scale( p, m, system->atoms[i].v );
@@ -232,17 +205,17 @@ void Compute_Kinetic_Energy( reax_system* system, simulation_data* data )
 }
 
 
-/* IMPORTANT: This function assumes that current kinetic energy and 
- *  the center of mass of the system is already computed before. 
+/* IMPORTANT: This function assumes that current kinetic energy and
+ *  the center of mass of the system is already computed before.
  *
- * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs 
- *  to be added when there are long-range interactions or long-range 
+ * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs
+ *  to be added when there are long-range interactions or long-range
  *  corrections to short-range interactions present.
- *  We may want to add that for more accuracy. 
+ *  We may want to add that for more accuracy.
  */
-void Compute_Pressure_Isotropic( reax_system* system, control_params *control, 
-        simulation_data* data, 
-        output_controls *out_control )
+void Compute_Pressure_Isotropic( reax_system* system, control_params *control,
+                                 simulation_data* data,
+                                 output_controls *out_control )
 {
     int i;
     reax_atom *p_atom;
@@ -254,8 +227,10 @@ void Compute_Pressure_Isotropic( reax_system* system, control_params *control,
     rvec_MakeZero( data->int_press );
 
     // 0: both int and ext, 1: ext only, 2: int only
-    if( control->press_mode == 0 || control->press_mode == 2 ) {
-        for( i = 0; i < system->N; ++i ) {
+    if ( control->press_mode == 0 || control->press_mode == 2 )
+    {
+        for ( i = 0; i < system->N; ++i )
+        {
             p_atom = &( system->atoms[i] );
 
             /* transform x into unitbox coordinates */
@@ -265,13 +240,14 @@ void Compute_Pressure_Isotropic( reax_system* system, control_params *control,
             rvec_Multiply( tmp, p_atom->f, tx );
             rvec_Add( data->int_press, tmp );
 
-            if( out_control->debug_level > 0 ) {
-                fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f", 
-                        i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] );
-                fprintf( out_control->prs, "%8.2f%8.2f%8.2f", 
-                        p_atom->f[0], p_atom->f[1], p_atom->f[2] );
-                fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n", 
-                        data->int_press[0],data->int_press[1],data->int_press[2]);
+            if ( out_control->debug_level > 0 )
+            {
+                fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f",
+                         i + 1, p_atom->x[0], p_atom->x[1], p_atom->x[2] );
+                fprintf( out_control->prs, "%8.2f%8.2f%8.2f",
+                         p_atom->f[0], p_atom->f[1], p_atom->f[2] );
+                fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n",
+                         data->int_press[0], data->int_press[1], data->int_press[2]);
             }
         }
     }
@@ -279,53 +255,53 @@ void Compute_Pressure_Isotropic( reax_system* system, control_params *control,
     /* kinetic contribution */
     data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. * box->volume * P_CONV );
 
-    /* Calculate total pressure in each direction */  
-    data->tot_press[0] = data->kin_press - 
-        ((data->int_press[0] + data->ext_press[0]) /
-         (box->box_norms[1] * box->box_norms[2] * P_CONV));
+    /* Calculate total pressure in each direction */
+    data->tot_press[0] = data->kin_press -
+                         ((data->int_press[0] + data->ext_press[0]) /
+                          (box->box_norms[1] * box->box_norms[2] * P_CONV));
 
-    data->tot_press[1] = data->kin_press - 
-        ((data->int_press[1] + data->ext_press[1])/
-         (box->box_norms[0] * box->box_norms[2] * P_CONV));
+    data->tot_press[1] = data->kin_press -
+                         ((data->int_press[1] + data->ext_press[1]) /
+                          (box->box_norms[0] * box->box_norms[2] * P_CONV));
 
-    data->tot_press[2] = data->kin_press - 
-        ((data->int_press[2] + data->ext_press[2])/
-         (box->box_norms[0] * box->box_norms[1] * P_CONV));
+    data->tot_press[2] = data->kin_press -
+                         ((data->int_press[2] + data->ext_press[2]) /
+                          (box->box_norms[0] * box->box_norms[1] * P_CONV));
 
     /* Average pressure for the whole box */
-    data->iso_bar.P=(data->tot_press[0]+data->tot_press[1]+data->tot_press[2])/3;
+    data->iso_bar.P = (data->tot_press[0] + data->tot_press[1] + data->tot_press[2]) / 3;
 }
 
 
-void Compute_Pressure_Isotropic_Klein( reax_system* system, 
-        simulation_data* data )
+void Compute_Pressure_Isotropic_Klein( reax_system* system,
+                                       simulation_data* data )
 {
     int i;
     reax_atom *p_atom;
     rvec dx;
 
-    // IMPORTANT: This function assumes that current kinetic energy and 
+    // IMPORTANT: This function assumes that current kinetic energy and
     // the center of mass of the system is already computed before.
     data->iso_bar.P = 2.0 * data->E_Kin;
 
-    for( i = 0; i < system->N; ++i )
+    for ( i = 0; i < system->N; ++i )
     {
         p_atom = &( system->atoms[i] );
-        rvec_ScaledSum(dx,1.0,p_atom->x,-1.0,data->xcm);
+        rvec_ScaledSum(dx, 1.0, p_atom->x, -1.0, data->xcm);
         data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) );
     }
 
     data->iso_bar.P /= (3.0 * system->box.volume);
 
-    // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs 
-    // to be added when there are long-range interactions or long-range 
+    // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs
+    // to be added when there are long-range interactions or long-range
     // corrections to short-range interactions present.
     // We may want to add that for more accuracy.
 }
 
 
-void Compute_Pressure( reax_system* system, simulation_data* data, 
-        static_storage *workspace )
+void Compute_Pressure( reax_system* system, simulation_data* data,
+                       static_storage *workspace )
 {
     int i;
     reax_atom *p_atom;
@@ -333,13 +309,14 @@ void Compute_Pressure( reax_system* system, simulation_data* data,
 
     rtensor_MakeZero( data->flex_bar.P );
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         p_atom = &( system->atoms[i] );
         // Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx );
         rvec_OuterProduct( temp, p_atom->v, p_atom->v );
-        rtensor_ScaledAdd( data->flex_bar.P, 
-                system->reaxprm.sbp[ p_atom->type ].mass, temp );
-        // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); 
+        rtensor_ScaledAdd( data->flex_bar.P,
+                           system->reaxprm.sbp[ p_atom->type ].mass, temp );
+        // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x );
         rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp );
     }
 
diff --git a/PuReMD-GPU/src/system_props.h b/PuReMD-GPU/src/system_props.h
index 874132451d02b2d62d87c82065874f04a35b2d37..e2cc98350167a763ac6acdac5807d710403210bd 100644
--- a/PuReMD-GPU/src/system_props.h
+++ b/PuReMD-GPU/src/system_props.h
@@ -28,10 +28,6 @@
 extern "C"  {
 #endif
 
-real Get_Time( );
-
-real Get_Timing_Info( real );
-
 void Temperature_Control( control_params*, simulation_data*, output_controls* );
 
 void Compute_Total_Mass( reax_system*, simulation_data* );
diff --git a/PuReMD-GPU/src/testmd.c b/PuReMD-GPU/src/testmd.c
index 57d8859df4645f982b93803415cb408c57a564f7..b5204950ee733f30cd69c13d28c5cfd6b2200246 100644
--- a/PuReMD-GPU/src/testmd.c
+++ b/PuReMD-GPU/src/testmd.c
@@ -22,20 +22,23 @@
 
 #include "analyze.h"
 #include "box.h"
+#include "control.h"
+#include "ffield.h"
 #include "forces.h"
+#include "geo_tools.h"
 #include "grid.h"
 #include "init_md.h"
 #include "integrate.h"
 #include "neighbors.h"
-#include "param.h"
-#include "pdb_tools.h"
 #include "print_utils.h"
 #include "reset_utils.h"
 #include "restart.h"
 #include "system_props.h"
 #include "traj.h"
+#include "tool_box.h"
 #include "vector.h"
 
+#ifdef HAVE_CUDA
 #include "cuda_environment.h"
 #include "cuda_forces.h"
 #include "cuda_init_md.h"
@@ -43,9 +46,9 @@
 #include "cuda_post_evolve.h"
 #include "cuda_reset_utils.h"
 #include "cuda_system_props.h"
-
 #ifdef __BUILD_DEBUG__
-  #include "validation.h"
+  #include "cuda_validation.h"
+#endif
 #endif
 
 
@@ -69,9 +72,12 @@ int BLOCKS, BLOCKS_POW_2, BLOCK_SIZE;
 int MATVEC_BLOCKS;
 
 
-void Post_Evolve( reax_system* system, control_params* control, 
-        simulation_data* data, static_storage* workspace, 
-        list** lists, output_controls *out_control )
+static void Post_Evolve( reax_system * const system,
+        control_params * const control,
+        simulation_data * const data,
+        static_storage * const workspace,
+        list ** const lists,
+        output_controls * const out_control )
 {
     int i;
     rvec diff, cross;
@@ -110,15 +116,27 @@ void Post_Evolve( reax_system* system, control_params* control,
 }
 
 
-void Read_System( char *geof, char *ff, char *ctrlf, 
-        reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        output_controls *out_control )
+static void Read_System( char * const geo_file,
+        char * const ffield_file,
+        char * const control_file,
+        reax_system * const system,
+        control_params * const control,
+        simulation_data * const data,
+        static_storage * const workspace,
+        output_controls * const out_control )
 {
     FILE *ffield, *ctrl;
 
-    ffield = fopen( ff, "r" );
-    ctrl = fopen( ctrlf, "r" );
+    if ( (ffield = fopen( ffield_file, "r" )) == NULL )
+    {
+        fprintf( stderr, "Error opening the ffield file!\n" );
+        exit( FILE_NOT_FOUND );
+    }
+    if ( (ctrl = fopen( control_file, "r" )) == NULL )
+    {
+        fprintf( stderr, "Error opening the control file!\n" );
+        exit( FILE_NOT_FOUND );
+    }
 
     /* ffield file */
     Read_Force_Field( ffield, &(system->reaxprm) );
@@ -127,32 +145,31 @@ void Read_System( char *geof, char *ff, char *ctrlf,
     Read_Control_File( ctrl, system, control, out_control );
 
     /* geo file */
-    if( control->geo_format == XYZ )
+    if( control->geo_format == CUSTOM )
     {
-        fprintf( stderr, "xyz input is not implemented yet\n" );
-        exit( 1 );
+        Read_Geo( geo_file, system, control, data, workspace );
     }
     else if( control->geo_format == PDB ) 
     {
-        Read_PDB( geof, system, control, data, workspace );
+        Read_PDB( geo_file, system, control, data, workspace );
     }
     else if( control->geo_format == BGF ) 
     {
-        Read_BGF( geof, system, control, data, workspace );
+        Read_BGF( geo_file, system, control, data, workspace );
     }
     else if( control->geo_format == ASCII_RESTART )
     {
-        Read_ASCII_Restart( geof, system, control, data, workspace );
+        Read_ASCII_Restart( geo_file, system, control, data, workspace );
         control->restart = 1;
     }
     else if( control->geo_format == BINARY_RESTART ) {
-        Read_Binary_Restart( geof, system, control, data, workspace );
+        Read_Binary_Restart( geo_file, system, control, data, workspace );
         control->restart = 1;
     }
     else
     {
         fprintf( stderr, "unknown geo file format. terminating!\n" );
-        exit( 1 );
+        exit( INVALID_GEO );
     }  
 
 #if defined(DEBUG_FOCUS)
@@ -172,7 +189,14 @@ void Init_Data_Structures( simulation_data *data )
 }
 
 
-int main( int argc, char* argv[] )
+static void usage(char* argv[])
+{
+    fprintf(stderr, "usage: %s geometry ffield control\n", argv[0]);
+}
+
+
+#ifdef HAVE_CUDA
+static void gpu_main( int argc, char* argv[] )
 {
     reax_system system;
     control_params control;
@@ -183,7 +207,6 @@ int main( int argc, char* argv[] )
     evolve_function Evolve;
     evolve_function Cuda_Evolve;
     int steps;
-
     real t_start, t_elapsed;
     real *results = NULL;
 
@@ -259,8 +282,8 @@ int main( int argc, char* argv[] )
 #ifdef __BUILD_DEBUG__
     if( !validate_device (&system, &data, &workspace, &lists) )
     {
-        fprintf (stderr, " Results does not match between Device and host @ step --> %d \n", data.step);
-        exit (1);
+        fprintf( stderr, " Results does not match between Device and host @ step --> %d \n", data.step );
+        exit( 1 );
     }
 #endif
 
@@ -331,6 +354,92 @@ int main( int argc, char* argv[] )
     fprintf( out_control.log, "total: %.2f secs\n", data.timing.elapsed );
 
     Cleanup_Cuda_Environment( );
+}
+
+
+#else
+static void cpu_main( int argc, char* argv[] )
+{
+    reax_system system;
+    control_params control;
+    simulation_data data;
+    static_storage workspace;
+    list *lists;
+    output_controls out_control;
+    evolve_function Evolve;
+    int steps;
+
+    if ( argc != 4 )
+    {
+        usage(argv);
+        exit( INVALID_INPUT );
+    }
+
+    lists = (list*) malloc( sizeof(list) * LIST_N );
+
+    Read_System( argv[1], argv[2], argv[3], &system, &control,
+            &data, &workspace, &out_control );
+
+    Initialize( &system, &control, &data, &workspace, &lists,
+            &out_control, &Evolve );
+
+    /* compute f_0 */
+    //if( control.restart == 0 ) {
+    Reset( &system, &control, &data, &workspace, &lists );
+    Generate_Neighbor_Lists( &system, &control, &data, &workspace,
+            &lists, &out_control );
+
+    //fprintf( stderr, "total: %.2f secs\n", data.timing.nbrs);
+    Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control);
+    Compute_Kinetic_Energy( &system, &data );
+    Output_Results(&system, &control, &data, &workspace, &lists, &out_control);
+    ++data.step;
+    //}
+
+
+    for ( ; data.step <= control.nsteps; data.step++ )
+    {
+        if ( control.T_mode )
+        {
+            Temperature_Control( &control, &data, &out_control );
+        }
+        Evolve( &system, &control, &data, &workspace, &lists, &out_control );
+        Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
+        Output_Results(&system, &control, &data, &workspace, &lists, &out_control);
+        Analysis( &system, &control, &data, &workspace, &lists, &out_control );
+
+        steps = data.step - data.prev_steps;
+        if ( steps && out_control.restart_freq &&
+                steps % out_control.restart_freq == 0 )
+            Write_Restart( &system, &control, &data, &workspace, &out_control );
+    }
+
+    if ( out_control.write_steps > 0 )
+    {
+        fclose( out_control.trj );
+        Write_PDB( &system, &(lists[BONDS]), &data, &control, &workspace, &out_control );
+    }
+
+    data.timing.end = Get_Time( );
+    data.timing.elapsed = Get_Timing_Info( data.timing.start );
+    fprintf( out_control.log, "total: %.2f secs\n", data.timing.elapsed );
+}
+#endif
+
+
+int main( int argc, char* argv[] )
+{
+    if ( argc != 4 )
+    {
+        usage(argv);
+        exit( INVALID_INPUT );
+    }
+
+#ifdef HAVE_CUDA
+    gpu_main( argc, argv );
+#else
+    cpu_main( argc, argv );
+#endif
 
-    return 0;
+    return SUCCESS;
 }
diff --git a/PuReMD-GPU/src/three_body_interactions.c b/PuReMD-GPU/src/three_body_interactions.c
index 7ac96e057c6c799ba88204f3f6339fe54b3c61da..f128d2a2749ead3d5b9a08e47f45c0538255caac 100644
--- a/PuReMD-GPU/src/three_body_interactions.c
+++ b/PuReMD-GPU/src/three_body_interactions.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -21,59 +22,63 @@
 #include "three_body_interactions.h"
 
 #include "bond_orders.h"
+#include "index_utils.h"
 #include "list.h"
 #include "lookup.h"
 #include "vector.h"
-#include "index_utils.h"
 
 
 /* calculates the theta angle between i-j-k */
-void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
+void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
         real *theta, real *cos_theta )
 {
-    (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk );
-    if( *cos_theta > 1. ) *cos_theta  = 1.0;
-    if( *cos_theta < -1. ) *cos_theta  = -1.0;
+    (*cos_theta) = rvec_Dot( dvec_ji, dvec_jk ) / ( d_ji * d_jk );
+    if ( *cos_theta > 1. )
+    {
+        *cos_theta  = 1.0;
+    }
+    if ( *cos_theta < -1. )
+    {
+        *cos_theta  = -1.0;
+    }
 
     (*theta) = ACOS( *cos_theta );
 }
 
 
 /* calculates the derivative of the cosine of the angle between i-j-k */
-void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
-        rvec* dcos_theta_di, rvec* dcos_theta_dj, 
-        rvec* dcos_theta_dk )
+void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
+        rvec* dcos_theta_di, rvec* dcos_theta_dj, rvec* dcos_theta_dk )
 {
     int  t;
     real sqr_d_ji   = SQR(d_ji);
     real sqr_d_jk   = SQR(d_jk);
     real inv_dists  = 1.0 / (d_ji * d_jk);
     real inv_dists3 = POW( inv_dists, 3 );
-    real dot_dvecs  = Dot( dvec_ji, dvec_jk, 3 );
+    real dot_dvecs  = rvec_Dot( dvec_ji, dvec_jk );
     real Cdot_inv3  = dot_dvecs * inv_dists3;
 
-    for( t = 0; t < 3; ++t ) {
-        (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - 
-            Cdot_inv3 * sqr_d_jk * dvec_ji[t];
+    for ( t = 0; t < 3; ++t )
+    {
+        (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - Cdot_inv3 * sqr_d_jk * dvec_ji[t];
 
-        (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists +
-            Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] );
+        (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists
+            + Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] );
 
-        (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - 
-            Cdot_inv3 * sqr_d_ji * dvec_jk[t];
+        (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - Cdot_inv3 * sqr_d_ji * dvec_jk[t];
     }
 
-    /*fprintf( stderr, 
+    /*fprintf( stderr,
       "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
       dvec_jk[t] * inv_dists*/
 }
 
 
-/* this is a 3-body interaction in which the main role is 
+/* this is a 3-body interaction in which the main role is
    played by j which sits in the middle of the other two. */
-void Three_Body_Interactions( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists, output_controls *out_control )
+void Three_Body_Interactions( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
     int  i, j, pi, k, pk, t;
     int  type_i, type_j, type_k;
@@ -123,7 +128,8 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
     p_val10 = system->reaxprm.gp.l[17];
     num_thb_intrs = 0;
 
-    for( j = 0; j < system->N; ++j ) {
+    for ( j = 0; j < system->N; ++j )
+    {
         // fprintf( out_control->eval, "j: %d\n", j );
         type_j = system->atoms[j].type;
         start_j = Start_Index(j, bonds);
@@ -133,21 +139,24 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
         p_val5 = system->reaxprm.sbp[ type_j ].p_val5;
 
         SBOp = 0, prod_SBO = 1;
-        for( t = start_j; t < end_j; ++t ) {
+        for ( t = start_j; t < end_j; ++t )
+        {
             bo_jt = &(bond_list[t].bo_data);
             SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
             temp = SQR( bo_jt->BO );
-            temp *= temp; 
+            temp *= temp;
             temp *= temp;
             prod_SBO *= EXP( -temp );
         }
 
         /* modifications to match Adri's code - 09/01/09 */
-        if( workspace->vlpex[j] >= 0 ){
+        if ( workspace->vlpex[j] >= 0 )
+        {
             vlpadj = 0;
             dSBO2 = prod_SBO - 1;
         }
-        else{
+        else
+        {
             vlpadj = workspace->nlp[j];
             dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
         }
@@ -155,65 +164,67 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
         SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
         dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
 
-        if( SBO <= 0 )
+        if ( SBO <= 0 )
             SBO2 = 0, CSBO2 = 0;
-        else if( SBO > 0 && SBO <= 1 ) {
+        else if ( SBO > 0 && SBO <= 1 )
+        {
             SBO2 = POW( SBO, p_val9 );
             CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
         }
-        else if( SBO > 1 && SBO < 2 ) {
-            SBO2 = 2 - POW( 2-SBO, p_val9 );
+        else if ( SBO > 1 && SBO < 2 )
+        {
+            SBO2 = 2 - POW( 2 - SBO, p_val9 );
             CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
         }
-        else 
-            SBO2 = 2, CSBO2 = 0;  
+        else
+            SBO2 = 2, CSBO2 = 0;
 
         expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
 
-        /* unlike 2-body intrs where we enforce i<j, we cannot put any such 
-           restrictions here. such a restriction would prevent us from producing 
+        /* unlike 2-body intrs where we enforce i<j, we cannot put any such
+           restrictions here. such a restriction would prevent us from producing
            all 4-body intrs correctly */
-        for( pi = start_j; pi < end_j; ++pi ) {
+        for ( pi = start_j; pi < end_j; ++pi )
+        {
             Set_Start_Index( pi, num_thb_intrs, thb_intrs );
-
             pbond_ij = &(bond_list[pi]);
             bo_ij = &(pbond_ij->bo_data);
             BOA_ij = bo_ij->BO - control->thb_cut;
 
 
-            if( BOA_ij/*bo_ij->BO*/ > (real) 0.0 ) {
+            if ( BOA_ij/*bo_ij->BO*/ > 0.0 )
+            {
                 i = pbond_ij->nbr;
-                r_ij = pbond_ij->d;     
+                r_ij = pbond_ij->d;
                 type_i = system->atoms[i].type;
                 // fprintf( out_control->eval, "i: %d\n", i );
 
 
                 /* first copy 3-body intrs from previously computed ones where i>k.
-                   IMPORTANT: if it is less costly to compute theta and its 
-                   derivative, we should definitely re-compute them, 
+                   IMPORTANT: if it is less costly to compute theta and its
+                   derivative, we should definitely re-compute them,
                    instead of copying!
-                   in the second for-loop below, we compute only new 3-body intrs 
+                   in the second for-loop below, we compute only new 3-body intrs
                    where i < k */
-                for( pk = start_j; pk < pi; ++pk ) {
+                for ( pk = start_j; pk < pi; ++pk )
+                {
                     // fprintf( out_control->eval, "pk: %d\n", pk );
                     start_pk = Start_Index( pk, thb_intrs );
                     end_pk = End_Index( pk, thb_intrs );
 
-                    for( t = start_pk; t < end_pk; ++t )
-                        if( thb_list[t].thb == i ) {
+                    for ( t = start_pk; t < end_pk; ++t )
+                        if ( thb_list[t].thb == i )
+                        {
                             p_ijk = &(thb_list[num_thb_intrs]);
                             p_kji = &(thb_list[t]);
 
                             p_ijk->thb = bond_list[pk].nbr;
                             p_ijk->pthb  = pk;
-                            p_ijk->theta = p_kji->theta;              
+                            p_ijk->theta = p_kji->theta;
                             rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
                             rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
                             rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
 
-                            //if (j == 12)
-                            //fprintf (stderr, "Adding one for matched atom %d \n", i);
-
                             ++num_thb_intrs;
                             break;
                         }
@@ -221,7 +232,8 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
 
 
                 /* and this is the second for loop mentioned above */
-                for( pk = pi+1; pk < end_j; ++pk ) {
+                for ( pk = pi + 1; pk < end_j; ++pk )
+                {
                     pbond_jk = &(bond_list[pk]);
                     bo_jk    = &(pbond_jk->bo_data);
                     BOA_jk   = bo_jk->BO - control->thb_cut;
@@ -229,53 +241,55 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
                     type_k   = system->atoms[k].type;
                     p_ijk    = &( thb_list[num_thb_intrs] );
 
-                    //TODO - CHANGE ORIGINAL
+                    //CHANGE ORIGINAL: skip bond j-k when its bond order does not exceed thb_cut
                     if (BOA_jk <= 0) continue;
+                    //CHANGE ORIGINAL
 
-                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
-                            pbond_jk->dvec, pbond_jk->d,
-                            &theta, &cos_theta );
 
-                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
-                            pbond_jk->dvec, pbond_jk->d, 
-                            &(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
-                            &(p_ijk->dcos_dk) );
+                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d,
+                                     pbond_jk->dvec, pbond_jk->d,
+                                     &theta, &cos_theta );
+
+                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d,
+                                          pbond_jk->dvec, pbond_jk->d,
+                                          &(p_ijk->dcos_di), &(p_ijk->dcos_dj),
+                                          &(p_ijk->dcos_dk) );
 
                     p_ijk->thb = k;
                     p_ijk->pthb = pk;
                     p_ijk->theta = theta;
 
-                    //if (j == 12)
-                    //fprintf (stderr, "Adding one for the rest %d \n", k);
-
                     sin_theta = SIN( theta );
-                    if( sin_theta < 1.0e-5 )
+                    if ( sin_theta < 1.0e-5 )
                         sin_theta = 1.0e-5;
 
                     ++num_thb_intrs;
 
 
-                    if( BOA_jk > 0.0 && 
-                            (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) {
-                        r_jk = pbond_jk->d;              
+                    if ( BOA_jk > 0.0 &&
+                            (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/)
+                    {
+                        r_jk = pbond_jk->d;
                         thbh = &( system->reaxprm.thbp[ index_thbp(type_i,type_j,type_k,system->reaxprm.num_atom_types) ] );
                         flag = 0;
 
                         /* if( workspace->orig_id[i] < workspace->orig_id[k] )
-                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n",
                            workspace->orig_id[i], workspace->orig_id[j],
                            workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta );
-                           else 
-                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                           else
+                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n",
                            workspace->orig_id[k], workspace->orig_id[j],
                            workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
 
 
-                        for( cnt = 0; cnt < thbh->cnt; ++cnt ) {
-                            // fprintf( out_control->eval, 
+                        for ( cnt = 0; cnt < thbh->cnt; ++cnt )
+                        {
+                            // fprintf( out_control->eval,
                             // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 );
 
-                            if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
+                            if ( fabs(thbh->prm[cnt].p_val1) > 0.001 )
+                            {
                                 thbp = &( thbh->prm[cnt] );
 
                                 /* ANGLE ENERGY */
@@ -287,27 +301,27 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
 
                                 exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
                                 f7_ij = 1.0 - exp3ij;
-                                Cf7ij = p_val3 * p_val4 * 
-                                    POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
+                                Cf7ij = p_val3 * p_val4 *
+                                        POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
 
                                 exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
                                 f7_jk = 1.0 - exp3jk;
-                                Cf7jk = p_val3 * p_val4 * 
-                                    POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
+                                Cf7jk = p_val3 * p_val4 *
+                                        POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
 
                                 expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
                                 trm8 = 1.0 + expval6 + expval7;
                                 f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
                                 Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
-                                    (p_val6 * expval6 * trm8 - 
-                                     (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
+                                       (p_val6 * expval6 * trm8 -
+                                        (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
 
-                                theta_0 = 180.0 - 
-                                    theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
-                                theta_0 = DEG2RAD( theta_0 );              
+                                theta_0 = 180.0 -
+                                          theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
+                                theta_0 = DEG2RAD( theta_0 );
 
-                                expval2theta  = EXP(-p_val2 * SQR(theta_0-theta));
-                                if( p_val1 >= 0 )
+                                expval2theta  = EXP(-p_val2 * SQR(theta_0 - theta));
+                                if ( p_val1 >= 0 )
                                     expval12theta = p_val1 * (1.0 - expval2theta);
                                 else // To avoid linear Me-H-Me angles (6/6/06)
                                     expval12theta = p_val1 * -expval2theta;
@@ -315,11 +329,11 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
                                 CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
                                 CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
                                 CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
-                                CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * 
-                                    expval2theta * (theta_0 - theta);
+                                CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj *
+                                         expval2theta * (theta_0 - theta);
 
-                                Ctheta_0 = p_val10 * DEG2RAD(theta_00) * 
-                                    exp( -p_val10 * (2.0 - SBO2) );
+                                Ctheta_0 = p_val10 * DEG2RAD(theta_00) *
+                                           exp( -p_val10 * (2.0 - SBO2) );
 
                                 CEval5 = -CEval4 * Ctheta_0 * CSBO2;
                                 CEval6 = CEval5 * dSBO1;
@@ -342,13 +356,13 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
                                 exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
                                 trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
                                 f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
-                                Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - 
+                                Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 -
                                         (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 +
-                                            p_pen4 * exp_pen4 )) /
-                                    SQR( trm_pen34 );
+                                                             p_pen4 * exp_pen4 )) /
+                                       SQR( trm_pen34 );
 
-                                data->E_Pen += e_pen = 
-                                    p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
+                                data->E_Pen += e_pen =
+                                                   p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
 
                                 CEpen1 = e_pen * Cf9j / f9_Dj;
                                 temp   = -2.0 * p_pen2 * e_pen;
@@ -364,66 +378,64 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
                                 p_coa4 = system->reaxprm.gp.l[30];
 
                                 exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
-                                data->E_Coa += e_coa = 
-                                    p_coa1 / (1. + exp_coa2) *
-                                    EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * 
-                                    EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * 
-                                    EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * 
-                                    EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
+                                data->E_Coa += e_coa =
+                                                   p_coa1 / (1. + exp_coa2) *
+                                                   EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) *
+                                                   EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) *
+                                                   EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) *
+                                                   EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
 
                                 CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
                                 CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
-                                CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2);
-                                CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa;
-                                CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa;
+                                CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2);
+                                CEcoa4 = -2 * p_coa3 * (total_bo[i] - BOA_ij) * e_coa;
+                                CEcoa5 = -2 * p_coa3 * (total_bo[k] - BOA_jk) * e_coa;
                                 /* END COALITION ENERGY */
 
                                 /* FORCES */
-                                bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4));
-                                bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5));
-                                workspace->CdDelta[j] += ((CEval3 + CEval7) + 
-                                        CEpen1 + CEcoa3);
+                                bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
+                                bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
+                                workspace->CdDelta[j] += ((CEval3 + CEval7) +
+                                                          CEpen1 + CEcoa3);
                                 workspace->CdDelta[i] += CEcoa4;
-                                workspace->CdDelta[k] += CEcoa5;              
+                                workspace->CdDelta[k] += CEcoa5;
 
-                                for( t = start_j; t < end_j; ++t ) {
+                                for ( t = start_j; t < end_j; ++t )
+                                {
                                     pbond_jt = &( bond_list[t] );
                                     bo_jt = &(pbond_jt->bo_data);
                                     temp_bo_jt = bo_jt->BO;
                                     temp = CUBE( temp_bo_jt );
-                                    pBOjt7 = temp * temp * temp_bo_jt; 
+                                    pBOjt7 = temp * temp * temp_bo_jt;
 
-                                    // fprintf( out_control->eval, "%6d%12.8f\n", 
-                                    // workspace->orig_id[ bond_list[t].nbr ], 
+                                    // fprintf( out_control->eval, "%6d%12.8f\n",
+                                    // workspace->orig_id[ bond_list[t].nbr ],
                                     //    (CEval6 * pBOjt7) );
 
                                     bo_jt->Cdbo += (CEval6 * pBOjt7);
                                     bo_jt->Cdbopi += CEval5;
                                     bo_jt->Cdbopi2 += CEval5;
-                                }              
-
+                                }
 
-                                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
 
+                                if ( control->ensemble == NVE || control->ensemble == NVT  || control->ensemble == bNVT)
+                                {
                                     rvec_ScaledAdd( system->atoms[i].f, CEval8, p_ijk->dcos_di );
                                     rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj );
                                     rvec_ScaledAdd( system->atoms[k].f, CEval8, p_ijk->dcos_dk );
-
-                                    /*
-                                       if (i == 0) fprintf (stderr, " atom %d adding to i (j) = 0\n", j);
-                                       if (k == 0) fprintf (stderr, " atom %d adding to i (k) = 0\n", j);
-                                     */
                                 }
-                                else {
+                                else
+                                {
                                     /* terms not related to bond order derivatives
-                                       are added directly into 
+                                       are added directly into
                                        forces and pressure vector/tensor */
                                     rvec_Scale( force, CEval8, p_ijk->dcos_di );
                                     rvec_Add( system->atoms[i].f, force );
                                     rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
                                     rvec_Add( data->ext_press, ext_press );
 
-                                    rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj );
+                                    rvec_ScaledAdd( system->atoms[j].f,
+                                                    CEval8, p_ijk->dcos_dj );
 
                                     rvec_Scale( force, CEval8, p_ijk->dcos_dk );
                                     rvec_Add( system->atoms[k].f, force );
@@ -432,96 +444,97 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
 
 
                                     /* This part is for a fully-flexible box */
-                                    /* rvec_OuterProduct( temp_rtensor, 
+                                    /* rvec_OuterProduct( temp_rtensor,
                                        p_ijk->dcos_di, system->atoms[i].x );
                                        rtensor_Scale( total_rtensor, +CEval8, temp_rtensor );
 
-                                       rvec_OuterProduct( temp_rtensor, 
+                                       rvec_OuterProduct( temp_rtensor,
                                        p_ijk->dcos_dj, system->atoms[j].x );
                                        rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
 
-                                       rvec_OuterProduct( temp_rtensor, 
+                                       rvec_OuterProduct( temp_rtensor,
                                        p_ijk->dcos_dk, system->atoms[k].x );
                                        rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
 
                                        if( pbond_ij->imaginary || pbond_jk->imaginary )
-                                       rtensor_ScaledAdd( data->flex_bar.P, 
+                                       rtensor_ScaledAdd( data->flex_bar.P,
                                        -1.0, total_rtensor );
                                        else
                                        rtensor_Add( data->flex_bar.P, total_rtensor ); */
                                 }
 
 #ifdef TEST_ENERGY
-                                fprintf( out_control->eval, 
-                                        //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
-                                        "%6d%6d%6d%23.15e%23.15e%23.15e\n",
-                                        i+1, j+1, k+1,
-                                        //workspace->orig_id[i]+1,  
-                                        //workspace->orig_id[j]+1,
-                                        //workspace->orig_id[k]+1,
-                                        //workspace->Delta_boc[j], 
-                                        RAD2DEG(theta), /*BOA_ij, BOA_jk, */
-                                        e_ang, data->E_Ang );
-
-                                /*fprintf( out_control->eval, 
+                                fprintf( out_control->eval,
+                                         //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
+                                         "%6d%6d%6d%23.15e%23.15e%23.15e\n",
+                                         i + 1, j + 1, k + 1,
+                                         //workspace->orig_id[i]+1,
+                                         //workspace->orig_id[j]+1,
+                                         //workspace->orig_id[k]+1,
+                                         //workspace->Delta_boc[j],
+                                         RAD2DEG(theta), /*BOA_ij, BOA_jk, */
+                                         e_ang, data->E_Ang );
+
+                                /*fprintf( out_control->eval,
                                   "%23.15e%23.15e%23.15e%23.15e",
                                   p_val3, p_val4, BOA_ij, BOA_jk );
-                                  fprintf( out_control->eval, 
+                                  fprintf( out_control->eval,
                                   "%23.15e%23.15e%23.15e%23.15e",
                                   f7_ij, f7_jk, f8_Dj, expval12theta );
-                                  fprintf( out_control->eval, 
+                                  fprintf( out_control->eval,
                                   "%23.15e%23.15e%23.15e%23.15e%23.15e\n",
                                   CEval1, CEval2, CEval3, CEval4, CEval5
-                                //CEval6, CEval7, CEval8  );*/
+                                  //CEval6, CEval7, CEval8  );*/
 
-                                /*fprintf( out_control->eval, 
+                                /*fprintf( out_control->eval,
                                   "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                  -p_ijk->dcos_di[0]/sin_theta, 
-                                  -p_ijk->dcos_di[1]/sin_theta, 
-                                  -p_ijk->dcos_di[2]/sin_theta, 
-                                  -p_ijk->dcos_dj[0]/sin_theta, 
-                                  -p_ijk->dcos_dj[1]/sin_theta, 
-                                  -p_ijk->dcos_dj[2]/sin_theta, 
-                                  -p_ijk->dcos_dk[0]/sin_theta, 
-                                  -p_ijk->dcos_dk[1]/sin_theta, 
+                                  -p_ijk->dcos_di[0]/sin_theta,
+                                  -p_ijk->dcos_di[1]/sin_theta,
+                                  -p_ijk->dcos_di[2]/sin_theta,
+                                  -p_ijk->dcos_dj[0]/sin_theta,
+                                  -p_ijk->dcos_dj[1]/sin_theta,
+                                  -p_ijk->dcos_dj[2]/sin_theta,
+                                  -p_ijk->dcos_dk[0]/sin_theta,
+                                  -p_ijk->dcos_dk[1]/sin_theta,
                                   -p_ijk->dcos_dk[2]/sin_theta );*/
 
-                                /* fprintf( out_control->epen, 
-                                   "%23.15e%23.15e%23.15e\n", 
+                                /* fprintf( out_control->epen,
+                                   "%23.15e%23.15e%23.15e\n",
                                    CEpen1, CEpen2, CEpen3 );
-                                   fprintf( out_control->epen, 
+                                   fprintf( out_control->epen,
                                    "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
                                    workspace->orig_id[i],  workspace->orig_id[j],
-                                   workspace->orig_id[k], RAD2DEG(theta), 
+                                   workspace->orig_id[k], RAD2DEG(theta),
                                    BOA_ij, BOA_jk, e_pen, data->E_Pen ); */
 
-                                fprintf( out_control->ecoa, 
-                                        "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                        workspace->orig_id[i], 
-                                        workspace->orig_id[j],
-                                        workspace->orig_id[k], 
-                                        RAD2DEG(theta), BOA_ij, BOA_jk, 
-                                        e_coa, data->E_Coa );
+                                fprintf( out_control->ecoa,
+                                         "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                         workspace->orig_id[i],
+                                         workspace->orig_id[j],
+                                         workspace->orig_id[k],
+                                         RAD2DEG(theta), BOA_ij, BOA_jk,
+                                         e_coa, data->E_Coa );
 #endif
 
 #ifdef TEST_FORCES            /* angle forces */
                                 Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
                                 Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
-                                Add_dDelta( system, lists, 
-                                        j, CEval3 + CEval7, workspace->f_ang );
+                                Add_dDelta( system, lists,
+                                            j, CEval3 + CEval7, workspace->f_ang );
 
-                                for( t = start_j; t < end_j; ++t ) {
+                                for ( t = start_j; t < end_j; ++t )
+                                {
                                     pbond_jt = &( bond_list[t] );
                                     bo_jt = &(pbond_jt->bo_data);
                                     temp_bo_jt = bo_jt->BO;
                                     temp = CUBE( temp_bo_jt );
-                                    pBOjt7 = temp * temp * temp_bo_jt; 
+                                    pBOjt7 = temp * temp * temp_bo_jt;
 
                                     Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
-                                            workspace->f_ang );
-                                    Add_dBOpinpi2( system, lists, j, t, 
-                                            CEval5, CEval5, 
-                                            workspace->f_ang, workspace->f_ang );
+                                             workspace->f_ang );
+                                    Add_dBOpinpi2( system, lists, j, t,
+                                                   CEval5, CEval5,
+                                                   workspace->f_ang, workspace->f_ang );
                                 }
 
                                 rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
@@ -536,10 +549,10 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
                                 /* end penalty forces */
 
                                 /* coalition forces */
-                                Add_dBO( system, lists, 
-                                        j, pi, CEcoa1-CEcoa4, workspace->f_coa );
-                                Add_dBO( system, lists, 
-                                        j, pk, CEcoa2-CEcoa5, workspace->f_coa );
+                                Add_dBO( system, lists,
+                                         j, pi, CEcoa1 - CEcoa4, workspace->f_coa );
+                                Add_dBO( system, lists,
+                                         j, pk, CEcoa2 - CEcoa5, workspace->f_coa );
                                 Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
                                 Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
                                 Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
@@ -555,32 +568,36 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
         }
     }
 
-    if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) {
+
+    if ( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE )
+    {
         workspace->realloc.num_3body = num_thb_intrs;
-        if( num_thb_intrs > thb_intrs->num_intrs ) {
+        if ( num_thb_intrs > thb_intrs->num_intrs )
+        {
             fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d",
-                    data->step, num_thb_intrs, thb_intrs->num_intrs );
-            exit( INSUFFICIENT_SPACE );
+                     data->step, num_thb_intrs, thb_intrs->num_intrs );
+            exit( INSUFFICIENT_MEMORY );
         }
     }
 
-    //fprintf( stderr,"%d: Number of angle interactions: %d\n", 
+    //fprintf( stderr,"%d: Number of angle interactions: %d\n",
     // data->step, num_thb_intrs );
 #ifdef TEST_ENERGY
-    fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs );
+    fprintf( stderr, "Number of angle interactions: %d\n", num_thb_intrs );
 
-    fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n",
-            data->E_Ang, data->E_Pen, data->E_Coa );
+    fprintf( stderr, "Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n",
+             data->E_Ang, data->E_Pen, data->E_Coa );
 
-    fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", 
-            data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+    fprintf( stderr, "3body: ext_press (%23.15e %23.15e %23.15e)\n",
+             data->ext_press[0], data->ext_press[1], data->ext_press[2] );
 #endif
 }
 
 
-void Hydrogen_Bonds( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+
+void Hydrogen_Bonds( reax_system *system, control_params *control,
+                     simulation_data *data, static_storage *workspace,
+                     list **lists, output_controls *out_control )
 {
     int i, j, k, pi, pk, itr, top;
     int type_i, type_j, type_k;
@@ -610,10 +627,11 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
     /* loops below discover the Hydrogen bonds between i-j-k triplets.
        here j is H atom and there has to be some bond between i and j.
        Hydrogen bond is between j and k.
-       so in this function i->X, j->H, k->Z when we map 
+       so in this function i->X, j->H, k->Z when we map
        variables onto the ones in the handout.*/
-    for( j = 0; j < system->N; ++j )
-        if( system->reaxprm.sbp[system->atoms[j].type].p_hbond==1 ) {// j must be H
+    for ( j = 0; j < system->N; ++j )
+        if ( system->reaxprm.sbp[system->atoms[j].type].p_hbond == 1 ) // j must be H
+        {
             /*set j's variables */
             type_j  = system->atoms[j].type;
             start_j = Start_Index(j, bonds);
@@ -622,21 +640,23 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
             hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
 
             top = 0;
-            for( pi = start_j; pi < end_j; ++pi ) {
+            for ( pi = start_j; pi < end_j; ++pi )
+            {
                 pbond_ij = &( bond_list[pi] );
                 i = pbond_ij->nbr;
                 bo_ij = &(pbond_ij->bo_data);
                 type_i = system->atoms[i].type;
 
-                if( system->reaxprm.sbp[type_i].p_hbond == 2 && 
+                if ( system->reaxprm.sbp[type_i].p_hbond == 2 &&
                         bo_ij->BO >= HB_THRESHOLD )
                     hblist[top++] = pi;
             }
 
-            // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
+            // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n",
             //          j, top, hb_start_j, hb_end_j );
 
-            for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
+            for ( pk = hb_start_j; pk < hb_end_j; ++pk )
+            {
                 /* set k's varibles */
                 k = hbond_list[pk].nbr;
                 type_k = system->atoms[k].type;
@@ -644,69 +664,59 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
                 r_jk = nbr_jk->d;
                 rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
 
-                for( itr=0; itr < top; ++itr ) {
+                for ( itr = 0; itr < top; ++itr )
+                {
                     pi = hblist[itr];
                     pbond_ij = &( bond_list[pi] );
                     i = pbond_ij->nbr;
 
-                    if( i != k ) {
+                    if ( i != k )
+                    {
                         bo_ij = &(pbond_ij->bo_data);
                         type_i = system->atoms[i].type;
-                        r_ij = pbond_ij->d;         
+                        r_ij = pbond_ij->d;
                         hbp = &(system->reaxprm.hbp[ index_hbp(type_i, type_j, type_k, system->reaxprm.num_atom_types) ]);
                         ++num_hb_intrs;
 
-                        Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                &theta, &cos_theta );
+                        Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, &theta, &cos_theta );
                         /* the derivative of cos(theta) */
                         Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                &dcos_theta_di, &dcos_theta_dj, 
-                                &dcos_theta_dk );
+                                &dcos_theta_di, &dcos_theta_dj, &dcos_theta_dk );
 
                         /* hydrogen bond energy*/
-                        sin_theta2 = SIN( theta/2.0 );
+                        sin_theta2 = SIN( theta / 2.0 );
                         sin_xhz4 = SQR(sin_theta2);
                         sin_xhz4 *= sin_xhz4;
                         cos_xhz1 = ( 1.0 - cos_theta );
                         exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
-                        exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
-                                    r_jk / hbp->r0_hb - 2.0 ) );
+                        exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk
+                                    + r_jk / hbp->r0_hb - 2.0 ) );
 
-                        data->E_HB += e_hb = 
-                            hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+                        data->E_HB += e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
 
-                        CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
-                        CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
-                        CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
-                                1.0 / hbp->r0_hb);
+                        CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
+                        CEhb2 = -hbp->p_hb1 / 2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
+                        CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb);
 
                         /* hydrogen bond forces */
                         bo_ij->Cdbo += CEhb1;   // dbo term
 
-                        if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
-                            rvec_ScaledAdd( system->atoms[i].f, 
-                                    +CEhb2, dcos_theta_di ); //dcos terms
-                            rvec_ScaledAdd( system->atoms[j].f, 
-                                    +CEhb2, dcos_theta_dj );
-
-
-
-
-                            //TODO
-                            rvec_ScaledAdd( system->atoms[k].f, 
-                                    +CEhb2, dcos_theta_dk );
-
+                        if ( control->ensemble == NVE || control->ensemble == NVT  || control->ensemble == bNVT)
+                        {
+                            rvec_ScaledAdd( system->atoms[i].f,
+                                            +CEhb2, dcos_theta_di ); //dcos terms
+                            rvec_ScaledAdd( system->atoms[j].f,
+                                            +CEhb2, dcos_theta_dj );
+                            rvec_ScaledAdd( system->atoms[k].f,
+                                            +CEhb2, dcos_theta_dk );
                             //dr terms
-                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
-
-
-                            //TODO
-                            rvec_ScaledAdd( system->atoms[k].f, +CEhb3/r_jk, dvec_jk );
+                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3 / r_jk, dvec_jk );
+                            rvec_ScaledAdd( system->atoms[k].f, +CEhb3 / r_jk, dvec_jk );
                         }
                         else
                         {
-                            /* for pressure coupling, terms that are not related 
-                               to bond order derivatives are added directly into 
+                            /* for pressure coupling, terms that are not related
+                               to bond order derivatives are added directly into
                                pressure vector/tensor */
                             rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
                             rvec_Add( system->atoms[i].f, force );
@@ -717,39 +727,32 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
 
                             ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
                             rvec_Scale( force, +CEhb2, dcos_theta_dk );
-
-
-
-                            //TODO
                             rvec_Add( system->atoms[k].f, force );
-
-
-
                             rvec_iMultiply( ext_press, rel_jk, force );
                             rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
 
                             //dr terms
-                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
+                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3 / r_jk, dvec_jk );
 
-                            rvec_Scale( force, CEhb3/r_jk, dvec_jk );
+                            rvec_Scale( force, CEhb3 / r_jk, dvec_jk );
                             rvec_Add( system->atoms[k].f, force );
                             rvec_iMultiply( ext_press, rel_jk, force );
                             rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
 
                             /* This part is intended for a fully-flexible box */
-                            /* rvec_OuterProduct( temp_rtensor, 
+                            /* rvec_OuterProduct( temp_rtensor,
                                dcos_theta_di, system->atoms[i].x );
                                rtensor_Scale( total_rtensor, -CEhb2, temp_rtensor );
 
                                rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dj,
                                -CEhb3/r_jk, pbond_jk->dvec );
-                               rvec_OuterProduct( temp_rtensor, 
+                               rvec_OuterProduct( temp_rtensor,
                                temp_rvec, system->atoms[j].x );
                                rtensor_Add( total_rtensor, temp_rtensor );
 
                                rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dk,
                                +CEhb3/r_jk, pbond_jk->dvec );
-                               rvec_OuterProduct( temp_rtensor, 
+                               rvec_OuterProduct( temp_rtensor,
                                temp_rvec, system->atoms[k].x );
                                rtensor_Add( total_rtensor, temp_rtensor );
 
@@ -760,38 +763,38 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
                         }
 
 #ifdef TEST_ENERGY
-                        /*fprintf( out_control->ehb, 
+                        /*fprintf( out_control->ehb,
                           "%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n",
-                          dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], 
-                          dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], 
+                          dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2],
+                          dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2],
                           dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]);
                           fprintf( out_control->ehb, "%23.15e%23.15e%23.15e\n",
                           CEhb1, CEhb2, CEhb3 ); */
-                        fprintf( stderr, //out_control->ehb, 
-                                "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                workspace->orig_id[i], 
-                                workspace->orig_id[j], 
-                                workspace->orig_id[k], 
-                                r_jk, theta, bo_ij->BO, e_hb, data->E_HB );
+                        fprintf( stderr, //out_control->ehb,
+                                 "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                 workspace->orig_id[i],
+                                 workspace->orig_id[j],
+                                 workspace->orig_id[k],
+                                 r_jk, theta, bo_ij->BO, e_hb, data->E_HB );
 
 #endif
 #ifdef TEST_FORCES
                         // dbo term
                         Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb );
                         // dcos terms
-                        rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di ); 
+                        rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di );
                         rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj );
                         rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk );
                         // dr terms
-                        rvec_ScaledAdd( workspace->f_hb[j], -CEhb3/r_jk, dvec_jk );
-                        rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk );
+                        rvec_ScaledAdd( workspace->f_hb[j], -CEhb3 / r_jk, dvec_jk );
+                        rvec_ScaledAdd( workspace->f_hb[k], +CEhb3 / r_jk, dvec_jk );
 #endif
                     }
                 }
             }
         }
 
-    /* fprintf( stderr, "hydbonds: ext_press (%23.15e %23.15e %23.15e)\n", 
+    /* fprintf( stderr, "hydbonds: ext_press (%23.15e %23.15e %23.15e)\n",
        data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
 
 #ifdef TEST_FORCES
diff --git a/PuReMD-GPU/src/tool_box.c b/PuReMD-GPU/src/tool_box.c
new file mode 100644
index 0000000000000000000000000000000000000000..1782e71cdac028ddb5c0947ef371cf9259bfad2b
--- /dev/null
+++ b/PuReMD-GPU/src/tool_box.c
@@ -0,0 +1,467 @@
+/*----------------------------------------------------------------------
+  SerialReax - Reax Force Field Simulator
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "tool_box.h"
+
+#include <ctype.h>
+
+
+/************** taken from box.c **************/
+/* Matrix-vector product: x2 = M * x1, where M is the 3x3 matrix box->trans
+ * when flag > 0 and box->trans_inv otherwise.
+ * flag is a char, so "flag > 0" depends on the value the caller passes (and,
+ * for values above 127, on whether plain char is signed on the platform). */
+void Transform( rvec x1, simulation_box *box, char flag, rvec x2 )
+{
+    int row, col;
+    real acc;
+
+    for ( row = 0; row < 3; ++row )
+    {
+        /* sum the whole row in a scalar; x2[row] is written only once */
+        acc = 0.0;
+
+        for ( col = 0; col < 3; ++col )
+        {
+            if ( flag > 0 )
+            {
+                acc += box->trans[row][col] * x1[col];
+            }
+            else
+            {
+                acc += box->trans_inv[row][col] * x1[col];
+            }
+        }
+
+        x2[row] = acc;
+    }
+}
+
+
+/* Run Transform, then divide each x2 component by its box->box_norms entry */
+void Transform_to_UnitBox( rvec x1, simulation_box *box, char flag, rvec x2 )
+{
+    int d;
+    Transform( x1, box, flag, x2 );
+    for ( d = 0; d < 3; ++d )
+        x2[d] /= box->box_norms[d];
+}
+
+
+/* Wrap point p into the periodic box: each coordinate is shifted by whole
+ * multiples of box->box_norms[i] until it lies in [0, box_norms[i]).
+ * Nothing is returned; *p is updated in place. */
+void Fit_to_Periodic_Box( simulation_box *box, rvec *p )
+{
+    int i;
+
+    for ( i = 0; i < 3; ++i )
+    {
+        //TODO: verify box boundary coordinates -- assuming orthogonal box pinned at origin
+        /* handle lower coords */
+        while ( (*p)[i] < 0. )
+            (*p)[i] += box->box_norms[i];
+
+        /* handle higher coords; checked even after the shift above because
+         * a tiny negative coordinate plus box_norms[i] can round to exactly
+         * box_norms[i], which lies outside the half-open interval */
+        while ( (*p)[i] >= box->box_norms[i] )
+            (*p)[i] -= box->box_norms[i];
+
+//        if ( (*p)[i] < box->min[i] )
+//        {
+//            /* handle lower coords */
+//            while ( (*p)[i] < box->min[i] )
+//                (*p)[i] += box->box_norms[i];
+//        }
+//        else if ( (*p)[i] >= box->max[i] )
+//        {
+//            /* handle higher coords */
+//            while ( (*p)[i] >= box->max[i] )
+//                (*p)[i] -= box->box_norms[i];
+//        }
+    }
+}
+
+
+/* determine the touch point, tp, of a box to
+   its neighbor denoted by the relative coordinate rl */
+/*
+inline void Box_Touch_Point( simulation_box *box, ivec rl, rvec tp )
+{
+    int d;
+
+    for ( d = 0; d < 3; ++d )
+        if ( rl[d] == -1 )
+            tp[d] = box->min[d];
+        else if ( rl[d] == 0 )
+            tp[d] = NEG_INF - 1.;
+        else
+            tp[d] = box->max[d];
+}
+*/
+
+
+/* determine whether point p is inside the box */
+/* assumes orthogonal box */
+/*
+inline int is_Inside_Box( simulation_box *box, rvec p )
+{
+    if ( p[0] < box->min[0] || p[0] >= box->max[0] ||
+            p[1] < box->min[1] || p[1] >= box->max[1] ||
+            p[2] < box->min[2] || p[2] >= box->max[2] )
+        return FALSE;
+
+    return TRUE;
+}
+*/
+
+
+/*
+inline int iown_midpoint( simulation_box *box, rvec p1, rvec p2 )
+{
+    rvec midp;
+
+    midp[0] = (p1[0] + p2[0]) / 2;
+    midp[1] = (p1[1] + p2[1]) / 2;
+    midp[2] = (p1[2] + p2[2]) / 2;
+
+    if ( midp[0] < box->min[0] || midp[0] >= box->max[0] ||
+            midp[1] < box->min[1] || midp[1] >= box->max[1] ||
+            midp[2] < box->min[2] || midp[2] >= box->max[2] )
+        return FALSE;
+
+    return TRUE;
+}
+*/
+
+
+/**************** from grid.c ****************/
+/* finds the closest point of grid cell cj to ci.
+   no need to consider periodic boundary conditions as in the serial case
+   because the box of a process is not periodic in itself */
+/*
+inline void GridCell_Closest_Point( grid_cell *gci, grid_cell *gcj,
+        ivec ci, ivec cj, rvec cp )
+{
+    int  d;
+
+    for ( d = 0; d < 3; d++ )
+        if ( cj[d] > ci[d] )
+            cp[d] = gcj->min[d];
+        else if ( cj[d] == ci[d] )
+            cp[d] = NEG_INF - 1.;
+        else
+            cp[d] = gcj->max[d];
+}
+
+
+inline void GridCell_to_Box_Points( grid_cell *gc, ivec rl, rvec cp, rvec fp )
+{
+    int d;
+
+    for ( d = 0; d < 3; ++d )
+        if ( rl[d] == -1 )
+        {
+            cp[d] = gc->min[d];
+            fp[d] = gc->max[d];
+        }
+        else if ( rl[d] == 0 )
+        {
+            cp[d] = fp[d] = NEG_INF - 1.;
+        }
+        else
+        {
+            cp[d] = gc->max[d];
+            fp[d] = gc->min[d];
+        }
+}
+
+
+inline real DistSqr_between_Special_Points( rvec sp1, rvec sp2 )
+{
+    int  i;
+    real d_sqr = 0;
+
+    for ( i = 0; i < 3; ++i )
+    {
+        if ( sp1[i] > NEG_INF && sp2[i] > NEG_INF )
+        {
+            d_sqr += SQR( sp1[i] - sp2[i] );
+        }
+    }
+
+    return d_sqr;
+}
+
+
+inline real DistSqr_to_Special_Point( rvec cp, rvec x )
+{
+    int  i;
+    real d_sqr = 0;
+
+    for ( i = 0; i < 3; ++i )
+    {
+        if ( cp[i] > NEG_INF )
+        {
+            d_sqr += SQR( cp[i] - x[i] );
+        }
+    }
+
+    return d_sqr;
+}
+
+
+inline int Relative_Coord_Encoding( ivec c )
+{
+    return 9 * (c[0] + 1) + 3 * (c[1] + 1) + (c[2] + 1);
+}
+*/
+
+/************** from geo_tools.c *****************/
+/* Store x, y and z as components 0, 1 and 2 of the point that p refers to */
+void Make_Point( real x, real y, real z, rvec* p )
+{
+    (*p)[0] = x;
+    (*p)[1] = y;
+    (*p)[2] = z;
+}
+
+/* Return TRUE when workspace->map_serials[serial] is non-negative; otherwise
+ * report the bad CONECT serial on stderr and exit with INVALID_INPUT. */
+int is_Valid_Serial( static_storage *workspace, int serial )
+{
+    if ( workspace->map_serials[ serial ] >= 0 )
+        return TRUE;
+
+    fprintf( stderr, "CONECT line includes invalid pdb serial number %d.\n", serial );
+    fprintf( stderr, "Please correct the input file.Terminating...\n" );
+    exit( INVALID_INPUT );
+}
+
+/* Range check for an input value: returns SUCCESS when lo <= val <= hi.
+ * Otherwise message, the value and the allowed range are printed to stderr
+ * and the program exits with INVALID_INPUT. */
+int Check_Input_Range( int val, int lo, int hi, char *message )
+{
+    if ( lo <= val && val <= hi )
+        return SUCCESS;
+
+    fprintf( stderr, "%s\nInput %d - Out of range %d-%d. Terminating...\n",
+             message, val, lo, hi );
+    exit( INVALID_INPUT );
+}
+
+/* Trim element in place: skip leading spaces, uppercase the characters up to
+ * the next space or the end of the string, and terminate the result there. */
+void Trim_Spaces( char *element )
+{
+    int i, j;
+
+    for ( i = 0; element[i] == ' '; ++i ); // skip initial space chars
+
+    /* the cast keeps toupper() defined for chars with a negative value */
+    for ( j = i; element[j] != '\0' && element[j] != ' '; ++j )
+        element[j - i] = toupper( (unsigned char) element[j] );
+    element[j - i] = 0; // finalize the string
+}
+
+/************ from system_props.c *************/
+/* Read the time of day into tim and return it as seconds (sec + usec/1e6) */
+real Get_Time( )
+{
+    gettimeofday(&tim, NULL );
+    return ( tim.tv_sec + (tim.tv_usec / 1000000.0) );
+}
+
+/* Seconds elapsed since t_start; the current time is also left in t_end */
+real Get_Timing_Info( real t_start )
+{
+    t_end = Get_Time( );
+
+    return t_end - t_start;
+}
+
+/* Add the time elapsed since *t_start to *timing, then set *t_start to now */
+void Update_Timing_Info( real *t_start, real *timing )
+{
+    t_end = Get_Time( );
+
+    *timing += t_end - *t_start;
+    *t_start = t_end;
+}
+
+/*********** from io_tools.c **************/
+/* Return the index of the first entry in reax_param->sbp whose name equals
+ * s; if none matches, report the unknown type on stderr and exit. */
+int Get_Atom_Type( reax_interaction *reax_param, char *s )
+{
+    int type = 0;
+
+    while ( type < reax_param->num_atom_types )
+    {
+        if ( strcmp( reax_param->sbp[type].name, s ) == 0 )
+            return type;
+        ++type;
+    }
+
+    fprintf( stderr, "Unknown atom type %s. Terminating...\n", s );
+    exit( UNKNOWN_ATOM_TYPE );
+
+    return FAILURE; /* not reached; kept so every path has a return */
+}
+
+/* Return the name stored in the reaxprm.sbp entry selected by atom i's type */
+char *Get_Element( reax_system *system, int i )
+{
+    return system->reaxprm.sbp[ system->atoms[i].type ].name;
+}
+
+/* Return the name field of atom i (a pointer into system->atoms, not a copy) */
+char *Get_Atom_Name( reax_system *system, int i )
+{
+    return system->atoms[i].name;
+}
+
+/* Allocate the tokenizer buffers: *line and *backup (MAX_LINE chars each)
+ * and *tokens (MAX_TOKENS strings of MAX_TOKEN_LEN chars each). Returns
+ * SUCCESS, or FAILURE after freeing and NULLing whatever was allocated. */
+int Allocate_Tokenizer_Space( char **line, char **backup, char ***tokens )
+{
+    int i, ok;
+
+    *line = (char*) malloc( sizeof(char) * MAX_LINE );
+    *backup = (char*) malloc( sizeof(char) * MAX_LINE );
+    *tokens = (char**) malloc( sizeof(char*) * MAX_TOKENS );
+    ok = ( *line != NULL && *backup != NULL && *tokens != NULL );
+
+    for ( i = 0; ok && i < MAX_TOKENS; i++ )
+    {
+        (*tokens)[i] = (char*) malloc( sizeof(char) * MAX_TOKEN_LEN );
+        ok = ( (*tokens)[i] != NULL );
+    }
+    if ( ok )
+        return SUCCESS;
+
+    /* roll back: token slots 0 .. i-1 were assigned; free(NULL) is a no-op */
+    while ( i > 0 )
+        free( (*tokens)[--i] );
+    free( *tokens );
+    free( *backup );
+    free( *line );
+    *line = *backup = NULL;
+    *tokens = NULL;
+    return FAILURE;
+}
+
+/* Split s on "\t \n!=" into (*tok)[0..] and return the number of tokens kept.
+ * The input is cut to MAX_LINE - 1 chars, each token to MAX_TOKEN_LEN - 1
+ * chars (the size Allocate_Tokenizer_Space uses) and the count to MAX_TOKENS. */
+int Tokenize( char* s, char*** tok )
+{
+    char test[MAX_LINE], *word, *sep = "\t \n!=";
+    int count = 0;
+
+    strncpy( test, s, MAX_LINE - 1 );
+    test[MAX_LINE - 1] = '\0';
+    for ( word = strtok(test, sep); word && count < MAX_TOKENS; word = strtok(NULL, sep) )
+    {
+        strncpy( (*tok)[count], word, MAX_TOKEN_LEN - 1 );
+        (*tok)[count][MAX_TOKEN_LEN - 1] = '\0';
+        count++;
+    }
+    return count;
+}
+
+
+/***************** taken from lammps ************************/
+/* safe malloc: returns n freshly malloc'ed bytes. A request with n <= 0 only
+ * prints a warning and yields NULL; a failed malloc prints an error and
+ * exits with INSUFFICIENT_MEMORY. name is used only in those messages. */
+void *smalloc( long n, char *name )
+{
+    void *block;
+
+    if ( n <= 0 )
+    {
+        fprintf( stderr, "WARNING: trying to allocate %ld bytes for array %s. ",
+                 n, name );
+        fprintf( stderr, "returning NULL.\n" );
+        return NULL;
+    }
+
+    block = malloc( n );
+    if ( block != NULL )
+        return block;
+
+    fprintf( stderr, "ERROR: failed to allocate %ld bytes for array %s",
+             n, name );
+    exit( INSUFFICIENT_MEMORY );
+}
+
+/* safe calloc: returns n zeroed elements of size bytes; warns and returns
+ * NULL if n or size is <= 0; exits with INSUFFICIENT_MEMORY if calloc fails */
+void *scalloc( int n, int size, char *name )
+{
+    void *ptr;
+
+    if ( n <= 0 )
+    {
+        fprintf( stderr, "WARNING: trying to allocate %d elements for array %s. ",
+                 n, name );
+        fprintf( stderr, "returning NULL.\n" );
+        return NULL;
+    }
+
+    if ( size <= 0 )
+    {
+        fprintf( stderr, "WARNING: elements size for array %s is %d. ",
+                 name, size );
+        fprintf( stderr, "returning NULL.\n" );
+        return NULL;
+    }
+
+    ptr = calloc( n, size );
+    if ( ptr == NULL )
+    {
+        /* widen before multiplying: n * size can overflow int */
+        fprintf( stderr, "ERROR: failed to allocate %zu bytes for array %s",
+                 (size_t) n * (size_t) size, name );
+        exit( INSUFFICIENT_MEMORY );
+    }
+    return ptr;
+}
+
+
+/* safe free: releases ptr, or only warns (using name) when ptr is NULL. ptr
+ * is passed by value, so the caller's own pointer is NOT reset to NULL here. */
+void sfree( void *ptr, char *name )
+{
+    if ( ptr == NULL )
+    {
+        fprintf( stderr, "WARNING: trying to free the already NULL pointer %s!\n",
+                 name );
+        return;
+    }
+
+    free( ptr );
+}
diff --git a/PuReMD-GPU/src/tool_box.h b/PuReMD-GPU/src/tool_box.h
new file mode 100644
index 0000000000000000000000000000000000000000..db97076149a5f5c8868d02d299a25581d3b5a934
--- /dev/null
+++ b/PuReMD-GPU/src/tool_box.h
@@ -0,0 +1,72 @@
+/*----------------------------------------------------------------------
+  SerialReax - Reax Force Field Simulator
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __TOOL_BOX_H_
+#define __TOOL_BOX_H_
+
+#include "mytypes.h"
+
+/* timing scratch state; NOTE(review): non-extern definitions in a header */
+struct timeval tim;
+real t_end;
+
+/* from box.h */
+void Transform( rvec, simulation_box*, char, rvec );
+void Transform_to_UnitBox( rvec, simulation_box*, char, rvec );
+void Fit_to_Periodic_Box( simulation_box*, rvec* );
+//void Box_Touch_Point( simulation_box*, ivec, rvec );
+//int  is_Inside_Box( simulation_box*, rvec );
+//int  iown_midpoint( simulation_box*, rvec, rvec );
+
+/* from grid.h */
+/*
+void GridCell_Closest_Point( grid_cell*, grid_cell*, ivec, ivec, rvec );
+void GridCell_to_Box_Points( grid_cell*, ivec, rvec, rvec );
+real DistSqr_between_Special_Points( rvec, rvec );
+real DistSqr_to_Special_Point( rvec, rvec );
+int Relative_Coord_Encoding( ivec );
+*/
+
+/* from geo_tools.h */
+void Make_Point( real, real, real, rvec* );
+int is_Valid_Serial( static_storage*, int );
+int Check_Input_Range( int, int, int, char* );
+void Trim_Spaces( char* );
+
+/* from system_props.h */
+real Get_Time( );
+real Get_Timing_Info( real );
+void Update_Timing_Info( real*, real* );
+
+/* from io_tools.h */
+int Get_Atom_Type( reax_interaction*, char* );
+char *Get_Element( reax_system*, int );
+char *Get_Atom_Name( reax_system*, int );
+int Allocate_Tokenizer_Space( char**, char**, char*** );
+int Tokenize( char*, char*** );
+
+/* from lammps */
+void *smalloc( long, char* );
+void *scalloc( int, int, char* );
+void sfree( void*, char* );
+
+
+#endif
diff --git a/PuReMD-GPU/src/traj.c b/PuReMD-GPU/src/traj.c
index 2844c370ee79702ed0c75d090afe545149aae185..f8852d5d2cc2e425b67f6ffa6871b52c2f755046 100644
--- a/PuReMD-GPU/src/traj.c
+++ b/PuReMD-GPU/src/traj.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -30,7 +31,8 @@
 /************************************************/
 /*      CUSTOM FORMAT ROUTINES                  */
 /************************************************/
-int Write_Custom_Header(reax_system *system, control_params *control, 
+
+int Write_Custom_Header(reax_system *system, control_params *control,
         static_storage *workspace, output_controls *out_control)
 {
     int i, header_len, control_block_len, frame_format_len;
@@ -40,119 +42,129 @@ int Write_Custom_Header(reax_system *system, control_params *control,
     char atom_format[100], bond_format[100], angle_format[100];
 
     sprintf( control_block, CONTROL_BLOCK,
-            system->N,
-            control->restart,
-            control->restart_from,
-            control->random_vel,
-            out_control->restart_freq,
-            control->ensemble,
-            control->nsteps,
-            control->dt,
-            control->reposition_atoms,
-            control->restrict_bonds,
-            control->tabulate,
-            control->nbr_cut,
-            control->r_cut,
-            control->bg_cut,
-            control->bo_cut,
-            control->thb_cut,
-            control->hb_cut,
-            control->q_err,
-            control->T_init,
-            control->T_final,
-            control->Tau_T,
-            control->T_mode,
-            control->T_rate,
-            control->T_freq,
-            control->P[0], control->P[1], control->P[2], 
-            control->Tau_P[0], control->Tau_P[1], control->Tau_P[2],
-            control->compressibility,
-            control->press_mode,
-            control->remove_CoM_vel,
-            out_control->write_steps,
-            out_control->traj_compress,
-            out_control->traj_format,
-            out_control->atom_format,
-            out_control->bond_info,
-            out_control->angle_info,
-            out_control->energy_update_freq,
-            control->molec_anal,
-            control->freq_molec_anal );
-
-            control_block_len = strlen( control_block );
-
-
-            sprintf( frame_format, "Frame Format: %d\n%s\n%s\n", 
-                    NUM_FRAME_GLOBALS, FRAME_GLOBALS_FORMAT, FRAME_GLOBAL_NAMES );
-
-            atom_format[0] = OPT_NOATOM;
-            switch( out_control->atom_format )
-            {
-                case OPT_ATOM_BASIC: sprintf( atom_format, "Atom_Basic: %s", ATOM_BASIC );
-                             break;
-                case OPT_ATOM_wF: sprintf( atom_format, "Atom_wF: %s", ATOM_wF );
-                          break;
-                case OPT_ATOM_wV: sprintf( atom_format, "Atom_wV: %s", ATOM_wV );
-                          break;
-                case OPT_ATOM_FULL: sprintf( atom_format, "Atom_Full: %s", ATOM_FULL );
-                            break;
-            }
-            strcat( frame_format, atom_format );
-
-            bond_format[0] = OPT_NOBOND;
-            if( out_control->bond_info == OPT_BOND_BASIC )
-                sprintf( bond_format, "Bond_Line: %s", BOND_BASIC );
-            else if( out_control->bond_info == OPT_BOND_FULL )
-                sprintf( bond_format, "Bond_Line_Full: %s", BOND_FULL );
-            strcat( frame_format, bond_format );
+             system->N,
+             control->restart,
+             control->restart_from,
+             control->random_vel,
+             out_control->restart_freq,
+             control->ensemble,
+             control->nsteps,
+             control->dt,
+             control->reposition_atoms,
+             control->restrict_bonds,
+             control->tabulate,
+             control->nbr_cut,
+             control->r_cut,
+             control->bg_cut,
+             control->bo_cut,
+             control->thb_cut,
+             control->hb_cut,
+             control->qeq_solver_q_err,
+             control->T_init,
+             control->T_final,
+             control->Tau_T,
+             control->T_mode,
+             control->T_rate,
+             control->T_freq,
+             control->P[0], control->P[1], control->P[2],
+             control->Tau_P[0], control->Tau_P[1], control->Tau_P[2],
+             control->compressibility,
+             control->press_mode,
+             control->remove_CoM_vel,
+             out_control->write_steps,
+             out_control->traj_compress,
+             out_control->traj_format,
+             out_control->atom_format,
+             out_control->bond_info,
+             out_control->angle_info,
+             out_control->energy_update_freq,
+             control->molec_anal,
+             control->freq_molec_anal );
+
+    control_block_len = strlen( control_block );
+
+    sprintf( frame_format, "Frame Format: %d\n%s\n%s\n",
+             NUM_FRAME_GLOBALS, FRAME_GLOBALS_FORMAT, FRAME_GLOBAL_NAMES );
+
+    atom_format[0] = OPT_NOATOM;
+    switch ( out_control->atom_format )
+    {
+    case OPT_ATOM_BASIC:
+        sprintf( atom_format, "Atom_Basic: %s", ATOM_BASIC );
+        break;
+    case OPT_ATOM_wF:
+        sprintf( atom_format, "Atom_wF: %s", ATOM_wF );
+        break;
+    case OPT_ATOM_wV:
+        sprintf( atom_format, "Atom_wV: %s", ATOM_wV );
+        break;
+    case OPT_ATOM_FULL:
+        sprintf( atom_format, "Atom_Full: %s", ATOM_FULL );
+        break;
+    }
+    strcat( frame_format, atom_format );
 
-            angle_format[0] = OPT_NOANGLE;
-            if( out_control->angle_info == OPT_ANGLE_BASIC )
-                sprintf( angle_format, "Angle_Line: %s", ANGLE_BASIC );
-            strcat( frame_format, angle_format );
+    bond_format[0] = OPT_NOBOND;
+    if ( out_control->bond_info == OPT_BOND_BASIC )
+    {
+        sprintf( bond_format, "Bond_Line: %s", BOND_BASIC );
+    }
+    else if ( out_control->bond_info == OPT_BOND_FULL )
+    {
+        sprintf( bond_format, "Bond_Line_Full: %s", BOND_FULL );
+    }
+    strcat( frame_format, bond_format );
 
-            frame_format_len = strlen( frame_format );
+    angle_format[0] = OPT_NOANGLE;
+    if ( out_control->angle_info == OPT_ANGLE_BASIC )
+    {
+        sprintf( angle_format, "Angle_Line: %s", ANGLE_BASIC );
+    }
+    strcat( frame_format, angle_format );
 
+    frame_format_len = strlen( frame_format );
 
-            header_len = HEADER_INIT_LEN + (control_block_len + SIZE_INFO_LEN2)+ 
-                (frame_format_len + SIZE_INFO_LEN2) + 
-                (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2);
+    header_len = HEADER_INIT_LEN + (control_block_len + SIZE_INFO_LEN2) +
+                 (frame_format_len + SIZE_INFO_LEN2) +
+                 (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2);
 
-            out_control->write( out_control->trj, HEADER_INIT, 
-                    header_len, HEADER_INIT_LEN, out_control->traj_title );
+    out_control->write( out_control->trj, HEADER_INIT,
+                        header_len, HEADER_INIT_LEN, out_control->traj_title );
 
-            out_control->write( out_control->trj, SIZE_INFO_LINE2,
-                    control_block_len + (frame_format_len + SIZE_INFO_LEN2) + 
-                    (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2), 
-                    control_block_len );
-            out_control->write( out_control->trj, "%s", control_block );
+    out_control->write( out_control->trj, SIZE_INFO_LINE2,
+                        control_block_len + (frame_format_len + SIZE_INFO_LEN2) +
+                        (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2),
+                        control_block_len );
+    out_control->write( out_control->trj, "%s", control_block );
 
-            out_control->write( out_control->trj, SIZE_INFO_LINE2, 
-                    frame_format_len + 
-                    (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2), 
-                    frame_format_len );
-            out_control->write( out_control->trj, "%s", frame_format );
+    out_control->write( out_control->trj, SIZE_INFO_LINE2,
+                        frame_format_len +
+                        (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2),
+                        frame_format_len );
+    out_control->write( out_control->trj, "%s", frame_format );
 
-            out_control->write( out_control->trj, SIZE_INFO_LINE2, 
-                    ATOM_MAPPING_LEN * system->N, 
-                    ATOM_MAPPING_LEN * system->N );
+    out_control->write( out_control->trj, SIZE_INFO_LINE2,
+                        ATOM_MAPPING_LEN * system->N,
+                        ATOM_MAPPING_LEN * system->N );
 
-            for( i = 0; i < system->N; ++i )
-                out_control->write( out_control->trj, ATOM_MAPPING,  
-                        workspace->orig_id[i], 
-                        system->atoms[i].type, 
-                        system->atoms[i].name, 
-                        system->reaxprm.sbp[ system->atoms[i].type ].mass ); 
+    for ( i = 0; i < system->N; ++i )
+    {
+        out_control->write( out_control->trj, ATOM_MAPPING,
+                            workspace->orig_id[i],
+                            system->atoms[i].type,
+                            system->atoms[i].name,
+                            system->reaxprm.sbp[ system->atoms[i].type ].mass );
+    }
 
-            fflush( out_control->trj );
+    fflush( out_control->trj );
 
-            return 0;
+    return 0;
 }
 
 
-int Append_Custom_Frame( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+int Append_Custom_Frame( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
     int i, j, pi, pk, pk_j;
     int write_atoms, write_bonds, write_angles;
@@ -166,278 +178,326 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
 
 
     /* IMPORTANT: This whole part will go to init_trj after finalized! */
-    switch( out_control->atom_format )
+    switch ( out_control->atom_format )
     {
-        case OPT_ATOM_BASIC: 
-            atom_line_len = ATOM_BASIC_LEN;
-            write_atoms = 1;
-            break;
-        case OPT_ATOM_wF: 
-            atom_line_len = ATOM_wF_LEN; 
-            write_atoms = 1;
-            break;
-        case OPT_ATOM_wV: 
-            atom_line_len = ATOM_wV_LEN; 
-            write_atoms = 1;
-            break;
-        case OPT_ATOM_FULL: 
-            atom_line_len = ATOM_FULL_LEN; 
-            write_atoms = 1;
-            break;
-        default: 
-            atom_line_len = 0;
-            write_atoms = 0;
+    case OPT_ATOM_BASIC:
+        atom_line_len = ATOM_BASIC_LEN;
+        write_atoms = 1;
+        break;
+    case OPT_ATOM_wF:
+        atom_line_len = ATOM_wF_LEN;
+        write_atoms = 1;
+        break;
+    case OPT_ATOM_wV:
+        atom_line_len = ATOM_wV_LEN;
+        write_atoms = 1;
+        break;
+    case OPT_ATOM_FULL:
+        atom_line_len = ATOM_FULL_LEN;
+        write_atoms = 1;
+        break;
+    default:
+        atom_line_len = 0;
+        write_atoms = 0;
     }
 
-
     /* bond preparations */
     bond_line_len = write_bonds = 0;
-    if( out_control->bond_info == OPT_BOND_BASIC )
+    if ( out_control->bond_info == OPT_BOND_BASIC )
     {
         bond_line_len = BOND_BASIC_LEN;
         write_bonds = 1;
     }
-    else if( out_control->bond_info == OPT_BOND_FULL )
+    else if ( out_control->bond_info == OPT_BOND_FULL )
     {
         bond_line_len = BOND_FULL_LEN;
         write_bonds = 1;
     }
 
 #ifdef __DEBUG_CUDA__
-    fprintf (stderr, "Append Custom Frame -- write_bonds --> %d \n", write_bonds);
+    fprintf( stderr, "Append Custom Frame -- write_bonds --> %d \n", write_bonds );
 #endif
 
     num_bonds = 0;
-    if( write_bonds )
+    if ( write_bonds )
     {
-
 #ifdef __PRINT_CPU_RESULTS__
-        //fprintf (stderr, "Synching bonds from device for printing ....\n");
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr, "Synching bonds from device for printing ....\n" );
+#endif
         Sync_Host_Device_List( bonds, (dev_lists + BONDS), TYP_BOND );
 #endif
 
-        for( i = 0; i < system->N; ++i )
-            for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
-                if( i < bonds->select.bond_list[j].nbr && 
+        for ( i = 0; i < system->N; ++i )
+        {
+            for ( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
+            {
+                if ( i < bonds->select.bond_list[j].nbr &&
                         bonds->select.bond_list[j].bo_data.BO >= control->bg_cut )
+                {
                     ++num_bonds;
+                }
+            }
+        }
     }
 
-
     /* angle preparations */
-    if( out_control->angle_info == OPT_ANGLE_BASIC )
+    if ( out_control->angle_info == OPT_ANGLE_BASIC )
     {
         angle_line_len = ANGLE_BASIC_LEN;
         write_angles = 1;
     }
-    else 
+    else
     {
         angle_line_len = 0;
         write_angles = 0;
     }
 
 #ifdef __DEBUG_CUDA__
-    fprintf (stderr, "Append Custom Frame -- write-angles --> %d \n", write_angles );
+    fprintf( stderr, "Append Custom Frame -- write-angles --> %d \n", write_angles );
 #endif
 
     num_thb_intrs = 0;
-    if( write_angles ) {
-
+    if ( write_angles )
+    {
 #ifdef __PRINT_CPU_RESULTS__
-        //fprintf (stderr, "Synching three bodies from deivce for printing ... \n");
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr, "Synching three bodies from device for printing ... \n" );
+#endif
         Sync_Host_Device_List( thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY );
-        if ( !write_bonds) {
-            //fprintf (stderr, "Synching bonds for three bodies from device for printing ... \n");
+        if ( !write_bonds )
+        {
+#ifdef __DEBUG_CUDA__
+            fprintf( stderr, "Synching bonds for three bodies from device for printing ... \n" );
+#endif 
             Sync_Host_Device_List( bonds, (dev_lists + BONDS), TYP_BOND );
         }
 #endif 
 
-        for( j = 0; j < system->N; ++j )
-            for( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi )
-                if( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut ) 
+        for ( j = 0; j < system->N; ++j )
+        {
+            for ( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi )
+            {
+                if ( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut )
+                {
                     // physical j&i bond
-                    for( pk = Start_Index( pi, thb_intrs ); 
+                    for ( pk = Start_Index( pi, thb_intrs );
                             pk < End_Index( pi, thb_intrs ); ++pk )
-                        if( bonds->select.bond_list[pi].nbr < 
-                                thb_intrs->select.three_body_list[pk].thb ) {
+                    {
+                        if ( bonds->select.bond_list[pi].nbr <
+                                thb_intrs->select.three_body_list[pk].thb )
+                        {
                             // get k's pointer on j's bond list
                             pk_j = thb_intrs->select.three_body_list[pk].pthb;
 
-                            if( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut ) 
+                            if ( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut )
                                 // physical j&k bond
                                 ++num_thb_intrs;
                         }
+                    }
+                }
+            }
+        }
     }
 
 
-
     /* get correct pressure */
-    if( control->ensemble == NPT || control->ensemble == sNPT )
+    if ( control->ensemble == NPT || control->ensemble == sNPT )
+    {
         P = data->flex_bar.P_scalar;
-    else  if( control->ensemble == iNPT )
+    }
+    else  if ( control->ensemble == iNPT )
+    {
         P = data->iso_bar.P;
-    else P = 0;
-
+    }
+    else
+    {
+        P = 0;
+    }
 
     /* calculate total frame length*/
     sprintf( buffer, FRAME_GLOBALS,
-            data->step, data->time, 
-            data->E_Tot, data->E_Pot, E_CONV * data->E_Kin, data->therm.T,
-            P, system->box.volume,
-            system->box.box_norms[0], 
-            system->box.box_norms[1], 
-            system->box.box_norms[2],
-            90.0, 90.0, 90.0, // IMPORTANT: need to rewrite for flexible boxes!
-            data->E_BE,
-            data->E_Ov,  data->E_Un,  data->E_Lp,
-            data->E_Ang, data->E_Pen, data->E_Coa, data->E_HB,
-            data->E_Tor, data->E_Con, 
-            data->E_vdW, data->E_Ele, data->E_Pol );
+             data->step, data->time,
+             data->E_Tot, data->E_Pot, E_CONV * data->E_Kin, data->therm.T,
+             P, system->box.volume,
+             system->box.box_norms[0],
+             system->box.box_norms[1],
+             system->box.box_norms[2],
+             90.0, 90.0, 90.0, // IMPORTANT: need to rewrite for flexible boxes!
+             data->E_BE,
+             data->E_Ov,  data->E_Un,  data->E_Lp,
+             data->E_Ang, data->E_Pen, data->E_Coa, data->E_HB,
+             data->E_Tor, data->E_Con,
+             data->E_vdW, data->E_Ele, data->E_Pol );
     frame_globals_len = strlen( buffer );
 
-    frame_len = frame_globals_len + 
-        write_atoms  * SIZE_INFO_LEN3 + system->N * atom_line_len +
-        write_bonds  * SIZE_INFO_LEN3 + num_bonds * bond_line_len +
-        write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
+    frame_len = frame_globals_len +
+                write_atoms  * SIZE_INFO_LEN3 + system->N * atom_line_len +
+                write_bonds  * SIZE_INFO_LEN3 + num_bonds * bond_line_len +
+                write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
 
 
     /* write size info & frame globals */
-    out_control->write( out_control->trj, SIZE_INFO_LINE2, 
-            frame_len, frame_globals_len );
+    out_control->write( out_control->trj, SIZE_INFO_LINE2,
+                        frame_len, frame_globals_len );
     out_control->write( out_control->trj, "%s", buffer );
 
 
-    /* write size info & atom lines */  
-    if( write_atoms ) 
+    /* write size info & atom lines */
+    if ( write_atoms )
     {
         rest_of_frame_len = system->N * atom_line_len +
-            write_bonds  * SIZE_INFO_LEN3 + num_bonds * bond_line_len +
-            write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
+                            write_bonds  * SIZE_INFO_LEN3 + num_bonds * bond_line_len +
+                            write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
 
-        out_control->write( out_control->trj, SIZE_INFO_LINE3, 
-                rest_of_frame_len, system->N * atom_line_len, 
-                system->N );
+        out_control->write( out_control->trj, SIZE_INFO_LINE3,
+                            rest_of_frame_len, system->N * atom_line_len,
+                            system->N );
     }
 
-    switch( out_control->atom_format )
+    switch ( out_control->atom_format )
     {
-        case 4: 
-            for( i = 0; i < system->N; ++i )
-                out_control->write( out_control->trj, ATOM_BASIC, 
-                        workspace->orig_id[i], 
-                        system->atoms[i].x[0], 
-                        system->atoms[i].x[1], 
-                        system->atoms[i].x[2],
-                        system->atoms[i].q );
-            break;
-        case 5:
-            for( i = 0; i < system->N; ++i )
-                out_control->write( out_control->trj, ATOM_wF, 
-                        workspace->orig_id[i],
-                        system->atoms[i].x[0], 
-                        system->atoms[i].x[1], 
-                        system->atoms[i].x[2],
-                        system->atoms[i].f[0], 
-                        system->atoms[i].f[1], 
-                        system->atoms[i].f[2],
-                        system->atoms[i].q );
-            break;
-        case 6: 
-            for( i = 0; i < system->N; ++i )
-                out_control->write( out_control->trj, ATOM_wV, 
-                        workspace->orig_id[i], 
-                        system->atoms[i].x[0], 
-                        system->atoms[i].x[1], 
-                        system->atoms[i].x[2],
-                        system->atoms[i].v[0], 
-                        system->atoms[i].v[1], 
-                        system->atoms[i].v[2],
-                        system->atoms[i].q );
-            break;
-        case 7: 
-            for( i = 0; i < system->N; ++i )
-                out_control->write( out_control->trj, ATOM_FULL, 
-                        workspace->orig_id[i], 
-                        system->atoms[i].x[0], 
-                        system->atoms[i].x[1], 
-                        system->atoms[i].x[2],
-                        system->atoms[i].v[0], 
-                        system->atoms[i].v[1], 
-                        system->atoms[i].v[2],
-                        system->atoms[i].f[0], 
-                        system->atoms[i].f[1], 
-                        system->atoms[i].f[2],
-                        system->atoms[i].q );
-            break;
+    case OPT_ATOM_BASIC: /* per-atom fields: orig_id, x[0..2], q */
+        for ( i = 0; i < system->N; ++i )
+            out_control->write( out_control->trj, ATOM_BASIC,
+                                workspace->orig_id[i],
+                                system->atoms[i].x[0],
+                                system->atoms[i].x[1],
+                                system->atoms[i].x[2],
+                                system->atoms[i].q );
+        break;
+    case OPT_ATOM_wF: /* per-atom fields: orig_id, x[0..2], f[0..2], q */
+        for ( i = 0; i < system->N; ++i )
+            out_control->write( out_control->trj, ATOM_wF,
+                                workspace->orig_id[i],
+                                system->atoms[i].x[0],
+                                system->atoms[i].x[1],
+                                system->atoms[i].x[2],
+                                system->atoms[i].f[0],
+                                system->atoms[i].f[1],
+                                system->atoms[i].f[2],
+                                system->atoms[i].q );
+        break;
+    case OPT_ATOM_wV: /* per-atom fields: orig_id, x[0..2], v[0..2], q */
+        for ( i = 0; i < system->N; ++i )
+            out_control->write( out_control->trj, ATOM_wV,
+                                workspace->orig_id[i],
+                                system->atoms[i].x[0],
+                                system->atoms[i].x[1],
+                                system->atoms[i].x[2],
+                                system->atoms[i].v[0],
+                                system->atoms[i].v[1],
+                                system->atoms[i].v[2],
+                                system->atoms[i].q );
+        break;
+    case OPT_ATOM_FULL: /* per-atom fields: orig_id, x[0..2], v[0..2], f[0..2], q */
+        for ( i = 0; i < system->N; ++i )
+            out_control->write( out_control->trj, ATOM_FULL,
+                                workspace->orig_id[i],
+                                system->atoms[i].x[0],
+                                system->atoms[i].x[1],
+                                system->atoms[i].x[2],
+                                system->atoms[i].v[0],
+                                system->atoms[i].v[1],
+                                system->atoms[i].v[2],
+                                system->atoms[i].f[0],
+                                system->atoms[i].f[1],
+                                system->atoms[i].f[2],
+                                system->atoms[i].q );
+        break;
     }
     fflush( out_control->trj );
 
 
     /* write size info & bond lines */
-    if( write_bonds )
+    if ( write_bonds )
     {
         rest_of_frame_len = num_bonds * bond_line_len +
-            write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
+                            write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
 
-        out_control->write( out_control->trj, SIZE_INFO_LINE3, 
-                rest_of_frame_len, num_bonds * bond_line_len, 
-                num_bonds );
+        out_control->write( out_control->trj, SIZE_INFO_LINE3,
+                            rest_of_frame_len, num_bonds * bond_line_len,
+                            num_bonds );
     }
 
-    if( out_control->bond_info == 1 ) {
-        for( i = 0; i < system->N; ++i )
-            for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
-                if( i < bonds->select.bond_list[j].nbr && 
-                        bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) {
+    if ( out_control->bond_info == OPT_BOND_BASIC )
+    {
+        for ( i = 0; i < system->N; ++i )
+        {
+            for ( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
+            {
+                if ( i < bonds->select.bond_list[j].nbr &&
+                        bonds->select.bond_list[j].bo_data.BO >= control->bg_cut )
+                {
                     bo_ij = &( bonds->select.bond_list[j] );
-                    out_control->write( out_control->trj, BOND_BASIC, 
-                            workspace->orig_id[i], 
-                            workspace->orig_id[bo_ij->nbr], 
-                            bo_ij->d, bo_ij->bo_data.BO );
+                    out_control->write( out_control->trj, BOND_BASIC,
+                                        workspace->orig_id[i],
+                                        workspace->orig_id[bo_ij->nbr],
+                                        bo_ij->d, bo_ij->bo_data.BO );
                 }
+            }
+        }
     }
-    else if( out_control->bond_info == 2 ) {
-        for( i = 0; i < system->N; ++i )
-            for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
-                if( i < bonds->select.bond_list[j].nbr && 
-                        bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) {
+    else if ( out_control->bond_info == OPT_BOND_FULL )
+    {
+        for ( i = 0; i < system->N; ++i )
+        {
+            for ( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
+            {
+                if ( i < bonds->select.bond_list[j].nbr &&
+                        bonds->select.bond_list[j].bo_data.BO >= control->bg_cut )
+                {
                     bo_ij = &( bonds->select.bond_list[j] );
-                    out_control->write( out_control->trj, BOND_FULL, 
-                            workspace->orig_id[i], 
-                            workspace->orig_id[bo_ij->nbr], 
-                            bo_ij->d, bo_ij->bo_data.BO, bo_ij->bo_data.BO_s, 
-                            bo_ij->bo_data.BO_pi, bo_ij->bo_data.BO_pi2 );
+                    out_control->write( out_control->trj, BOND_FULL,
+                                        workspace->orig_id[i],
+                                        workspace->orig_id[bo_ij->nbr],
+                                        bo_ij->d, bo_ij->bo_data.BO, bo_ij->bo_data.BO_s,
+                                        bo_ij->bo_data.BO_pi, bo_ij->bo_data.BO_pi2 );
                 }
+            }
+        }
     }
 
     fflush( out_control->trj );
 
 
     /* write size info & angle lines */
-    if( out_control->angle_info ) {
+    if ( write_angles ) /* same flag that sized frame_len and counted num_thb_intrs */
+    {
         out_control->write( out_control->trj, SIZE_INFO_LINE3,
-                num_thb_intrs * angle_line_len, 
-                num_thb_intrs * angle_line_len, num_thb_intrs );
+                            num_thb_intrs * angle_line_len,
+                            num_thb_intrs * angle_line_len, num_thb_intrs );
 
-        for( j = 0; j < system->N; ++j )
-            for( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi )
-                if( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut ) 
+        for ( j = 0; j < system->N; ++j )
+        {
+            for ( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi )
+            {
+                if ( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut )
+                {
                     // physical j&i bond
-                    for( pk = Start_Index( pi, thb_intrs ); 
+                    for ( pk = Start_Index( pi, thb_intrs );
                             pk < End_Index( pi, thb_intrs ); ++pk )
-                        if( bonds->select.bond_list[pi].nbr < 
-                                thb_intrs->select.three_body_list[pk].thb ) {
-                            pk_j = thb_intrs->select.three_body_list[pk].pthb; 
+                    {
+                        if ( bonds->select.bond_list[pi].nbr <
+                                thb_intrs->select.three_body_list[pk].thb )
+                        {
+                            pk_j = thb_intrs->select.three_body_list[pk].pthb;
                             // get k's pointer on j's bond list
 
-                            if( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut ) 
+                            if ( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut )
+                            {
                                 // physical j&k bond
                                 out_control->write( out_control->trj, ANGLE_BASIC,
-                                        workspace->orig_id[bonds->select.bond_list[pi].nbr], 
-                                        workspace->orig_id[j], 
-                                        workspace->orig_id[thb_intrs->select.three_body_list[pk].thb], 
-                                        RAD2DEG(thb_intrs->select.three_body_list[pk].theta) );
+                                                    workspace->orig_id[bonds->select.bond_list[pi].nbr],
+                                                    workspace->orig_id[j],
+                                                    workspace->orig_id[thb_intrs->select.three_body_list[pk].thb],
+                                                    RAD2DEG(thb_intrs->select.three_body_list[pk].theta) );
+                            }
                         }
+                    }
+                }
+            }
+        }
     }
 
     fflush( out_control->trj );
@@ -445,45 +505,47 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
     return 0;
 }
 
-/*
-   void Read_Traj( output_controls *out_control, char *traj_name )
-   {
-   int skip_all, skip_part, n;
-   char size_buffer[50];
-// char read_buffer[2048];
 
-out_control->trj = (FILE *)gzopen( traj_name, "r" );
+void Read_Traj( output_controls *out_control, char *traj_name )
+{
+    int skip_all, skip_part, n;
+    char size_buffer[50];
 
-fprintf( stderr, "file opened!\n" );
+    out_control->trj = gzopen( traj_name, "r" );
 
-while( !gzeof( out_control->trj ) )
-{
-if( gzgets( out_control->trj, size_buffer, 50 ) == Z_NULL )
-break;
+    fprintf( stderr, "file opened!\n" );
+
+    while ( !gzeof( out_control->trj ) )
+    {
+        if ( gzgets( out_control->trj, size_buffer, 50 ) == Z_NULL )
+        {
+            break;
+        }
 
-fprintf( stderr, "read line\n" );
+        fprintf( stderr, "read line\n" );
 
-if( strlen( size_buffer ) >= SIZE_INFO_LEN3 )
-sscanf( size_buffer, "%d %d %d", &skip_all, &skip_part, &n );
-else
-sscanf( size_buffer, "%d %d", &skip_all, &skip_part );
+        if ( strlen( size_buffer ) >= SIZE_INFO_LEN3 )
+        {
+            sscanf( size_buffer, "%d %d %d", &skip_all, &skip_part, &n );
+        }
+        else
+        {
+            sscanf( size_buffer, "%d %d", &skip_all, &skip_part );
+        }
 
-fprintf( stderr, "%d %d\n", skip_all, skip_part );
+        fprintf( stderr, "%d %d\n", skip_all, skip_part );
 
-gzseek( out_control->trj, skip_part, SEEK_CUR );
-}
+        gzseek( out_control->trj, skip_part, SEEK_CUR );
+    }
 
-gzclose( out_control->trj );
+    gzclose( out_control->trj );
 }
- */
-
 
 
 /********************************************************/
 /************      XYZ FORMAT ROUTINES    ***************/
 /********************************************************/
-
-int Write_xyz_Header( reax_system *system, control_params *control, 
+int Write_xyz_Header( reax_system *system, control_params *control,
         static_storage* workspace, output_controls *out_control )
 {
     fflush( out_control->trj );
@@ -492,25 +554,27 @@ int Write_xyz_Header( reax_system *system, control_params *control,
 }
 
 
-int Append_xyz_Frame( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+int Append_xyz_Frame( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
     int i;
 
     out_control->write( out_control->trj, "%d\n", system->N );
 
     out_control->write( out_control->trj, "%d\t%8.3f\t%8.3f\t%8.3f\t%8.3f\n",
-            data->step,
-            data->E_Tot, data->E_Pot, 
-            E_CONV*data->E_Kin, data->therm.T );
+                        data->step,
+                        data->E_Tot, data->E_Pot,
+                        E_CONV * data->E_Kin, data->therm.T );
 
-    for( i = 0; i < system->N; ++i )
+    for ( i = 0; i < system->N; ++i )
+    {
         out_control->write( out_control->trj, "%3s %10.5f %10.5f %10.5f\n",
-                system->reaxprm.sbp[ system->atoms[i].type ].name,
-                system->atoms[i].x[0], 
-                system->atoms[i].x[1], 
-                system->atoms[i].x[2] );
+                            system->reaxprm.sbp[ system->atoms[i].type ].name,
+                            system->atoms[i].x[0],
+                            system->atoms[i].x[1],
+                            system->atoms[i].x[2] );
+    }
 
     fflush( out_control->trj );
 
diff --git a/PuReMD-GPU/src/traj.h b/PuReMD-GPU/src/traj.h
index 35d92602eee7c2d0b5ee83889623df2cb2106c71..200f67711e60285f67f32ecb238f81e95d3f9b0d 100644
--- a/PuReMD-GPU/src/traj.h
+++ b/PuReMD-GPU/src/traj.h
@@ -25,6 +25,7 @@
 
 #include <zlib.h>
 
+
 #define BLOCK_MARK "REAX_BLOCK_MARK "
 #define BLOCK_MARK_LEN 16
 
@@ -74,11 +75,27 @@
 #define SIZE_INFO_LEN3 33
 
 
-enum ATOM_LINE_OPTS {OPT_NOATOM = 0, OPT_ATOM_BASIC = 4, OPT_ATOM_wF = 5,
-                     OPT_ATOM_wV = 6, OPT_ATOM_FULL = 7
-                    };
-enum BOND_LINE_OPTS {OPT_NOBOND, OPT_BOND_BASIC, OPT_BOND_FULL};
-enum ANGLE_LINE_OPTS {OPT_NOANGLE, OPT_ANGLE_BASIC};
+enum ATOM_LINE_OPTS
+{
+    OPT_NOATOM = 0,
+    OPT_ATOM_BASIC = 4,
+    OPT_ATOM_wF = 5,
+    OPT_ATOM_wV = 6,
+    OPT_ATOM_FULL = 7,
+};
+
+enum BOND_LINE_OPTS
+{
+    OPT_NOBOND = 0,
+    OPT_BOND_BASIC = 1,
+    OPT_BOND_FULL = 2,
+};
+
+enum ANGLE_LINE_OPTS
+{
+    OPT_NOANGLE = 0,
+    OPT_ANGLE_BASIC = 1,
+};
 
 
 struct
@@ -143,10 +160,8 @@ int Skip_Next_Block( gzFile, int*);
   No. of torsion entries (int)
   Torsion info lines as per torsion format.
 */
-int Write_Custom_Header( reax_system*, control_params*,
-                         static_storage*, output_controls* );
-int Write_xyz_Header   ( reax_system*, control_params*,
-                         static_storage*, output_controls* );
+int Write_Custom_Header( reax_system*, control_params*, static_storage*, output_controls* );
+int Write_xyz_Header   ( reax_system*, control_params*, static_storage*, output_controls* );
 
 /*
   Write_Traj_Header( gzfile file,
@@ -168,7 +183,7 @@ char Write_Traj_Header( FILE*, int, char**, char**, control_params* );
           char** various flags);
 */
 int Push_Traj_Frame( /*gzfile*/ FILE*, reax_system*, control_params*,
-                                simulation_data*, static_storage*, list**, char** );
+        simulation_data*, static_storage*, list**, char** );
 
 /*
   Append_Traj_Frame( gzfile file,
@@ -180,11 +195,11 @@ int Push_Traj_Frame( /*gzfile*/ FILE*, reax_system*, control_params*,
                 char** various flags);
 */
 int Append_Custom_Frame( reax_system*, control_params*, simulation_data*,
-                         static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 int Append_xyz_Frame   ( reax_system*, control_params*, simulation_data*,
-                         static_storage*, list**, output_controls* );
-
+        static_storage*, list**, output_controls* );
 
 void Read_Traj( output_controls*, char * );
 
+
 #endif
diff --git a/PuReMD-GPU/src/two_body_interactions.c b/PuReMD-GPU/src/two_body_interactions.c
index 2e7a6daf9039ea26c22b2fcfda5913e46255ad75..d5b53a05e607d043ba5f59f96f51a1065438c1d4 100644
--- a/PuReMD-GPU/src/two_body_interactions.c
+++ b/PuReMD-GPU/src/two_body_interactions.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -21,14 +22,14 @@
 #include "two_body_interactions.h"
 
 #include "bond_orders.h"
+#include "index_utils.h"
 #include "list.h"
 #include "lookup.h"
 #include "vector.h"
-#include "index_utils.h"
 
 
-void Bond_Energy( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
+void Bond_Energy( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace,
         list **lists, output_controls *out_control )
 {
     int i, j, pj;
@@ -50,12 +51,14 @@ void Bond_Energy( reax_system *system, control_params *control,
     gp10 = system->reaxprm.gp.l[10];
     gp37 = (int) system->reaxprm.gp.l[37];
 
-    for( i=0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         start_i = Start_Index(i, bonds);
         end_i = End_Index(i, bonds);
         //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i );
-        for( pj = start_i; pj < end_i; ++pj )
-            if( i < bonds->select.bond_list[pj].nbr ) {
+        for ( pj = start_i; pj < end_i; ++pj )
+            if ( i < bonds->select.bond_list[pj].nbr )
+            {
                 /* set the pointers */
                 j = bonds->select.bond_list[pj].nbr;
                 type_i = system->atoms[i].type;
@@ -68,15 +71,12 @@ void Bond_Energy( reax_system *system, control_params *control,
                 /* calculate the constants */
                 pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
                 exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
-                CEbo = -twbp->De_s * exp_be12 * 
-                    ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
+                CEbo = -twbp->De_s * exp_be12 *
+                       ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
 
                 /* calculate the Bond Energy */
-                ebond = 
-                    -twbp->De_s * bo_ij->BO_s * exp_be12 
-                    -twbp->De_p * bo_ij->BO_pi 
-                    -twbp->De_pp * bo_ij->BO_pi2;
-
+                ebond = -twbp->De_s * bo_ij->BO_s * exp_be12
+                    - twbp->De_p * bo_ij->BO_pi - twbp->De_pp * bo_ij->BO_pi2;
                 data->E_BE += ebond;
 
                 /* calculate derivatives of Bond Orders */
@@ -85,34 +85,36 @@ void Bond_Energy( reax_system *system, control_params *control,
                 bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
 
 #ifdef TEST_ENERGY
-                fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", 
-                        workspace->orig_id[i], workspace->orig_id[j], 
-                        // i+1, j+1, 
-                        bo_ij->BO, ebond/*, data->E_BE*/ );
-                /* fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", 
-                   workspace->orig_id[i], workspace->orig_id[j], 
+                fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n",
+                         workspace->orig_id[i], workspace->orig_id[j],
+                         // i+1, j+1,
+                         bo_ij->BO, ebond/*, data->E_BE*/ );
+                /* fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n",
+                   workspace->orig_id[i], workspace->orig_id[j],
                    CEbo, -twbp->De_p, -twbp->De_pp );*/
 #endif
 #ifdef TEST_FORCES
                 Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
-                Add_dBOpinpi2( system, lists, i, pj, 
-                        -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
-                        workspace->f_be, workspace->f_be );
+                Add_dBOpinpi2( system, lists, i, pj,
+                               -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp),
+                               workspace->f_be, workspace->f_be );
 #endif
 
                 /* Stabilisation terminal triple bond */
-                if( bo_ij->BO >= 1.00 ) {
-                    if( gp37 == 2 ||
-                            (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || 
-                            (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) {
+                if ( bo_ij->BO >= 1.00 )
+                {
+                    if ( gp37 == 2 ||
+                            (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) ||
+                            (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) )
+                    {
                         // ba = SQR(bo_ij->BO - 2.50);
                         exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
                         //oboa=abo(j1)-boa;
                         //obob=abo(j2)-boa;
-                        exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO));
-                        exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO));
+                        exphua1 = EXP(-gp3 * (workspace->total_bond_order[i] - bo_ij->BO));
+                        exphub1 = EXP(-gp3 * (workspace->total_bond_order[j] - bo_ij->BO));
                         //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2);
-                        exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j]));
+                        exphuov = EXP(gp4 * (workspace->Delta[i] + workspace->Delta[j]));
                         hulpov = 1.0 / (1.0 + 25.0 * exphuov);
 
                         estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
@@ -120,24 +122,22 @@ void Bond_Energy( reax_system *system, control_params *control,
                         //estrain(j2) = estrain(j2) + 0.50*estriph;
                         data->E_BE += estriph;
 
-                        decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * 
-                            ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
-                        decobdboua = -gp10 * exphu * hulpov * 
-                            (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
-                        decobdboub = -gp10 * exphu * hulpov * 
-                            (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+                        decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) *
+                                   ( gp3 - 2.0 * gp7 * (bo_ij->BO - 2.50) );
+                        decobdboua = -gp10 * exphu * hulpov *
+                                     (gp3 * exphua1 + 25.0 * gp4 * exphuov * hulpov * (exphua1 + exphub1));
+                        decobdboub = -gp10 * exphu * hulpov *
+                                     (gp3 * exphub1 + 25.0 * gp4 * exphuov * hulpov * (exphua1 + exphub1));
 
                         bo_ij->Cdbo += decobdbo;
                         workspace->CdDelta[i] += decobdboua;
                         workspace->CdDelta[j] += decobdboub;
-                        //loop_j ++;
-                        //fprintf (stderr, "incrementing loopj %d \n", loop_j);
 #ifdef TEST_ENERGY
-                        fprintf( out_control->ebond, 
-                                "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                                workspace->orig_id[i], workspace->orig_id[j],
-                                //i+1, j+1, 
-                                estriph, decobdbo, decobdboua, decobdboub );
+                        fprintf( out_control->ebond,
+                                 "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
+                                 workspace->orig_id[i], workspace->orig_id[j],
+                                 //i+1, j+1,
+                                 estriph, decobdbo, decobdboua, decobdboub );
 #endif
 #ifdef TEST_FORCES
                         Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
@@ -151,9 +151,9 @@ void Bond_Energy( reax_system *system, control_params *control,
 }
 
 
-void vdW_Coulomb_Energy( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+void vdW_Coulomb_Energy( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
     int  i, j, pj;
     int  start_i, end_i;
@@ -172,20 +172,22 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
     p_vdW1 = system->reaxprm.gp.l[28];
     p_vdW1i = 1.0 / p_vdW1;
-    far_nbrs = (*lists) + FAR_NBRS; 
+    far_nbrs = (*lists) + FAR_NBRS;
     e_ele = 0;
     e_vdW = 0;
     e_core = 0;
     de_core = 0;
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         start_i = Start_Index(i, far_nbrs);
         end_i   = End_Index(i, far_nbrs);
         // fprintf( stderr, "i: %d, start: %d, end: %d\n",
         //     i, start_i, end_i );
 
-        for( pj = start_i; pj < end_i; ++pj )
-            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
+        for ( pj = start_i; pj < end_i; ++pj )
+            if ( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut )
+            {
                 nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
                 j = nbr_pj->nbr;
                 r_ij = nbr_pj->d;
@@ -202,15 +204,16 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
                 Tap = Tap * r_ij + control->Tap1;
                 Tap = Tap * r_ij + control->Tap0;
 
-                dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
-                dTap = dTap * r_ij + 5*control->Tap5;
-                dTap = dTap * r_ij + 4*control->Tap4;
-                dTap = dTap * r_ij + 3*control->Tap3;
-                dTap = dTap * r_ij + 2*control->Tap2;
-                dTap += control->Tap1/r_ij;
+                dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6;
+                dTap = dTap * r_ij + 5 * control->Tap5;
+                dTap = dTap * r_ij + 4 * control->Tap4;
+                dTap = dTap * r_ij + 3 * control->Tap3;
+                dTap = dTap * r_ij + 2 * control->Tap2;
+                dTap += control->Tap1 / r_ij;
 
                 /*vdWaals Calculations*/
-                if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3) {
+                if (system->reaxprm.gp.vdw_type == 1 || system->reaxprm.gp.vdw_type == 3)
+                {
                     // shielding
                     powr_vdW1 = POW(r_ij, p_vdW1);
                     powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
@@ -219,35 +222,37 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
                     exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
                     exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
 
-                    data->E_vdW += e_vdW = 
-                        self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+                    data->E_vdW += e_vdW =
+                                       self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);
 
-                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-                        POW(r_ij, p_vdW1 - 2.0);
+                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
+                            POW(r_ij, p_vdW1 - 2.0);
 
-                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
-                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-                            (exp1 - exp2) * dfn13 );
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) -
+                                         Tap * twbp->D * (twbp->alpha / twbp->r_vdW) *
+                                         (exp1 - exp2) * dfn13 );
                 }
-                else{ // no shielding
+                else  // no shielding
+                {
                     exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
                     exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
 
-                    data->E_vdW += e_vdW = 
-                        self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+                    data->E_vdW += e_vdW =
+                                       self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);
 
-                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-                            (exp1 - exp2) );
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) -
+                                         Tap * twbp->D * (twbp->alpha / twbp->r_vdW) *
+                                         (exp1 - exp2) );
                 }
 
-                if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3) {
+                if (system->reaxprm.gp.vdw_type == 2 || system->reaxprm.gp.vdw_type == 3)
+                {
                     // innner wall
-                    e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+                    e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore)));
                     e_vdW += self_coef * Tap * e_core;
                     data->E_vdW += self_coef * Tap * e_core;
 
-                    de_core = -(twbp->acore/twbp->rcore) * e_core;
+                    de_core = -(twbp->acore / twbp->rcore) * e_core;
                     CEvd += self_coef * ( dTap * e_core + Tap * de_core );
                 }
 
@@ -257,24 +262,26 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
                 tmp = Tap / dr3gamij_3;
                 //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H
-                data->E_Ele += e_ele = 
-                    self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp;
+                data->E_Ele += e_ele =
+                                   self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp;
 
 
                 CEclmb = self_coef * C_ele * system->atoms[i].q * system->atoms[j].q *
-                    ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-                /*CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* 
+                         ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+                /*CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q*
                   ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;*/
 
 
-                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-                    rvec_ScaledAdd( system->atoms[i].f, 
-                            -(CEvd+CEclmb), nbr_pj->dvec );
-                    rvec_ScaledAdd( system->atoms[j].f, 
-                            +(CEvd+CEclmb), nbr_pj->dvec );
+                if ( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT )
+                {
+                    rvec_ScaledAdd( system->atoms[i].f,
+                                    -(CEvd + CEclmb), nbr_pj->dvec );
+                    rvec_ScaledAdd( system->atoms[j].f,
+                                    +(CEvd + CEclmb), nbr_pj->dvec );
                 }
-                else { // NPT, iNPT or sNPT
-                    /* for pressure coupling, terms not related to bond order 
+                else   // NPT, iNPT or sNPT
+                {
+                    /* for pressure coupling, terms not related to bond order
                        derivatives are added directly into pressure vector/tensor */
                     rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
 
@@ -284,47 +291,47 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
                     rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
                     rvec_Add( data->ext_press, ext_press );
 
-                    /*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)", 
+                    /*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)",
                       i,j,nbr_pj->rel_box[0],nbr_pj->rel_box[1],nbr_pj->rel_box[2] );
 
                       fprintf( stderr, "force(%f %f %f)", temp[0], temp[1], temp[2] );
 
-                      fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n",        
+                      fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n",
                       data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
 
-                    /* This part is intended for a fully-flexible box */          
-                    /* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec, 
+                    /* This part is intended for a fully-flexible box */
+                    /* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec,
                        system->atoms[i].x );
-                       rtensor_Scale( total_rtensor, 
+                       rtensor_Scale( total_rtensor,
                        F_C * -(CEvd + CEclmb), temp_rtensor );
-                       rvec_OuterProduct( temp_rtensor, 
+                       rvec_OuterProduct( temp_rtensor,
                        nbr_pj->dvec, system->atoms[j].x );
-                       rtensor_ScaledAdd( total_rtensor, 
+                       rtensor_ScaledAdd( total_rtensor,
                        F_C * +(CEvd + CEclmb), temp_rtensor );
 
                        if( nbr_pj->imaginary )
-                    // This is an external force due to an imaginary nbr
-                    rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
-                    else
-                    // This interaction is completely internal
-                    rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                       // This is an external force due to an imaginary nbr
+                       rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
+                       else
+                       // This interaction is completely internal
+                       rtensor_Add( data->flex_bar.P, total_rtensor ); */
                 }
 
 #ifdef TEST_ENERGY
                 rvec_MakeZero( temp );
                 rvec_ScaledAdd( temp, +CEvd, nbr_pj->dvec );
                 fprintf( out_control->evdw,
-                        "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                        //i+1, j+1,
-                        MIN( workspace->orig_id[i], workspace->orig_id[j] ), 
-                        MAX( workspace->orig_id[i], workspace->orig_id[j] ), 
-                        r_ij, e_vdW, temp[0], temp[1], temp[2]/*, data->E_vdW*/ );
+                         "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                         //i+1, j+1,
+                         MIN( workspace->orig_id[i], workspace->orig_id[j] ),
+                         MAX( workspace->orig_id[i], workspace->orig_id[j] ),
+                         r_ij, e_vdW, temp[0], temp[1], temp[2]/*, data->E_vdW*/ );
 
                 fprintf( out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                        MIN( workspace->orig_id[i], workspace->orig_id[j] ),
-                        MAX( workspace->orig_id[i], workspace->orig_id[j] ), 
-                        r_ij, system->atoms[i].q, system->atoms[j].q, 
-                        e_ele/*, data->E_Ele*/ );
+                         MIN( workspace->orig_id[i], workspace->orig_id[j] ),
+                         MAX( workspace->orig_id[i], workspace->orig_id[j] ),
+                         r_ij, system->atoms[i].q, system->atoms[j].q,
+                         e_ele/*, data->E_Ele*/ );
 #endif
 #ifdef TEST_FORCES
                 rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
@@ -337,13 +344,13 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
     // fclose( fout );
 
-    // fprintf( stderr, "nonbonded: ext_press (%24.15e %24.15e %24.15e)\n", 
+    // fprintf( stderr, "nonbonded: ext_press (%24.15e %24.15e %24.15e)\n",
     // data->ext_press[0], data->ext_press[1], data->ext_press[2] );
 }
 
 
-void LR_vdW_Coulomb( reax_system *system, control_params *control, 
-        int i, int j, real r_ij, LR_data *lr )
+void LR_vdW_Coulomb( reax_system *system, control_params *control,
+                     int i, int j, real r_ij, LR_data *lr )
 {
     real p_vdW1 = system->reaxprm.gp.l[28];
     real p_vdW1i = 1.0 / p_vdW1;
@@ -367,12 +374,12 @@ void LR_vdW_Coulomb( reax_system *system, control_params *control,
     Tap = Tap * r_ij + control->Tap1;
     Tap = Tap * r_ij + control->Tap0;
 
-    dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
-    dTap = dTap * r_ij + 5*control->Tap5;
-    dTap = dTap * r_ij + 4*control->Tap4;
-    dTap = dTap * r_ij + 3*control->Tap3;
-    dTap = dTap * r_ij + 2*control->Tap2;
-    dTap += control->Tap1/r_ij;
+    dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6;
+    dTap = dTap * r_ij + 5 * control->Tap5;
+    dTap = dTap * r_ij + 4 * control->Tap4;
+    dTap = dTap * r_ij + 3 * control->Tap3;
+    dTap = dTap * r_ij + 2 * control->Tap2;
+    dTap += control->Tap1 / r_ij;
 
 
     /* vdWaals calculations */
@@ -383,20 +390,21 @@ void LR_vdW_Coulomb( reax_system *system, control_params *control,
     exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
     exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
 
-    lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);        
+    lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
     /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\
-Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n",
-Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2), 
-powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
+       Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n",
+       Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2),
+       powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
 
     dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0);
 
-    lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) - 
-        Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+    lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) -
+               Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
 
     /*vdWaals Calculations*/
-    if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3)
-    { // shielding
+    if (system->reaxprm.gp.vdw_type == 1 || system->reaxprm.gp.vdw_type == 3)
+    {
+        // shielding
         powr_vdW1 = POW(r_ij, p_vdW1);
         powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
 
@@ -404,30 +412,32 @@ powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
         exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
         exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
 
-        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);        
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
 
-        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-            POW(r_ij, p_vdW1 - 2.0);
+        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
+                POW(r_ij, p_vdW1 - 2.0);
 
-        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
+                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
     }
-    else{ // no shielding
+    else  // no shielding
+    {
         exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
         exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
 
         lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
 
-        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
+                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
     }
 
-    if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3)
-    { // innner wall
-        e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+    if (system->reaxprm.gp.vdw_type == 2 || system->reaxprm.gp.vdw_type == 3)
+    {
+        // inner wall
+        e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore)));
         lr->e_vdW += Tap * e_core;
 
-        de_core = -(twbp->acore/twbp->rcore) * e_core;
+        de_core = -(twbp->acore / twbp->rcore) * e_core;
         lr->CEvd += dTap * e_core + Tap * de_core;
     }
 
@@ -439,10 +449,10 @@ powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
     lr->H = EV_to_KCALpMOL * tmp;
     lr->e_ele = C_ele * tmp;
     /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\
-Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n",
-i, system->atoms[i].type, j, system->atoms[j].type, 
-twbp->gamma, Tap, dr3gamij_3, 
-system->atoms[i].q, system->atoms[j].q ); */
+       Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n",
+       i, system->atoms[i].type, j, system->atoms[j].type,
+       twbp->gamma, Tap, dr3gamij_3,
+       system->atoms[i].q, system->atoms[j].q ); */
 
     lr->CEclmb = C_ele * ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
     /* fprintf( stdout, "%d %d\t%g\t%g  %g\t%g  %g\t%g  %g\n",
@@ -454,10 +464,9 @@ system->atoms[i].q, system->atoms[j].q ); */
 }
 
 
-void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
-        simulation_data *data, 
-        static_storage *workspace, list **lists, 
-        output_controls *out_control )
+void Tabulated_vdW_Coulomb_Energy( reax_system *system,
+        control_params *control, simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
     int i, j, pj, r, steps, update_freq, update_energies;
     int type_i, type_j, tmin, tmax;
@@ -474,13 +483,16 @@ void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
     update_freq = out_control->energy_update_freq;
     update_energies = update_freq > 0 && steps % update_freq == 0;
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         type_i  = system->atoms[i].type;
-        start_i = Start_Index(i,far_nbrs);
-        end_i   = End_Index(i,far_nbrs);
+        start_i = Start_Index(i, far_nbrs);
+        end_i   = End_Index(i, far_nbrs);
 
-        for( pj = start_i; pj < end_i; ++pj ) 
-            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            if ( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut )
+            {
                 nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
                 j      = nbr_pj->nbr;
                 type_j = system->atoms[j].type;
@@ -488,43 +500,46 @@ void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
                 self_coef = (i == j) ? 0.5 : 1.0;
                 tmin  = MIN( type_i, type_j );
                 tmax  = MAX( type_i, type_j );
-                t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); 
+                t = &( LR[ index_lr(tmin,tmax,system->reaxprm.num_atom_types) ] ); 
 
                 /* Cubic Spline Interpolation */
                 r = (int)(r_ij * t->inv_dx);
-                if( r == 0 )  ++r;
-                base = (real)(r+1) * t->dx;
+                if ( r == 0 )  ++r;
+                base = (real)(r + 1) * t->dx;
                 dif = r_ij - base;
                 //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif);
 
-                if( update_energies ) {
-                    e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
-                        t->vdW[r].a;
+                if ( update_energies )
+                {
+                    e_vdW = ((t->vdW[r].d * dif + t->vdW[r].c) * dif + t->vdW[r].b) * dif +
+                            t->vdW[r].a;
                     e_vdW *= self_coef;
 
-                    e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-                        t->ele[r].a;
+                    e_ele = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) * dif +
+                            t->ele[r].a;
                     e_ele *= self_coef * system->atoms[i].q * system->atoms[j].q;
 
                     data->E_vdW += e_vdW;
                     data->E_Ele += e_ele;
-                }    
+                }
 
-                CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
-                    t->CEvd[r].a;
+                CEvd = ((t->CEvd[r].d * dif + t->CEvd[r].c) * dif + t->CEvd[r].b) * dif +
+                       t->CEvd[r].a;
                 CEvd *= self_coef;
                 //CEvd = (3*t->vdW[r].d*dif + 2*t->vdW[r].c)*dif + t->vdW[r].b;
 
-                CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
-                    t->CEclmb[r].a;
+                CEclmb = ((t->CEclmb[r].d * dif + t->CEclmb[r].c) * dif + t->CEclmb[r].b) * dif +
+                         t->CEclmb[r].a;
                 CEclmb *= self_coef * system->atoms[i].q * system->atoms[j].q;
 
-                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                if ( control->ensemble == NVE || control->ensemble == NVT  || control->ensemble == bNVT)
+                {
                     rvec_ScaledAdd( system->atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec );
                     rvec_ScaledAdd( system->atoms[j].f, +(CEvd + CEclmb), nbr_pj->dvec );
                 }
-                else { // NPT, iNPT or sNPT
-                    /* for pressure coupling, terms not related to bond order 
+                else   // NPT, iNPT or sNPT
+                {
+                    /* for pressure coupling, terms not related to bond order
                        derivatives are added directly into pressure vector/tensor */
                     rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
                     rvec_ScaledAdd( system->atoms[i].f, -1., temp );
@@ -535,11 +550,11 @@ void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
 #ifdef TEST_ENERGY
                 fprintf(out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
-                        workspace->orig_id[i], workspace->orig_id[j], 
+                        workspace->orig_id[i], workspace->orig_id[j],
                         r_ij, e_vdW, data->E_vdW );
-                fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                fprintf(out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
                         workspace->orig_id[i], workspace->orig_id[j],
-                        r_ij, system->atoms[i].q, system->atoms[j].q, 
+                        r_ij, system->atoms[i].q, system->atoms[j].q,
                         e_ele, data->E_Ele );
 #endif
 #ifdef TEST_FORCES
@@ -549,23 +564,24 @@ void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
                 rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
 #endif
             }
+        }
     }
 }
 
 
 #if defined(OLD)
-    /* Linear extrapolation */
-    /*p     = (r_ij * t->inv_dx;
-      r     = (int) p;
-      prev  = &( t->y[r] );
-      next  = &( t->y[r+1] );
-
-      tmp    = p - r;
-      e_vdW  = self_coef * (prev->e_vdW + tmp*(next->e_vdW - prev->e_vdW ));
-      CEvd   = self_coef * (prev->CEvd  + tmp*(next->CEvd  - prev->CEvd  ));
-
-      e_ele  = self_coef * (prev->e_ele + tmp*(next->e_ele - prev->e_ele ));
-      e_ele  = e_ele  * system->atoms[i].q * system->atoms[j].q;
-      CEclmb = self_coef * (prev->CEclmb+tmp*(next->CEclmb - prev->CEclmb));
-      CEclmb = CEclmb * system->atoms[i].q * system->atoms[j].q;*/
+/* Linear interpolation */
+/*p     = r_ij * t->inv_dx;
+  r     = (int) p;
+  prev  = &( t->y[r] );
+  next  = &( t->y[r+1] );
+
+  tmp    = p - r;
+  e_vdW  = self_coef * (prev->e_vdW + tmp*(next->e_vdW - prev->e_vdW ));
+  CEvd   = self_coef * (prev->CEvd  + tmp*(next->CEvd  - prev->CEvd  ));
+
+  e_ele  = self_coef * (prev->e_ele + tmp*(next->e_ele - prev->e_ele ));
+  e_ele  = e_ele  * system->atoms[i].q * system->atoms[j].q;
+  CEclmb = self_coef * (prev->CEclmb+tmp*(next->CEclmb - prev->CEclmb));
+  CEclmb = CEclmb * system->atoms[i].q * system->atoms[j].q;*/
 #endif
diff --git a/PuReMD-GPU/src/vector.c b/PuReMD-GPU/src/vector.c
index 7cf06eb8e6cb1560b651b8b16a091f3a387cdb6c..e396344d173a6d5343faf9d675f48a8ea4e0ca04 100644
--- a/PuReMD-GPU/src/vector.c
+++ b/PuReMD-GPU/src/vector.c
@@ -21,53 +21,90 @@
 #include "vector.h"
 
 
-int Vector_isZero( real* v, int k )
+inline int Vector_isZero( const real * const v, const unsigned int k )
 {
-    for( --k; k>=0; --k )
-        if( fabs( v[k] ) > ALMOST_ZERO )
-            return 0;
+    unsigned int i;
 
-    return 1;
+    #pragma omp master
+    {
+        ret = TRUE;
+    }
+
+    #pragma omp barrier
+
+    #pragma omp for reduction(&&: ret) schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        if ( FABS( v[i] ) > ALMOST_ZERO )
+        {
+            ret = FALSE;
+        }
+    }
+
+    return ret;
 }
 
 
-void Vector_MakeZero( real *v, int k )
+inline void Vector_MakeZero( real * const v, const unsigned int k )
 {
-    for( --k; k>=0; --k )
-        v[k] = 0;
+    unsigned int i;
+
+    #pragma omp for schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        v[i] = ZERO;
+    }
 }
 
 
-void Vector_Copy( real* dest, real* v, int k )
+inline void Vector_Copy( real * const dest, const real * const v, const unsigned int k )
 {
-    for( --k; k>=0; --k )
-        dest[k] = v[k];
+    unsigned int i;
+
+    #pragma omp for schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] = v[i];
+    }
 }
 
 
-void Vector_Print( FILE *fout, char *vname, real *v, int k )
+void Vector_Print( FILE * const fout, const char * const vname, const real * const v,
+                   const unsigned int k )
 {
-    int i;
+    unsigned int i;
 
     fprintf( fout, "%s:\n", vname );
-    for( i = 0; i < k; ++i )
+    for ( i = 0; i < k; ++i )
+    {
         fprintf( fout, "%24.15e\n", v[i] );
+    }
     fprintf( fout, "\n" );
 }
 
 
-real Norm( real* v1, int k )
+inline real Norm( const real * const v1, const unsigned int k )
 {
-    real ret = 0;
+    unsigned int i;
+
+    #pragma omp master
+    {
+        ret2 = ZERO;
+    }
 
-    for( --k; k>=0; --k )
-        ret +=  SQR( v1[k] );
+    #pragma omp barrier
+
+    #pragma omp for reduction(+: ret2) schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        ret2 +=  SQR( v1[i] );
+    }
 
-    return SQRT( ret );
+    return SQRT( ret2 );
 }
 
 
-void rvec_Sum( rvec ret, rvec v1 ,rvec v2 )
+inline void rvec_Sum( rvec ret, const rvec v1 , const rvec v2 )
 {
     ret[0] = v1[0] + v2[0];
     ret[1] = v1[1] + v2[1];
@@ -75,13 +112,14 @@ void rvec_Sum( rvec ret, rvec v1 ,rvec v2 )
 }
 
 
-real rvec_ScaledDot( real c1, rvec v1, real c2, rvec v2 )
+inline real rvec_ScaledDot( const real c1, const rvec v1,
+        const real c2, const rvec v2 )
 {
-    return (c1*c2) * (v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]);
+    return (c1 * c2) * (v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]);
 }
 
 
-void rvec_Multiply( rvec r, rvec v1, rvec v2 )
+inline void rvec_Multiply( rvec r, const rvec v1, const rvec v2 )
 {
     r[0] = v1[0] * v2[0];
     r[1] = v1[1] * v2[1];
@@ -89,7 +127,7 @@ void rvec_Multiply( rvec r, rvec v1, rvec v2 )
 }
 
 
-void rvec_Divide( rvec r, rvec v1, rvec v2 )
+inline void rvec_Divide( rvec r, const rvec v1, const rvec v2 )
 {
     r[0] = v1[0] / v2[0];
     r[1] = v1[1] / v2[1];
@@ -97,7 +135,7 @@ void rvec_Divide( rvec r, rvec v1, rvec v2 )
 }
 
 
-void rvec_iDivide( rvec r, rvec v1, ivec v2 )
+inline void rvec_iDivide( rvec r, const rvec v1, const ivec v2 )
 {
     r[0] = v1[0] / v2[0];
     r[1] = v1[1] / v2[1];
@@ -105,7 +143,7 @@ void rvec_iDivide( rvec r, rvec v1, ivec v2 )
 }
 
 
-void rvec_Invert( rvec r, rvec v )
+inline void rvec_Invert( rvec r, const rvec v )
 {
     r[0] = 1. / v[0];
     r[1] = 1. / v[1];
@@ -113,154 +151,189 @@ void rvec_Invert( rvec r, rvec v )
 }
 
 
-void rvec_OuterProduct( rtensor r, rvec v1, rvec v2 )
+inline void rvec_OuterProduct( rtensor r, const rvec v1, const rvec v2 )
 {
-    int i, j;
+    unsigned int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             r[i][j] = v1[i] * v2[j];
+        }
+    }
 }
 
 
-
-int rvec_isZero( rvec v )
+inline int rvec_isZero( const rvec v )
 {
-    if( fabs(v[0]) > ALMOST_ZERO || 
-            fabs(v[1]) > ALMOST_ZERO || 
+    if ( fabs(v[0]) > ALMOST_ZERO ||
+            fabs(v[1]) > ALMOST_ZERO ||
             fabs(v[2]) > ALMOST_ZERO )
-        return 0;
-    return 1;
+    {
+        return FALSE;
+    }
+    return TRUE;
 }
 
 
-void rtensor_Multiply( rtensor ret, rtensor m1, rtensor m2 )
+inline void rtensor_Multiply( rtensor ret, rtensor m1, rtensor m2 )
 {
-    int i, j, k;
+    unsigned int i, j, k;
     rtensor temp;
 
     // check if the result matrix is the same as one of m1, m2.
-    // if so, we cannot modify the contents of m1 or m2, so 
+    // if so, we cannot modify the contents of m1 or m2, so
     // we have to use a temp matrix.
-    if( ret == m1 || ret == m2 )
+    if ( ret == m1 || ret == m2 )
     {
-        for( i = 0; i < 3; ++i )
-            for( j = 0; j < 3; ++j )
+        for ( i = 0; i < 3; ++i )
+            for ( j = 0; j < 3; ++j )
             {
-                temp[i][j] = 0;        
-                for( k = 0; k < 3; ++k )
+                temp[i][j] = 0;
+                for ( k = 0; k < 3; ++k )
                     temp[i][j] += m1[i][k] * m2[k][j];
             }
 
-        for( i = 0; i < 3; ++i )
-            for( j = 0; j < 3; ++j )
-                ret[i][j] = temp[i][j];    
+        for ( i = 0; i < 3; ++i )
+            for ( j = 0; j < 3; ++j )
+                ret[i][j] = temp[i][j];
     }
     else
     {
-        for( i = 0; i < 3; ++i )
-            for( j = 0; j < 3; ++j )
+        for ( i = 0; i < 3; ++i )
+            for ( j = 0; j < 3; ++j )
             {
-                ret[i][j] = 0;        
-                for( k = 0; k < 3; ++k )
+                ret[i][j] = 0;
+                for ( k = 0; k < 3; ++k )
                     ret[i][j] += m1[i][k] * m2[k][j];
             }
     }
 }
 
 
-void rtensor_MatVec( rvec ret, rtensor m, rvec v )
+inline void rtensor_MatVec( rvec ret, rtensor m, const rvec v )
 {
-    int i;
+    unsigned int i;
     rvec temp;
 
-    // if ret is the same vector as v, we cannot modify the 
+    // if ret is the same vector as v, we cannot modify the
     // contents of v until all computation is finished.
-    if( ret == v )
+    if ( ret == v )
     {
-        for( i = 0; i < 3; ++i )
+        for ( i = 0; i < 3; ++i )
+        {
             temp[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
+        }
 
-        for( i = 0; i < 3; ++i )
+        for ( i = 0; i < 3; ++i )
+        {
             ret[i] = temp[i];
+        }
     }
     else
     {
-        for( i = 0; i < 3; ++i )
+        for ( i = 0; i < 3; ++i )
+        {
             ret[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
+        }
     }
 }
 
 
-void rtensor_Scale( rtensor ret, real c, rtensor m )
+inline void rtensor_Scale( rtensor ret, const real c, rtensor m )
 {
-    int i, j;
+    unsigned int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] = c * m[i][j];
+        }
+    }
 }
 
 
-void rtensor_Add( rtensor ret, rtensor t )
+inline void rtensor_Add( rtensor ret, rtensor t ) /* ret[i][j] += t[i][j] for all 3x3 entries */
 {
-    int i, j;
+    unsigned int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] += t[i][j];
+        }
+    }
 }
 
 
-void rtensor_ScaledAdd( rtensor ret, real c, rtensor t )
+inline void rtensor_ScaledAdd( rtensor ret, const real c, rtensor t )
 {
-    int i, j;
+    unsigned int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] += c * t[i][j];
+        }
+    }
 }
 
 
-void rtensor_Sum( rtensor ret, rtensor t1, rtensor t2 )
+inline void rtensor_Sum( rtensor ret, rtensor t1, rtensor t2 )
 {
-    int i, j;
+    unsigned int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] = t1[i][j] + t2[i][j];
+        }
+    }
 }
 
 
-void rtensor_ScaledSum( rtensor ret, real c1, rtensor t1, 
-        real c2, rtensor t2 )
+inline void rtensor_ScaledSum( rtensor ret, const real c1, rtensor t1,
+                               const real c2, rtensor t2 )
 {
-    int i, j;
+    unsigned int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] = c1 * t1[i][j] + c2 * t2[i][j];
+        }
+    }
 }
 
 
-void rtensor_Copy( rtensor ret, rtensor t )
+inline void rtensor_Copy( rtensor ret, rtensor t )
 {
-    int i, j;
+    unsigned int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] = t[i][j];
+        }
+    }
 }
 
 
-void rtensor_Identity( rtensor t )
+inline void rtensor_Identity( rtensor t )
 {
     t[0][0] = t[1][1] = t[2][2] = 1;
     t[0][1] = t[0][2] = t[1][0] = t[1][2] = t[2][0] = t[2][1] = ZERO;
 }
 
 
-void rtensor_MakeZero( rtensor t )
+inline void rtensor_MakeZero( rtensor t )
 {
     t[0][0] = t[0][1] = t[0][2] = ZERO;
     t[1][0] = t[1][1] = t[1][2] = ZERO;
@@ -268,50 +341,58 @@ void rtensor_MakeZero( rtensor t )
 }
 
 
-void rtensor_Transpose( rtensor ret, rtensor t )
+inline void rtensor_Transpose( rtensor ret, rtensor t )
 {
-    ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2];
-    ret[0][1] = t[1][0], ret[0][2] = t[2][0];
-    ret[1][0] = t[0][1], ret[1][2] = t[2][1];
-    ret[2][0] = t[0][2], ret[2][1] = t[1][2];
+    ret[0][0] = t[0][0];
+    ret[1][1] = t[1][1];
+    ret[2][2] = t[2][2];
+
+    ret[0][1] = t[1][0];
+    ret[0][2] = t[2][0];
+
+    ret[1][0] = t[0][1];
+    ret[1][2] = t[2][1];
+
+    ret[2][0] = t[0][2];
+    ret[2][1] = t[1][2];
 }
 
 
-real rtensor_Det( rtensor t )
+inline real rtensor_Det( rtensor t )
 {
     return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) +
-            t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
-            t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
+             t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
+             t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
 }
 
 
-real rtensor_Trace( rtensor t )
+inline real rtensor_Trace( rtensor t )
 {
     return (t[0][0] + t[1][1] + t[2][2]);
 }
 
 
-void Print_rTensor(FILE* fp, rtensor t)
+void Print_rTensor(FILE * const fp, rtensor t)
 {
-    int i, j;
+    unsigned int i, j;
 
-    for (i=0; i < 3; i++)
+    for (i = 0; i < 3; i++)
     {
-        fprintf(fp,"[");
-        for (j=0; j < 3; j++)
-            fprintf(fp,"%8.3f,\t",t[i][j]);
-        fprintf(fp,"]\n");
+        fprintf(fp, "[");
+        for (j = 0; j < 3; j++)
+            fprintf(fp, "%8.3f,\t", t[i][j]);
+        fprintf(fp, "]\n");
     }
 }
 
 
-void ivec_MakeZero( ivec v )
+inline void ivec_MakeZero( ivec v )
 {
     v[0] = v[1] = v[2] = 0;
 }
 
 
-void ivec_rScale( ivec dest, real C, rvec src )
+inline void ivec_rScale( ivec dest, const real C, const rvec src )
 {
     dest[0] = (int)(C * src[0]);
     dest[1] = (int)(C * src[1]);
@@ -319,20 +400,22 @@ void ivec_rScale( ivec dest, real C, rvec src )
 }
 
 
-int ivec_isZero( ivec v )
+inline int ivec_isZero( const ivec v )
 {
-    if( v[0]==0 && v[1]==0 && v[2]==0 )
-        return 1;
-    return 0;
+    if ( v[0] == 0 && v[1] == 0 && v[2] == 0 )
+    {
+        return TRUE;
+    }
+    return FALSE;
 }
 
 
-int ivec_isEqual( ivec v1, ivec v2 )
+inline int ivec_isEqual( const ivec v1, const ivec v2 )
 {
-    if( v1[0]==v2[0] && v1[1]==v2[1] && v1[2]==v2[2] )
-        return 1;
+    if ( v1[0] == v2[0] && v1[1] == v2[1] && v1[2] == v2[2] )
+    {
+        return TRUE;
+    }
 
-    return 0;
+    return FALSE;
 }
-
-
diff --git a/PuReMD-GPU/src/vector.h b/PuReMD-GPU/src/vector.h
index e1111e514928e79fc79197a0f2486d5eefb1cfa3..79748544fb8349bb797efe246e3430561fefef14 100644
--- a/PuReMD-GPU/src/vector.h
+++ b/PuReMD-GPU/src/vector.h
@@ -26,72 +26,85 @@
 #include "random.h"
 
 
+/* global to make OpenMP shared (Vector_isZero). NOTE(review): this is a definition in a header, not an extern declaration -- confirm it links when several sources include this file (e.g. with -fno-common). */
+unsigned int ret;
+/* global to make OpenMP shared (Dot, Norm): both zero it on entry and accumulate into it via reduction(+: ret2). NOTE(review): same header-definition caveat as for ret. */
+real ret2;
+
+
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-int  Vector_isZero( real*, int );
-void Vector_MakeZero( real*, int );
-void Vector_Copy( real*, real*, int );
-//void Vector_Scale( real*, real, real*, int );
-//void Vector_Sum( real*, real, real*, real, real*, int );
-//void Vector_Add( real*, real, real*, int );
-void Vector_Print( FILE*, char*, real*, int );
-real Norm( real*, int );
-
-void rvec_Sum( rvec, rvec, rvec );
-real rvec_ScaledDot( real, rvec, real, rvec );
-void rvec_Multiply( rvec, rvec, rvec );
-void rvec_Divide( rvec, rvec, rvec );
-void rvec_iDivide( rvec, rvec, ivec );
-void rvec_Invert( rvec, rvec );
-void rvec_OuterProduct( rtensor, rvec, rvec );
-int  rvec_isZero( rvec );
+int Vector_isZero( const real * const, const unsigned int );
+void Vector_MakeZero( real * const, const unsigned int );
+void Vector_Copy( real * const, const real * const, const unsigned int );
+void Vector_Print( FILE * const, const char * const, const real * const, const unsigned int );
+real Norm( const real * const, const unsigned int );
+
+void rvec_Sum( rvec, const rvec, const rvec );
+real rvec_ScaledDot( const real, const rvec, const real, const rvec );
+void rvec_Multiply( rvec, const rvec, const rvec );
+void rvec_Divide( rvec, const rvec, const rvec );
+void rvec_iDivide( rvec, const rvec, const ivec );
+void rvec_Invert( rvec, const rvec );
+void rvec_OuterProduct( rtensor, const rvec, const rvec );
+int rvec_isZero( const rvec );
 
 void rtensor_MakeZero( rtensor );
 void rtensor_Multiply( rtensor, rtensor, rtensor );
-void rtensor_MatVec( rvec, rtensor, rvec );
-void rtensor_Scale( rtensor, real, rtensor );
+void rtensor_MatVec( rvec, rtensor, const rvec );
+void rtensor_Scale( rtensor, const real, rtensor );
 void rtensor_Add( rtensor, rtensor );
-void rtensor_ScaledAdd( rtensor, real, rtensor );
+void rtensor_ScaledAdd( rtensor, const real, rtensor );
 void rtensor_Sum( rtensor, rtensor, rtensor );
-void rtensor_ScaledSum( rtensor, real, rtensor, real, rtensor );
-void rtensor_Scale( rtensor, real, rtensor );
+void rtensor_ScaledSum( rtensor, const real, rtensor, const real, rtensor );
+void rtensor_Scale( rtensor, const real, rtensor );
 void rtensor_Copy( rtensor, rtensor );
 void rtensor_Identity( rtensor );
 void rtensor_Transpose( rtensor, rtensor );
 real rtensor_Det( rtensor );
 real rtensor_Trace( rtensor );
 
-void Print_rTensor(FILE*, rtensor);
+void Print_rTensor(FILE * const, rtensor);
 
-int  ivec_isZero( ivec );
-int  ivec_isEqual( ivec, ivec );
+int ivec_isZero( const ivec );
+int ivec_isEqual( const ivec, const ivec );
 void ivec_MakeZero( ivec );
-void ivec_rScale( ivec, real, rvec );
+void ivec_rScale( ivec, const real, const rvec );
 
 
-static inline HOST_DEVICE real Dot( real* v1, real* v2, int k )
+static inline HOST_DEVICE real Dot( const real * const v1, const real * const v2, const unsigned int k )
 {
-    real ret = 0;
+    unsigned int i;
+    real sum; /* per-thread copy of the result */
 
-    for ( --k; k >= 0; --k )
-        ret +=  v1[k] * v2[k];
+    #pragma omp single
+    {
+        ret2 = ZERO;
+    } /* the implicit barrier ending single orders the reset before the loop below */
 
-    return ret;
+    #pragma omp for reduction(+: ret2) schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        ret2 += v1[i] * v2[i];
+    }
+    /* ret2 is shared: copy it, then wait, so a later call zeroing ret2 cannot change what is returned here */
+    sum = ret2;
+    #pragma omp barrier
+    return sum;
 }
 
 
-/////////////////////////////
-//rvec functions
-/////////////////////////////
 static inline HOST_DEVICE void rvec_MakeZero( rvec v )
 {
-    v[0] = v[1] = v[2] = ZERO;
+    v[0] = ZERO;
+    v[1] = ZERO;
+    v[2] = ZERO;
 }
 
 
-static inline HOST_DEVICE void rvec_Add( rvec ret, rvec v )
+static inline HOST_DEVICE void rvec_Add( rvec ret, const rvec v )
 {
     ret[0] += v[0];
     ret[1] += v[1];
@@ -99,13 +112,15 @@ static inline HOST_DEVICE void rvec_Add( rvec ret, rvec v )
 }
 
 
-static inline HOST_DEVICE void rvec_Copy( rvec dest, rvec src )
+static inline HOST_DEVICE void rvec_Copy( rvec dest, const rvec src )
 {
-    dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
+    dest[0] = src[0];
+    dest[1] = src[1];
+    dest[2] = src[2];
 }
 
 
-static inline HOST_DEVICE void rvec_Cross( rvec ret, rvec v1, rvec v2 )
+static inline HOST_DEVICE void rvec_Cross( rvec ret, const rvec v1, const rvec v2 )
 {
     ret[0] = v1[1] * v2[2] - v1[2] * v2[1];
     ret[1] = v1[2] * v2[0] - v1[0] * v2[2];
@@ -113,13 +128,16 @@ static inline HOST_DEVICE void rvec_Cross( rvec ret, rvec v1, rvec v2 )
 }
 
 
-static inline HOST_DEVICE void rvec_ScaledAdd( rvec ret, real c, rvec v )
+static inline HOST_DEVICE void rvec_ScaledAdd( rvec ret, const real c, const rvec v )
 {
-    ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2];
+    ret[0] += c * v[0];
+    ret[1] += c * v[1];
+    ret[2] += c * v[2];
 }
 
 
-static inline HOST_DEVICE void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rvec v2 )
+static inline HOST_DEVICE void rvec_ScaledSum( rvec ret, const real c1, const rvec v1,
+        const real c2, const rvec v2 )
 {
     ret[0] = c1 * v1[0] + c2 * v2[0];
     ret[1] = c1 * v1[1] + c2 * v2[1];
@@ -135,25 +153,27 @@ static inline HOST_DEVICE void rvec_Random( rvec v )
 }
 
 
-static inline HOST_DEVICE real rvec_Norm_Sqr( rvec v )
+static inline HOST_DEVICE real rvec_Norm_Sqr( const rvec v )
 {
     return SQR(v[0]) + SQR(v[1]) + SQR(v[2]);
 }
 
 
-static inline HOST_DEVICE void rvec_Scale( rvec ret, real c, rvec v )
+static inline HOST_DEVICE void rvec_Scale( rvec ret, const real c, const rvec v )
 {
-    ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2];
+    ret[0] = c * v[0];
+    ret[1] = c * v[1];
+    ret[2] = c * v[2];
 }
 
 
-static inline HOST_DEVICE real rvec_Dot( rvec v1, rvec v2 )
+static inline HOST_DEVICE real rvec_Dot( const rvec v1, const rvec v2 )
 {
     return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
 }
 
 
-static inline HOST_DEVICE void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
+static inline HOST_DEVICE void rvec_iMultiply( rvec r, const ivec v1, const rvec v2 )
 {
     r[0] = v1[0] * v2[0];
     r[1] = v1[1] * v2[1];
@@ -161,22 +181,21 @@ static inline HOST_DEVICE void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
 }
 
 
-static inline HOST_DEVICE real rvec_Norm( rvec v )
+static inline HOST_DEVICE real rvec_Norm( const rvec v )
 {
     return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) );
 }
 
 
-/////////////////
-//ivec functions
-/////////////////
-static inline HOST_DEVICE void ivec_Copy( ivec dest , ivec src )
+static inline HOST_DEVICE void ivec_Copy( ivec dest , const ivec src )
 {
-    dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
+    dest[0] = src[0];
+    dest[1] = src[1];
+    dest[2] = src[2];
 }
 
 
-static inline HOST_DEVICE void ivec_Scale( ivec dest, real C, ivec src )
+static inline HOST_DEVICE void ivec_Scale( ivec dest, const real C, const ivec src )
 {
     dest[0] = C * src[0];
     dest[1] = C * src[1];
@@ -184,7 +203,7 @@ static inline HOST_DEVICE void ivec_Scale( ivec dest, real C, ivec src )
 }
 
 
-static inline HOST_DEVICE void ivec_Sum( ivec dest, ivec v1, ivec v2 )
+static inline HOST_DEVICE void ivec_Sum( ivec dest, const ivec v1, const ivec v2 )
 {
     dest[0] = v1[0] + v2[0];
     dest[1] = v1[1] + v2[1];
@@ -192,27 +211,43 @@ static inline HOST_DEVICE void ivec_Sum( ivec dest, ivec v1, ivec v2 )
 }
 
 
-/////////////////
-//vector functions
-/////////////////
-static inline HOST_DEVICE void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
+static inline HOST_DEVICE void Vector_Sum( real * const dest, const real c,
+        const real * const v, const real d, const real * const y,
+        const unsigned int k )
 {
-    for (k--; k >= 0; k--)
-        dest[k] = c * v[k] + d * y[k];
+    unsigned int i;
+
+    #pragma omp for schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] = c * v[i] + d * y[i];
+    }
 }
 
 
-static inline HOST_DEVICE void Vector_Scale( real* dest, real c, real* v, int k )
+static inline HOST_DEVICE void Vector_Scale( real * const dest, const real c,
+        const real * const v, const unsigned int k )
 {
-    for (k--; k >= 0; k--)
-        dest[k] = c * v[k];
+    unsigned int i;
+
+    #pragma omp for schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] = c * v[i];
+    }
 }
 
 
-static inline HOST_DEVICE void Vector_Add( real* dest, real c, real* v, int k )
+static inline HOST_DEVICE void Vector_Add( real * const dest, const real c,
+        const real * const v, const unsigned int k )
 {
-    for (k--; k >= 0; k--)
-        dest[k] += c * v[k];
+    unsigned int i;
+
+    #pragma omp for schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] += c * v[i];
+    }
 }
 
 #ifdef __cplusplus
diff --git a/README.md b/README.md
index 442146a67a8a5c20732075c6daf7c541aa72e5f6..abdadbab72c5d297de9a8559471a036ee229c001 100644
--- a/README.md
+++ b/README.md
@@ -7,5 +7,5 @@ Files from the [Purdue Reactive Molecular Dynamics](https://www.cs.purdue.edu/pu
 Roughly by target platform
 - [Serial](https://www.cs.purdue.edu/puremd/docs/80859.pdf)
 - [MPI (message passing interface)](https://www.cs.purdue.edu/puremd/docs/Parallel-Reactive-Molecular-Dynamics.pdf)
-- [CUDA (single GPU)](http://dx.doi.org/10.1016/j.jcp.2014.04.035) (single GPU)
+- [CUDA (single GPU)](http://dx.doi.org/10.1016/j.jcp.2014.04.035)
 - [CUDA+MPI (multi-GPU)](https://www.cs.purdue.edu/puremd/docs/pgpuremd.pdf)
diff --git a/configure.ac b/configure.ac
index 2488af52296535e6ec37ca3c3d687e7f905d78a7..659ab4575e6565b17a6624d586145077ee98a47a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -46,23 +46,25 @@ AC_ARG_ENABLE([mpi-gpu],
 			      [enable MPI+CUDA (multi GPU) support @<:@default: no@:>@])],
 	      [pack_mpi_gpu_enabled=${enableval}], [pack_mpi_gpu_enabled=no])
 
-if test "x${pack_serial_enabled}" = "xyes" || test "x${pack_openmp_enabled}" = "xyes"; then
-	AC_CONFIG_SUBDIRS([sPuReMD])
+if test "x${pack_serial_enabled}" = "xyes" || test "x${pack_openmp_enabled}" = "xyes" || test "x${pack_gpu_enabled}" = "xyes"; then
+	AC_CONFIG_SUBDIRS([PuReMD-GPU])
 	if test "x${pack_serial_enabled}" = "xyes" || test "x${pack_openmp_enabled}" != "xyes"; then
 		export BUILD_OPENMP="no"
 	else
-		export BUILD_OPENMP="yes"
+		test "x${pack_gpu_enabled}" = "xyes" || export BUILD_OPENMP="yes"
+	fi
+	# Export BUILD_GPU whenever GPU support is requested and not only when OpenMP is enabled as well.
+	if test "x${pack_gpu_enabled}" = "xyes"; then
+		export BUILD_GPU="yes"
 	fi
 fi
 AM_CONDITIONAL([BUILD_S_OMP], [test "x${pack_serial_enabled}" = "xyes" || test "x${pack_openmp_enabled}" = "xyes"])
+AM_CONDITIONAL([BUILD_GPU], [test "x${pack_gpu_enabled}" = "xyes"])
+
 if test "x${pack_mpi_enabled}" = "xyes"; then
 	AC_CONFIG_SUBDIRS([PuReMD])
 fi
 AM_CONDITIONAL([BUILD_MPI], [test "x${pack_mpi_enabled}" = "xyes"])
-if test "x${pack_gpu_enabled}" = "xyes"; then
-	AC_CONFIG_SUBDIRS([PuReMD-GPU])
-fi
-AM_CONDITIONAL([BUILD_GPU], [test "x${pack_gpu_enabled}" = "xyes"])
 if test "x${pack_mpi_not_gpu_enabled}" = "xyes" || test "x${pack_mpi_gpu_enabled}" = "xyes"; then
 	AC_CONFIG_SUBDIRS([PG-PuReMD])
 	if test "x${pack_mpi_not_gpu_enabled}" = "xyes" || test "x${pack_mpi_gpu_enabled}" != "xyes"; then
diff --git a/sPuReMD/Makefile.am b/sPuReMD/Makefile.am
index 7c986471c7759f44ea3cd9aea126c246929f1647..12cffee9894677d37875b1d6dc9576d723447f21 100644
--- a/sPuReMD/Makefile.am
+++ b/sPuReMD/Makefile.am
@@ -4,7 +4,7 @@ bin_PROGRAMS = bin/spuremd
 bin_spuremd_SOURCES = src/ffield.c src/grid.c src/list.c src/lookup.c src/print_utils.c \
 		  src/reset_utils.c src/restart.c src/random.c src/tool_box.c src/traj.c \
 		  src/vector.c src/allocate.c src/analyze.c src/box.c src/system_props.c src/control.c \
-		  src/geo_tools.c src/neighbors.c src/lin_alg.c src/QEq.c src/bond_orders.c \
+		  src/geo_tools.c src/neighbors.c src/lin_alg.c src/qeq.c src/bond_orders.c \
 		  src/single_body_interactions.c src/two_body_interactions.c \
 		  src/three_body_interactions.c src/four_body_interactions.c src/forces.c \
 		  src/integrate.c src/init_md.c src/testmd.c 
@@ -12,7 +12,7 @@ bin_spuremd_SOURCES = src/ffield.c src/grid.c src/list.c src/lookup.c src/print_
 include_HEADERS = src/mytypes.h src/ffield.h src/grid.h src/list.h src/lookup.h src/print_utils.h \
 		  src/reset_utils.h src/restart.h src/random.h src/tool_box.h src/traj.h \
 		  src/vector.h src/allocate.h src/analyze.h src/box.h src/system_props.h src/control.h \
-		  src/geo_tools.h src/neighbors.h src/lin_alg.h src/QEq.h src/bond_orders.h \
+		  src/geo_tools.h src/neighbors.h src/lin_alg.h src/qeq.h src/bond_orders.h \
 		  src/single_body_interactions.h src/two_body_interactions.h \
 		  src/three_body_interactions.h src/four_body_interactions.h src/forces.h \
 		  src/integrate.h src/init_md.h
diff --git a/sPuReMD/src/forces.c b/sPuReMD/src/forces.c
index 9108a8dd4026e001716f707302e40d671d48b838..5a0585685b7af523c9b129d6ec57cd3589675edc 100644
--- a/sPuReMD/src/forces.c
+++ b/sPuReMD/src/forces.c
@@ -20,6 +20,7 @@
   ----------------------------------------------------------------------*/
 
 #include "forces.h"
+
 #include "box.h"
 #include "bond_orders.h"
 #include "single_body_interactions.h"
@@ -29,7 +30,7 @@
 #include "list.h"
 #include "print_utils.h"
 #include "system_props.h"
-#include "QEq.h"
+#include "qeq.h"
 #include "vector.h"
 
 
diff --git a/sPuReMD/src/integrate.c b/sPuReMD/src/integrate.c
index 142863cad18291a04f4f56fc364c73cf8fdabd3b..f9768c49bc89915346645f6134e9e346c845c2d6 100644
--- a/sPuReMD/src/integrate.c
+++ b/sPuReMD/src/integrate.c
@@ -20,13 +20,14 @@
   ----------------------------------------------------------------------*/
 
 #include "integrate.h"
+
 #include "allocate.h"
 #include "box.h"
 #include "forces.h"
 #include "grid.h"
 #include "neighbors.h"
 #include "print_utils.h"
-#include "QEq.h"
+#include "qeq.h"
 #include "reset_utils.h"
 #include "restart.h"
 #include "system_props.h"
@@ -34,7 +35,6 @@
 #include "list.h"
 
 
-
 void Velocity_Verlet_NVE(reax_system* system, control_params* control,
                          simulation_data *data, static_storage *workspace,
                          list **lists, output_controls *out_control )
diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h
index fe2d644cae630be6414944989b52adae8a6e1d61..e5a468ffb1669e53e638972b82daad9f7fcc07d0 100644
--- a/sPuReMD/src/lin_alg.h
+++ b/sPuReMD/src/lin_alg.h
@@ -19,8 +19,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __GMRES_H_
-#define __GMRES_H_
+#ifndef __LIN_ALG_H_
+#define __LIN_ALG_H_
 
 #include "mytypes.h"
 
diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h
index 69441bb72b7499b663a506dc8c95d40e56473a78..9d741f256799b99dc012f44a30070d3893546782 100644
--- a/sPuReMD/src/mytypes.h
+++ b/sPuReMD/src/mytypes.h
@@ -37,7 +37,7 @@
 #include "zlib.h"
 
 #ifdef _OPENMP
-#include <omp.h>
+  #include <omp.h>
 #endif
 
 //#define DEBUG_FOCUS
@@ -51,6 +51,7 @@
 #define TRUE  1
 #define FALSE 0
 
+#define LOG    log
 #define EXP    exp
 #define SQRT   sqrt
 #define POW    pow
@@ -71,10 +72,10 @@
 /* NaN IEEE 754 representation for C99 in math.h
  * Note: function choice must match REAL typedef below */
 #ifdef NAN
-#define IS_NAN_REAL(a) (isnan(a))
+  #define IS_NAN_REAL(a) (isnan(a))
 #else
-#warn "No support for NaN"
-#define NAN_REAL(a) (0)
+  #warning "No support for NaN"
+  #define IS_NAN_REAL(a) (0) /* same macro name as the NAN branch; without NaN support nothing is reported as NaN */
 #endif
 
 #define PI            3.14159265
@@ -948,20 +949,21 @@ typedef struct
 
 
 typedef void (*interaction_function)(reax_system*, control_params*,
-                                     simulation_data*, static_storage*,
-                                     list**, output_controls*);
+        simulation_data*, static_storage*, list**, output_controls*);
+
 interaction_function Interaction_Functions[NO_OF_INTERACTIONS];
 
 typedef void (*evolve_function)(reax_system*, control_params*,
-                                simulation_data*, static_storage*,
-                                list**, output_controls*);
+        simulation_data*, static_storage*,
+        list**, output_controls*);
 
 typedef real (*lookup_function)(real);
+
 lookup_table Exp, Sqrt, Cube_Root, Four_Third_Root, Cos, Sin, ACos;
 LR_lookup_table **LR;
 
-
 typedef void (*get_far_neighbors_function)(rvec, rvec, simulation_box*,
-        control_params*, far_neighbor_data*,
-        int*);
+        control_params*, far_neighbor_data*, int*);
+
+
 #endif
diff --git a/sPuReMD/src/QEq.c b/sPuReMD/src/qeq.c
similarity index 99%
rename from sPuReMD/src/QEq.c
rename to sPuReMD/src/qeq.c
index 026a3ae1a84e4912958737161818aea77080da44..be99a1fd42194fc18b467a913f7a74485c125bec 100644
--- a/sPuReMD/src/QEq.c
+++ b/sPuReMD/src/qeq.c
@@ -19,7 +19,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "QEq.h"
+#include "qeq.h"
 
 #include "allocate.h"
 #include "list.h"
diff --git a/sPuReMD/src/QEq.h b/sPuReMD/src/qeq.h
similarity index 100%
rename from sPuReMD/src/QEq.h
rename to sPuReMD/src/qeq.h
diff --git a/sPuReMD/src/random.c b/sPuReMD/src/random.c
index f3a5096c65485111fb5cba7321976518a2e42972..9b09e7526b7a8418470cbf8c1b45bd1940dcbfa9 100644
--- a/sPuReMD/src/random.c
+++ b/sPuReMD/src/random.c
@@ -19,7 +19,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include <random.h>
+#include "random.h"
+
 
 /* System random number generator used linear congruance method with
    large periodicity for generation of pseudo random number. function
@@ -53,5 +54,5 @@ double GRandom(double mean, double sigma)
         rsq = v1 * v1 + v2 * v2;
     }
 
-    return mean + v1 * sigma * sqrt(-2.0 * log(rsq) / rsq);
+    return mean + v1 * sigma * SQRT(-2.0 * LOG(rsq) / rsq);
 }