diff --git a/.gitignore b/.gitignore
index c07309633e9f7eb78b1f60a5ce45a617a8a79f8c..a7a8c79a10bd7655faf8017b8939920efe735fa4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
+# PuReMD
 *.log
+*.prs
 *.pot
 *.trj
 
@@ -15,10 +17,13 @@
 *.log
 *.toc
 *.pdf
+_minted-*
 
 # ViM, emacs, nano, leafpad
+tags
 *.swp
 *~
+tags
 
 # Python
 *.pyc
@@ -44,6 +49,7 @@ libtool
 Makefile
 Makefile.in
 stamp-h1
+test-driver
 
 # Compiled languages (C,C++,Fortran,...)
 *.o
@@ -54,7 +60,12 @@ stamp-h1
 *.tab.c
 *.tab.h
 
+# PBS/torque job logs
+*.o[0-9]*
+*.e[0-9]*
+
 # General
 *.tar.gz
 *.pdf
 */bin
+*.txt
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..1129b088fbb36485d7dfa6c5a3f4740bdf9c8e01
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "PG-PuReMD/src/cub"]
+	path = PG-PuReMD/src/cub
+	url = https://github.com/NVlabs/cub.git
+	# no "branch" key: it must name a remote branch, not a commit (entry previously gave 89de7ab20167909bc2c4f8acd397671c47cf3c0d)
diff --git a/Makefile.am b/Makefile.am
index f777f20c02953a750e90ac8e7767e82377d52cff..0baa2db6e9283fd6a0901199102f9f4709914683 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -2,22 +2,27 @@ ACLOCAL_AMFLAGS = -I m4
 
 SUBDIRS =
 DIST_SUBDIRS =
+EXTRA_DIST = data environ tools
 
 if BUILD_S_OMP
 SUBDIRS += sPuReMD
 DIST_SUBDIRS += sPuReMD
 endif
+
 if BUILD_MPI
 SUBDIRS += PuReMD
 DIST_SUBDIRS += PuReMD
 endif
+
 if BUILD_GPU
 SUBDIRS += PuReMD-GPU
 DIST_SUBDIRS += PuReMD-GPU
 endif
+
 if BUILD_MPI_GPU
 SUBDIRS += PG-PuReMD
 DIST_SUBDIRS += PG-PuReMD
+EXTRA_DIST += PG-PuReMD/src/cub
 endif
 
 if BUILD_DOC
@@ -25,7 +30,5 @@ SUBDIRS += doc
 DIST_SUBDIRS += doc
 endif
 
-EXTRA_DIST = data environ tools
-
 dist-hook:
 	rm -rf `find $(distdir) -name .git`
diff --git a/PG-PuReMD/Makefile.am b/PG-PuReMD/Makefile.am
index 706195a0c4e097aeb4fc6a35f3b0b4de186aa808..d16fd2499993a8a324e414f265d14da31057eea0 100644
--- a/PG-PuReMD/Makefile.am
+++ b/PG-PuReMD/Makefile.am
@@ -5,19 +5,13 @@ SUFFIXES = .cu
 include ../cuda.am
 endif
 
-AM_CFLAGS = -Wall -O3 -funroll-loops -fstrict-aliasing $(MPI_CFLAGS)
-AM_CPPFLAGS =
-AM_LDFLAGS = $(MPI_LDFLAGS)
-
 if USE_CUDA
-# default CUDA nvcc flags
-#   Note: cc 13 for Tesla
-#   Note: cc 20 for Fermi
-#   Note: cc 30 for Kepler K10
-#   Note: cc 35 for Kepler K20
-NVCCFLAGS += -use_fast_math 
-NVCCFLAGS += -gencode arch=compute_35,code=sm_35
-NVCCFLAGS += --compiler-options "$(DEFS) -D__SM_35__ -O3 -funroll-loops -fstrict-aliasing $(MPI_CFLAGS)"
+# flags for CUDA compilation via NVCC (see cuda.am)
+#   Note: cc 13 for Tesla, cc 20 for Fermi, cc 30 for Kepler K10, cc 35 for Kepler K20/K40, etc.
+#NVCCFLAGS += -use_fast_math -gencode arch=compute_35,code=sm_35
+NVCCFLAGS += -use_fast_math $(NVCC_OPT_CODE)
+#NVCCFLAGS += --compiler-options "$(DEFS) -D__SM_35__ -O3 -funroll-loops -fstrict-aliasing $(MPI_CFLAGS)"
+NVCCFLAGS += --compiler-options "$(DEFS) $(NVCC_OPT_CODE_DEFS) -O3 -funroll-loops -fstrict-aliasing $(MPI_CFLAGS)"
 #NVCCFLAGS += --ptxas-options -v
 endif
 
@@ -26,57 +20,51 @@ bin_PROGRAMS = bin/pg-puremd
 bin_pg_puremd_SOURCES = src/allocate.c src/basic_comm.c src/ffield.c src/grid.c src/list.c \
 	src/lookup.c src/io_tools.c src/reset_tools.c src/restart.c src/random.c \
 	src/tool_box.c src/traj.c src/analyze.c src/box.c src/system_props.c \
-	src/control.c src/comm_tools.c src/geo_tools.c src/linear_solvers.c src/neighbors.c \
-	src/qEq.c src/bond_orders.c src/multi_body.c src/bonds.c src/valence_angles.c \
+	src/control.c src/comm_tools.c src/geo_tools.c src/lin_alg.c src/neighbors.c \
+	src/charges.c src/bond_orders.c src/multi_body.c src/bonds.c src/valence_angles.c \
 	src/hydrogen_bonds.c src/torsion_angles.c src/nonbonded.c src/forces.c \
 	src/integrate.c src/init_md.c src/parallelreax.c
-include_HEADERS = src/reax_types.h \
+include_HEADERS = src/reax_types.h src/index_utils.h \
         src/allocate.h src/basic_comm.h src/ffield.h src/grid.h src/list.h \
-	src/lookup.h src/io_tools.h src/reset_tools.h src/restart.h src/random.h \
+	src/lookup.h src/io_tools.h src/reset_tools.h src/restart.h src/random.h src/vector.h \
 	src/tool_box.h src/traj.h src/analyze.h src/box.h src/system_props.h \
-	src/control.h src/comm_tools.h src/geo_tools.h src/linear_solvers.h src/neighbors.h \
-	src/qEq.h src/bond_orders.h src/multi_body.h src/bonds.h src/valence_angles.h \
+	src/control.h src/comm_tools.h src/geo_tools.h src/lin_alg.h src/neighbors.h \
+	src/charges.h src/bond_orders.h src/multi_body.h src/bonds.h src/valence_angles.h \
 	src/hydrogen_bonds.h src/torsion_angles.h src/nonbonded.h src/forces.h \
 	src/integrate.h src/init_md.h
-bin_pg_puremd_LDADD = src/vector.o
 
 if USE_CUDA
-bin_pg_puremd_SOURCES += src/cuda_utils.cu src/dev_alloc.cu src/cuda_environment.cu \
-      src/dev_system_props.cu src/reduction.cu src/center_mass.cu \
-      src/cuda_copy.cu src/cuda_reset_tools.cu src/dev_list.cu \
-      src/cuda_neighbors.cu src/cuda_bond_orders.cu src/cuda_bonds.cu \
-      src/cuda_multi_body.cu src/cuda_valence_angles.cu \
-      src/cuda_torsion_angles.cu src/cuda_hydrogen_bonds.cu src/cuda_forces.cu \
-      src/cuda_qEq.cu src/cuda_linear_solvers.cu src/matvec.cu src/dual_matvec.cu \
-      src/cuda_nonbonded.cu src/cuda_integrate.cu src/cuda_post_evolve.cu \
-      src/cuda_init_md.cu src/validation.cu src/cuda_lookup.cu
-include_HEADERS += src/cuda_utils.h src/dev_alloc.h src/cuda_environment.h \
-      src/dev_system_props.h src/reduction.h src/center_mass.h \
-      src/cuda_copy.h src/cuda_reset_tools.h src/dev_list.h \
-      src/cuda_neighbors.h src/cuda_bond_orders.h src/cuda_bonds.h \
-      src/cuda_multi_body.h src/cuda_valence_angles.h \
-      src/cuda_torsion_angles.h src/cuda_hydrogen_bonds.h src/cuda_forces.h \
-      src/cuda_qEq.h src/cuda_linear_solvers.h src/matvec.h src/dual_matvec.h \
-      src/cuda_nonbonded.h src/cuda_integrate.h src/cuda_post_evolve.h \
-      src/cuda_init_md.h src/validation.h src/cuda_lookup.h
+bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cuda/cuda_environment.cu \
+      src/cuda/cuda_system_props.cu src/cuda/cuda_reduction.cu src/cuda/cuda_box.cu src/cuda/cuda_list.cu \
+      src/cuda/cuda_copy.cu src/cuda/cuda_reset_tools.cu src/cuda/cuda_random.cu \
+      src/cuda/cuda_neighbors.cu src/cuda/cuda_bond_orders.cu src/cuda/cuda_bonds.cu \
+      src/cuda/cuda_multi_body.cu src/cuda/cuda_valence_angles.cu \
+      src/cuda/cuda_torsion_angles.cu src/cuda/cuda_hydrogen_bonds.cu src/cuda/cuda_forces.cu \
+      src/cuda/cuda_charges.cu src/cuda/cuda_lin_alg.cu \
+      src/cuda/cuda_nonbonded.cu src/cuda/cuda_integrate.cu src/cuda/cuda_post_evolve.cu \
+      src/cuda/cuda_init_md.cu src/cuda/cuda_validation.cu src/cuda/cuda_lookup.cu
+include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \
+      src/cuda/cuda_utils.h src/cuda/cuda_allocate.h src/cuda/cuda_environment.h \
+      src/cuda/cuda_system_props.h src/cuda/cuda_reduction.h src/cuda/cuda_box.h src/cuda/cuda_list.h \
+      src/cuda/cuda_copy.h src/cuda/cuda_reset_tools.h src/cuda/cuda_random.h src/cuda/cuda_vector.h \
+      src/cuda/cuda_neighbors.h src/cuda/cuda_bond_orders.h src/cuda/cuda_bonds.h \
+      src/cuda/cuda_multi_body.h src/cuda/cuda_valence_angles.h \
+      src/cuda/cuda_torsion_angles.h src/cuda/cuda_hydrogen_bonds.h src/cuda/cuda_forces.h \
+      src/cuda/cuda_charges.h src/cuda/cuda_lin_alg.h \
+      src/cuda/cuda_nonbonded.h src/cuda/cuda_integrate.h src/cuda/cuda_post_evolve.h \
+      src/cuda/cuda_init_md.h src/cuda/cuda_validation.h src/cuda/cuda_lookup.h
 
 # dummy source to cause C linking
 nodist_EXTRA_bin_pg_puremd_SOURCES = src/dummy.c
 
-src_vector.o:
-	$(AM_V_NVCC)$(NVCC) $(NVCCFLAGS) -maxrregcount=$(MAX_REG_COUNT) -o src/vector.o -c src/vector.c
-else
-src_vector.o:
-	$(AM_V_CC)$(CC) $(DEFS) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CCFLAGS) \
-	-maxrregcount=$(MAX_REG_COUNT) -o src/vector.o -c src/vector.c
 endif
 
 
-bin_pg_puremd_CFLAGS = $(AM_CFLAGS) $(CFLAGS)
+bin_pg_puremd_CFLAGS = $(AM_CFLAGS) -Wall -O3 -funroll-loops -fstrict-aliasing $(MPI_CFLAGS) $(CFLAGS)
 bin_pg_puremd_CPPFLAGS = $(AM_CPPFLAGS) $(CPPFLAGS)
-bin_pg_puremd_LDFLAGS = $(AM_LDFLAGS) $(LDFLAGS)
+bin_pg_puremd_LDADD = $(AM_LDADD) $(MPI_LIBS) $(LDADD) -lstdc++
 
 if USE_CUDA
 bin_pg_puremd_CFLAGS += $(CUDA_CFLAGS)
-bin_pg_puremd_LDFLAGS += $(CUDA_LIBS)
+bin_pg_puremd_LDADD += $(CUDA_LIBS)
 endif
diff --git a/PG-PuReMD/aclocal.m4 b/PG-PuReMD/aclocal.m4
index 1e5be5b9d14bb80e27bd6d609493497e00315d8b..06c98b555988b39d51bea5cdf86e49f9a461f742 100644
--- a/PG-PuReMD/aclocal.m4
+++ b/PG-PuReMD/aclocal.m4
@@ -1151,4 +1151,5 @@ AC_SUBST([am__untar])
 ]) # _AM_PROG_TAR
 
 m4_include([../m4/acx_mpi.m4])
+m4_include([../m4/ax_compiler_vendor.m4])
 m4_include([../m4/ax_cuda.m4])
diff --git a/PG-PuReMD/configure.ac b/PG-PuReMD/configure.ac
index 68be6a512fa8fae4b4f5dcc5401885df1b2fe1ae..6e9f3fb5b91f9727f5762fc23bf9d2d400a10ee4 100644
--- a/PG-PuReMD/configure.ac
+++ b/PG-PuReMD/configure.ac
@@ -4,6 +4,8 @@
 AC_PREREQ([2.69])
 
 AC_INIT([PG-PuReMD], [1.0], [ohearnku@msu.edu hma@msu.edu])
+# Do not allow AC_PROG_CC to set CFLAGS (this line must be after AC_INIT but before AC_PROG_CC)
+sav_CFLAGS="$CFLAGS"
 : ${CFLAGS=""}
 AM_INIT_AUTOMAKE([1.15 subdir-objects -Wall -Werror foreign])
 # Enable silent build rules by default.
@@ -13,20 +15,23 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])], [AC_SUBST([AM_DEFAULT_VERB
 
 AC_CONFIG_MACRO_DIR([../m4])
 
-AC_LANG([C])
-
-AC_CONFIG_SRCDIR([src/torsion_angles.h])
-AC_CONFIG_HEADERS([src/config.h])
-
 # Headline formatter
 AC_DEFUN([CONFIGURE_HEADLINE],
 [
         echo; echo "+++ $1 +++"
 ])
 
+AC_LANG([C])
+
 # Checks for programs.
-AC_PROG_CC([icc gcc cc])
+AC_PROG_CC([icc gcc clang cc])
 AC_PROG_CPP
+CFLAGS="$sav_CFLAGS"
+
+AX_COMPILER_VENDOR
+
+AC_CONFIG_SRCDIR([src/torsion_angles.h])
+AC_CONFIG_HEADERS([src/config.h])
 
 # Checks for libraries.
 AC_SEARCH_LIBS([exp], [m])
@@ -51,7 +56,7 @@ AC_CHECK_FUNCS([gettimeofday memset])
 
 # Check for MPI support.
 CONFIGURE_HEADLINE([ MPI compiler ])
-ACX_MPI([], [AC_MSG_ERROR([could not find mpi library])])
+ACX_MPI([], [AC_MSG_ERROR([could not find MPI library])])
 AC_CHECK_PROG(MPIRUN, mpirun, mpirun)
 AC_SUBST(MPIRUN)
 
@@ -62,24 +67,26 @@ CC="$MPICC"
 CFLAGS="$CFLAGS"
 AC_CHECK_DECL([OPEN_MPI], [mpi_vendor="OpenMPI"],
 	     [], [#include "mpi.h"])
-AC_CHECK_DECL([MPICH2], [mpi_vendor="MPICH"],
+# MPICH v2
+AC_CHECK_DECL([MPICH2], [mpi_vendor="MPICH2"],
+	     [], [#include "mpi.h"])
+# MPICH v3
+AC_CHECK_DECL([MPICH_VERSION], [mpi_vendor="MPICH3"],
 	     [], [#include "mpi.h"])
 CC="$sav_CC"
 CFLAGS="$sav_CFLAGS"
 
-#
-# try to set MPI_CFLAGS and MPI_LDFLAGS
-#
+# try to set MPI_CFLAGS and MPI_LIBS
 MPI_CFLAGS=
-MPI_LDFLAGS=
+MPI_LIBS=
 if test "$mpi_vendor" = "OpenMPI"
 then 
 	MPI_CFLAGS=`$MPICC --showme:compile`
-	MPI_LDFLAGS=`$MPICC --showme:link`
+	MPI_LIBS=`$MPICC --showme:link`
 	AC_MSG_NOTICE([OpenMPI found])
 	AC_MSG_NOTICE([MPI_CFLAGS=$MPI_CFLAGS])
-	AC_MSG_NOTICE([MPI_LDFLAGS=$MPI_LDFLAGS])
-elif test "$mpi_vendor" = "MPICH"
+	AC_MSG_NOTICE([MPI_LIBS=$MPI_LIBS])
+elif test "$mpi_vendor" = "MPICH2" || test "$mpi_vendor" = "MPICH3"
 then
 	# build MPI_CFLAGS
 	tmp=`$MPICC -compile-info | awk '{$1=""; print $0 }'`
@@ -92,30 +99,40 @@ then
 				;;
 		esac
 	done
-	# build MPI_LDFLAGS
+	# build MPI_LIBS
 	tmp=`$MPICC -link-info | awk '{$1=""; print $0 }'`
 	for i in $tmp
 	do 
 		case $i in 
 			[[\\/]]*.a | ?:[[\\/]]*.a | -[[lLRu]]* | -Wl* )
-				MPI_LDFLAGS="$MPI_LDFLAGS $i"
+				MPI_LIBS="$MPI_LIBS $i"
 				;;
 		esac
 	done
 	AC_MSG_NOTICE([MPICH found])
 	AC_MSG_NOTICE([MPI_CFLAGS=$MPI_CFLAGS])
-	AC_MSG_NOTICE([MPI_LDFLAGS=$MPI_LDFLAGS])
+	AC_MSG_NOTICE([MPI_LIBS=$MPI_LIBS])
 else
 	AC_MSG_WARN([Neither OpenMPI and MPICH have been recognized...])
 fi
 AC_SUBST(MPI_CFLAGS)
-AC_SUBST(MPI_LDFLAGS)
+AC_SUBST(MPI_LIBS)
+
+AC_LANG([C++])
+
+# Checks for programs.
+AC_PROG_CXX([icpc g++ clang++ c++])
+AC_PROG_CXXCPP
+
+AX_COMPILER_VENDOR
 
 # Check for CUDA support.
 if test "x$BUILD_GPU" = "xyes"; then
 	CONFIGURE_HEADLINE([ CUDA support ])
 	AX_CUDA
+	# NVCC host compiler / debug flags; append by plain assignment ("+=" is a bashism, configure runs under /bin/sh)
         NVCCFLAGS=
+	NVCCFLAGS="$NVCCFLAGS -ccbin=$CXX"
-	if test "BUILD_DEBUG" = "true"
+	if test "x$BUILD_DEBUG" = "xtrue"
 	then
 		NVCCFLAGS+=" -g -G"
diff --git a/PG-PuReMD/src/allocate.c b/PG-PuReMD/src/allocate.c
index 80b87c138311aa8d22b686b9afb261dd93fafc0b..039e65d28d4c5860ef83337be2086c203a8ae628 100644
--- a/PG-PuReMD/src/allocate.c
+++ b/PG-PuReMD/src/allocate.c
@@ -20,59 +20,50 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
-#include "allocate.h"
-#include "list.h"
-#include "reset_tools.h"
-#include "tool_box.h"
-#include "vector.h"
+  #include "allocate.h"
+  #include "list.h"
+  #include "reset_tools.h"
+  #include "tool_box.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_allocate.h"
-#include "reax_list.h"
-#include "reax_reset_tools.h"
-#include "reax_tool_box.h"
-#include "reax_vector.h"
+  #include "reax_allocate.h"
+  #include "reax_list.h"
+  #include "reax_reset_tools.h"
+  #include "reax_tool_box.h"
+  #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 /* allocate space for my_atoms
    important: we cannot know the exact number of atoms that will fall into a
    process's box throughout the whole simulation. therefore
    we need to make upper bound estimates for various data structures */
 int PreAllocate_Space( reax_system *system, control_params *control,
-                       storage *workspace )
+        storage *workspace )
 {
-    int  i;
-
     /* determine capacity based on box vol & est atom volume */
     system->local_cap = MAX( (int)(system->n * SAFE_ZONE), MIN_CAP );
     system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP );
+
 #if defined(DEBUG)||defined(__CUDA_DEBUG_LOG__)
     fprintf( stderr, "p%d: local_cap=%d total_cap=%d\n",
-             system->my_rank, system->local_cap, system->total_cap );
+            system->my_rank, system->local_cap, system->total_cap );
 #endif
 
     system->my_atoms = (reax_atom*)
-                       scalloc(system->total_cap, sizeof(reax_atom), "my_atoms");
+            scalloc( system->total_cap, sizeof(reax_atom), "my_atoms" );
 
     /* space for keeping restriction info, if any */
     if ( control->restrict_bonds )
     {
         workspace->restricted  =
-            (int*) scalloc( system->local_cap, sizeof(int), "restricted_atoms" );
-
-        /*
-         * SUDHIR
-        workspace->restricted_list =
-         (int**) scalloc( system->local_cap, sizeof(int*), "restricted_list" );
-
-        for( i = 0; i < system->local_cap; ++i )
-         workspace->restricted_list[i] =
-        (int*) scalloc( MAX_RESTRICT, sizeof(int), "restricted_list[i]" );
-        */
-        workspace->restricted_list =
-            (int *) scalloc (system->local_cap * MAX_RESTRICT, sizeof (int), "restricted_list" );
+                (int*) scalloc( system->local_cap, sizeof(int), "restricted_atoms" );
+        workspace->restricted_list = (int *)
+                scalloc( system->local_cap * MAX_RESTRICT, sizeof(int), "restricted_list" );
     }
 
     return SUCCESS;
@@ -80,38 +71,11 @@ int PreAllocate_Space( reax_system *system, control_params *control,
 
 
 /*************       system        *************/
-inline void reax_atom_Copy( reax_atom *dest, reax_atom *src )
-{
-    dest->orig_id = src->orig_id;
-    dest->type = src->type;
-    strcpy( dest->name, src->name );
-    rvec_Copy( dest->x, src->x );
-    rvec_Copy( dest->v, src->v );
-    rvec_Copy( dest->f_old, src->f_old );
-    rvec_Copy( dest->s, src->s );
-    rvec_Copy( dest->t, src->t );
-    dest->Hindex = src->Hindex;
-    dest->num_bonds = src->num_bonds;
-    dest->num_hbonds = src->num_hbonds;
-}
-
-
-void Copy_Atom_List( reax_atom *dest, reax_atom *src, int n )
-{
-    int i;
-
-    for ( i = 0; i < n; ++i )
-        memcpy( dest + i, src + i, sizeof(reax_atom) );
-}
-
-
-int Allocate_System( reax_system *system, int local_cap, int total_cap,
-                     char *msg )
+void Allocate_System( reax_system *system, int local_cap, int total_cap,
+        char *msg )
 {
     system->my_atoms = (reax_atom*)
-                       realloc( system->my_atoms, total_cap * sizeof(reax_atom) );
-
-    return SUCCESS;
+            srealloc( system->my_atoms, total_cap * sizeof(reax_atom), "system:my_atoms" );
 }
 
 
@@ -120,10 +84,12 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
 {
     int i;
 
-    if ( !workspace->allocated )
+    if ( workspace->allocated == FALSE )
+    {
         return;
+    }
 
-    workspace->allocated = 0;
+    workspace->allocated = FALSE;
 
     /* communication storage */
     for ( i = 0; i < MAX_NBRS; ++i )
@@ -194,7 +160,7 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
     // sfree( workspace->f_old );
     sfree( workspace->v_const, "v_const" );
 
-    /*workspace->realloc.num_far = -1;
+    /*workspace->realloc.far_nbrs = -1;
       workspace->realloc.Htop = -1;
       workspace->realloc.hbonds = -1;
       workspace->realloc.bonds = -1;
@@ -209,11 +175,14 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
     }
 
     if ( control->diffusion_coef )
+    {
         sfree( workspace->x_old, "x_old" );
+    }
 
     /* force related storage */
     sfree( workspace->f, "f" );
     sfree( workspace->CdDelta, "CdDelta" );
+
 #ifdef TEST_FORCES
     sfree(workspace->dDelta, "dDelta" );
     sfree( workspace->f_ele, "f_ele" );
@@ -236,27 +205,17 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
     sfree( workspace->id_all, "id_all" );
     sfree( workspace->f_all, "f_all" );
 #endif
-
-    /* hbond storage */
-    //sfree( workspace->Hindex, "Hindex" );
-    //sfree( workspace->num_bonds );
-    //sfree( workspace->num_hbonds );
-    //sfree( workspace->hash, "hash" );
-    //sfree( workspace->rev_hash, "rev_hash" );
 }
 
 
-int Allocate_Workspace( reax_system *system, control_params *control,
-                        storage *workspace, int local_cap, int total_cap,
-                        char *msg )
+void Allocate_Workspace( reax_system *system, control_params *control,
+        storage *workspace, int local_cap, int total_cap, char *msg )
 {
-    int i, total_real, total_rvec, local_int, local_real, local_rvec;
+    int i, total_real, total_rvec, local_rvec;
 
-    workspace->allocated = 1;
+    workspace->allocated = TRUE;
     total_real = total_cap * sizeof(real);
     total_rvec = total_cap * sizeof(rvec);
-    local_int = local_cap * sizeof(int);
-    local_real = local_cap * sizeof(real);
     local_rvec = local_cap * sizeof(rvec);
 
     /* communication storage */
@@ -287,9 +246,25 @@ int Allocate_Workspace( reax_system *system, control_params *control,
     workspace->vlpex = (real*) smalloc( total_real, "vlpex" );
     workspace->bond_mark = (int*) scalloc(total_cap, sizeof(int), "bond_mark");
     workspace->done_after = (int*) scalloc(total_cap, sizeof(int), "done_after");
-    // fprintf( stderr, "p%d: bond order storage\n", system->my_rank );
 
-    /* QEq storage */
+    /* charge method storage */
+    switch ( control->charge_method )
+    {
+        case QEQ_CM:
+            system->N_cm = system->N;
+            break;
+        case EE_CM:
+            system->N_cm = system->N + 1;
+            break;
+        case ACKS2_CM:
+            system->N_cm = 2 * system->N + 2;
+            break;
+        default:
+            fprintf( stderr, "[ERROR] Unknown charge method type. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+    }
+
     workspace->Hdia_inv = (real*) scalloc( total_cap, sizeof(real), "Hdia_inv" );
     workspace->b_s = (real*) scalloc( total_cap, sizeof(real), "b_s" );
     workspace->b_t = (real*) scalloc( total_cap, sizeof(real), "b_t" );
@@ -297,39 +272,56 @@ int Allocate_Workspace( reax_system *system, control_params *control,
     workspace->b_prm = (real*) scalloc( total_cap, sizeof(real), "b_prm" );
     workspace->s = (real*) scalloc( total_cap, sizeof(real), "s" );
     workspace->t = (real*) scalloc( total_cap, sizeof(real), "t" );
-    workspace->droptol = (real*) scalloc( total_cap, sizeof(real), "droptol" );
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        workspace->droptol = (real*) scalloc( total_cap, sizeof(real), "droptol" );
+    }
     workspace->b = (rvec2*) scalloc( total_cap, sizeof(rvec2), "b" );
     workspace->x = (rvec2*) scalloc( total_cap, sizeof(rvec2), "x" );
 
-    /* GMRES storage */
-    workspace->y = (real*) scalloc( RESTART + 1, sizeof(real), "y" );
-    workspace->z = (real*) scalloc( RESTART + 1, sizeof(real), "z" );
-    workspace->g = (real*) scalloc( RESTART + 1, sizeof(real), "g" );
-    //SUHDIR
-    //workspace->h = (real**) scalloc( RESTART+1, sizeof(real*), "h" );
-    workspace->h = (real *) scalloc ( (RESTART + 1) * (RESTART + 1), sizeof (real), "h");
-    workspace->hs = (real*) scalloc( RESTART + 1, sizeof(real), "hs" );
-    workspace->hc = (real*) scalloc( RESTART + 1, sizeof(real), "hc" );
-    //SUDHIR
-    //workspace->v = (real**) scalloc( RESTART+1, sizeof(real*), "v" );
-    workspace->v = (real *) scalloc ( (RESTART + 1) * (RESTART + 1), sizeof (real), "v");
-
-    /*
-    for( i = 0; i < RESTART+1; ++i ) {
-      workspace->h[i] = (real*) scalloc( RESTART+1, sizeof(real), "h[i]" );
-      workspace->v[i] = (real*) scalloc( total_cap, sizeof(real), "v[i]" );
+    switch ( control->cm_solver_type )
+    {
+        /* GMRES storage */
+        case GMRES_S:
+        case GMRES_H_S:
+            workspace->y = (real*) scalloc( RESTART + 1, sizeof(real), "y" );
+            workspace->z = (real*) scalloc( RESTART + 1, sizeof(real), "z" );
+            workspace->g = (real*) scalloc( RESTART + 1, sizeof(real), "g" );
+            workspace->h = (real *) scalloc ( (RESTART + 1) * (RESTART + 1), sizeof (real), "h");
+            workspace->hs = (real*) scalloc( RESTART + 1, sizeof(real), "hs" );
+            workspace->hc = (real*) scalloc( RESTART + 1, sizeof(real), "hc" );
+            workspace->v = (real *) scalloc ( (RESTART + 1) * (RESTART + 1), sizeof (real), "v");
+            break;
+
+        /* CG storage */
+        case CG_S:
+            workspace->r = (real*) scalloc( total_cap, sizeof(real), "r" );
+            workspace->d = (real*) scalloc( total_cap, sizeof(real), "d" );
+            workspace->q = (real*) scalloc( total_cap, sizeof(real), "q" );
+            workspace->p = (real*) scalloc( total_cap, sizeof(real), "p" );
+            workspace->r2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "r2" );
+            workspace->d2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "d2" );
+            workspace->q2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "q2" );
+            workspace->p2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "p2" );
+            break;
+
+        case SDM_S:
+            workspace->r = (real*) scalloc( total_cap, sizeof(real), "r" );
+            workspace->d = (real*) scalloc( total_cap, sizeof(real), "d" );
+            workspace->q = (real*) scalloc( total_cap, sizeof(real), "q" );
+            workspace->p = (real*) scalloc( total_cap, sizeof(real), "p" );
+            workspace->r2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "r2" );
+            workspace->d2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "d2" );
+            workspace->q2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "q2" );
+            workspace->p2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "p2" );
+            break;
+
+        default:
+            fprintf( stderr, "Unknown charge method linear solver type. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
     }
-    */
-
-    /* CG storage */
-    workspace->r = (real*) scalloc( total_cap, sizeof(real), "r" );
-    workspace->d = (real*) scalloc( total_cap, sizeof(real), "d" );
-    workspace->q = (real*) scalloc( total_cap, sizeof(real), "q" );
-    workspace->p = (real*) scalloc( total_cap, sizeof(real), "p" );
-    workspace->r2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "r2" );
-    workspace->d2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "d2" );
-    workspace->q2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "q2" );
-    workspace->p2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "p2" );
 
     /* integrator storage */
     workspace->v_const = (rvec*) smalloc( local_rvec, "v_const" );
@@ -341,11 +333,18 @@ int Allocate_Workspace( reax_system *system, control_params *control,
         workspace->old_mark = (int*) scalloc( local_cap, sizeof(int), "old_mark" );
     }
     else
+    {
         workspace->mark = workspace->old_mark = NULL;
+    }
 
     if ( control->diffusion_coef )
+    {
         workspace->x_old = (rvec*) scalloc( local_cap, sizeof(rvec), "x_old" );
-    else workspace->x_old = NULL;
+    }
+    else
+    {
+        workspace->x_old = NULL;
+    }
 
     /* force related storage */
     workspace->f = (rvec*) scalloc( total_cap, sizeof(rvec), "f" );
@@ -370,8 +369,8 @@ int Allocate_Workspace( reax_system *system, control_params *control,
 
     if ( system->my_rank == MASTER_NODE )
     {
-        workspace->rcounts = (int*) smalloc(system->wsize * sizeof(int), "rcount");
-        workspace->displs = (int*) smalloc(system->wsize * sizeof(int), "displs");
+        workspace->rcounts = (int*) smalloc(system->nprocs * sizeof(int), "rcount");
+        workspace->displs = (int*) smalloc(system->nprocs * sizeof(int), "displs");
         workspace->id_all = (int*) smalloc(system->bigN * sizeof(int), "id_all");
         workspace->f_all = (rvec*) smalloc(system->bigN * sizeof(rvec), "f_all");
     }
@@ -383,143 +382,81 @@ int Allocate_Workspace( reax_system *system, control_params *control,
         workspace->f_all = NULL;
     }
 #endif
-
-    return SUCCESS;
 }
 
 
 void Reallocate_Neighbor_List( reax_list *far_nbrs, int n, int num_intrs )
 {
-    Delete_List( far_nbrs);
-
-    if (!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs))
-    {
-        fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
+    Delete_List( far_nbrs );
+    Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs );
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Reallocate_Neighbor_List( reax_list *far_nbrs, int n, int num_intrs )
+void Allocate_Matrix( sparse_matrix *H, int n, int m )
 {
-    Dev_Delete_List( far_nbrs);
-
-    if (!Dev_Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs))
-    {
-        fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
-}
-#endif
-
-
-/*
-int Allocate_Matrix( sparse_matrix **pH, int cap, int m )
-{
-  sparse_matrix *H;
-
-  *pH = (sparse_matrix*) smalloc(sizeof(sparse_matrix), "sparse_matrix");
-  H = *pH;
-  H->cap = cap;
-  H->m = m;
-  H->start = (int*) smalloc(sizeof(int) * cap, "matrix_start");
-  H->end = (int*) smalloc(sizeof(int) * cap, "matrix_end");
-  H->entries = (sparse_matrix_entry*)
-    smalloc(sizeof(sparse_matrix_entry)*m, "matrix_entries");
-
-  return SUCCESS;
-}
-*/
-
-int Allocate_Matrix( sparse_matrix *H, int cap, int m )
-{
-
-   // H = (sparse_matrix*) smalloc(sizeof(sparse_matrix), "sparse_matrix");
-    H->cap = cap;
+    H->n = n;
     H->m = m;
 
-    H->start = (int*) smalloc(sizeof(int) * cap, "matrix_start");
-    H->end = (int*) smalloc(sizeof(int) * cap, "matrix_end");
+    H->start = (int*) smalloc( sizeof(int) * n, "Allocate_Matrix::start" );
+    H->end = (int*) smalloc( sizeof(int) * n, "Allocate_Matrix::end" );
     H->entries = (sparse_matrix_entry*)
-                 smalloc(sizeof(sparse_matrix_entry) * m, "matrix_entries");
-
-    return SUCCESS;
-
+        smalloc( sizeof(sparse_matrix_entry) * m, "Allocate_Matrix::entries" );
 }
 
 
 void Deallocate_Matrix( sparse_matrix *H )
 {
-    sfree(H->start, "H->start");
-    sfree(H->end, "H->end");
-    sfree(H->entries, "H->entries");
-    sfree(H, "H");
+    sfree( H->start, "Deallocate_Matrix::start" );
+    sfree( H->end, "Deallocate_Matrix::end" );
+    sfree( H->entries, "Deallocate_Matrix::entries" );
+    sfree( H, "Deallocate_Matrix::matrix" );
 }
 
-
-int Reallocate_Matrix( sparse_matrix **H, int n, int m, char *name )
+/* Re-create H's start/end (size n) and entries (size m) arrays; frees only the arrays, since Deallocate_Matrix() also frees H itself, which Allocate_Matrix() reuses */
+static void Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
 {
-    Deallocate_Matrix( *H );
-    if ( !Allocate_Matrix( H, n, m ) )
-    {
-        fprintf(stderr, "not enough space for %s matrix. terminating!\n", name);
-        exit( 1 );
-    }
-
+    sfree( H->start, "Reallocate_Matrix::start" );
+    sfree( H->end, "Reallocate_Matrix::end" );
+    sfree( H->entries, "Reallocate_Matrix::entries" );
+    Allocate_Matrix( H, n, m );
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n", name, n, m );
     fprintf( stderr, "memory allocated: %s = %dMB\n",
-             name, (int)(m * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
+            name, (int)(m * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
 #endif
-    return SUCCESS;
 }
 
 
-int Reallocate_HBonds_List( reax_system *system, reax_list *hbonds )
+void Reallocate_HBonds_List( reax_system *system, reax_list *hbonds )
 {
     int i, id, total_hbonds;
 
     total_hbonds = 0;
     for ( i = 0; i < system->n; ++i )
+    {
         if ( (id = system->my_atoms[i].Hindex) >= 0 )
         {
             system->my_atoms[i].num_hbonds = MAX( Num_Entries(id, hbonds) * SAFER_ZONE,
-                                                  MIN_HBONDS );
+                    MIN_HBONDS );
             total_hbonds += system->my_atoms[i].num_hbonds;
         }
+    }
     total_hbonds = MAX( total_hbonds * SAFER_ZONE, MIN_CAP * MIN_HBONDS );
 
-    Delete_List( hbonds);
-    if ( !Make_List( system->Hcap, total_hbonds, TYP_HBOND, hbonds) )
-    {
-        fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
+    Delete_List( hbonds );
 
-    return total_hbonds;
+    Make_List( system->Hcap, total_hbonds, TYP_HBOND, hbonds);
 }
 
-#ifdef HAVE_CUDA
-int Cuda_Reallocate_HBonds_List( int n, int num_intrs, reax_list *hbonds )
-{
-    Dev_Delete_List( hbonds);
-    if ( !Dev_Make_List( n, num_intrs, TYP_HBOND, hbonds) )
-    {
-        fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
-    return SUCCESS;
-}
-#endif
 
-int Reallocate_Bonds_List( reax_system *system, reax_list *bonds,
-                           int *total_bonds, int *est_3body )
+void Reallocate_Bonds_List( reax_system *system, reax_list *bonds,
+        int *total_bonds, int *est_3body )
 {
     int i;
 
     *total_bonds = 0;
     *est_3body = 0;
+
     for ( i = 0; i < system->N; ++i )
     {
         *est_3body += SQR( Num_Entries( i, bonds ) );
@@ -528,29 +465,10 @@ int Reallocate_Bonds_List( reax_system *system, reax_list *bonds,
     }
     *total_bonds = MAX( *total_bonds * SAFE_ZONE, MIN_CAP * MIN_BONDS );
 
-    Delete_List( bonds);
-    if (!Make_List(system->total_cap, *total_bonds, TYP_BOND, bonds))
-    {
-        fprintf( stderr, "not enough space for bonds list. terminating!\n" );
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
-
-    return SUCCESS;
-}
+    Delete_List( bonds );
 
-#ifdef HAVE_CUDA
-int Cuda_Reallocate_Bonds_List( int n, int num_intrs, reax_list *bonds)
-{
-    Dev_Delete_List( bonds);
-    if (!Dev_Make_List(n, num_intrs, TYP_BOND, bonds))
-    {
-        fprintf( stderr, "not enough space for bonds list. terminating!\n" );
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
-
-    return SUCCESS;
+    Make_List( system->total_cap, *total_bonds, TYP_BOND, bonds );
 }
-#endif
 
 
 /*************       grid        *************/
@@ -560,65 +478,69 @@ int Estimate_GCell_Population( reax_system* system, MPI_Comm comm )
     ivec c;
     grid *g;
     grid_cell *gc;
-    simulation_box *big_box, *my_ext_box;
+    simulation_box *my_ext_box;
     reax_atom *atoms;
 
-    big_box    = &(system->big_box);
     my_ext_box = &(system->my_ext_box);
-    g          = &(system->my_grid);
-    atoms      = system->my_atoms;
+    g = &(system->my_grid);
+    atoms = system->my_atoms;
     Reset_Grid( g );
 
     for ( l = 0; l < system->n; l++ )
     {
         for ( d = 0; d < 3; ++d )
         {
-            //if( atoms[l].x[d] < big_box->min[d] )
-            //  atoms[l].x[d] += big_box->box_norms[d];
-            //else if( atoms[l].x[d] >= big_box->max[d] )
-            //  atoms[l].x[d] -= big_box->box_norms[d];
-
             c[d] = (int)((atoms[l].x[d] - my_ext_box->min[d]) * g->inv_len[d]);
 
             if ( c[d] >= g->native_end[d] )
+            {
                 c[d] = g->native_end[d] - 1;
+            }
             else if ( c[d] < g->native_str[d] )
+            {
                 c[d] = g->native_str[d];
+            }
         }
+
 #if defined(DEBUG)
         fprintf( stderr, "p%d bin_my_atoms: l:%d - atom%d @ %.5f %.5f %.5f" \
-                 "--> cell: %d %d %d\n",
-                 system->my_rank, l, atoms[l].orig_id,
-                 atoms[l].x[0], atoms[l].x[1], atoms[l].x[2],
-                 c[0], c[1], c[2] );
+                "--> cell: %d %d %d\n",
+                system->my_rank, l, atoms[l].orig_id,
+                atoms[l].x[0], atoms[l].x[1], atoms[l].x[2],
+                c[0], c[1], c[2] );
 #endif
-        //SUDHIR
-        //gc = &( g->cells[c[0]][c[1]][c[2]] );
-        gc = &( g->cells[ index_grid_3d (c[0], c[1], c[2], g) ] );
+
+        gc = &( g->cells[ index_grid_3d(c[0], c[1], c[2], g) ] );
         gc->top++;
     }
 
     max_atoms = 0;
     for ( i = 0; i < g->ncells[0]; i++ )
+    {
         for ( j = 0; j < g->ncells[1]; j++ )
+        {
             for ( k = 0; k < g->ncells[2]; k++ )
             {
-                //SUDHIR
-                //gc = &(g->cells[i][j][k]);
-                gc = &(g->cells[ index_grid_3d (i, j, k, g) ]);
+                gc = &(g->cells[ index_grid_3d(i, j, k, g) ]);
                 if ( max_atoms < gc->top )
+                {
                     max_atoms = gc->top;
+                }
+
 #if defined(DEBUG)
                 fprintf( stderr, "p%d gc[%d,%d,%d]->top=%d\n",
-                         system->my_rank, i, j, k, gc->top );
+                        system->my_rank, i, j, k, gc->top );
 #endif
             }
+        }
+    }
 
-    my_max = MAX(max_atoms * SAFE_ZONE, MIN_GCELL_POPL);
+    my_max = MAX( max_atoms * SAFE_ZONE, MIN_GCELL_POPL );
     MPI_Allreduce( &my_max, &all_max, 1, MPI_INT, MPI_MAX, comm );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d max_atoms=%d, my_max=%d, all_max=%d\n",
-             system->my_rank, max_atoms, my_max, all_max );
+            system->my_rank, max_atoms, my_max, all_max );
 #endif
 
     return all_max;
@@ -627,92 +549,59 @@ int Estimate_GCell_Population( reax_system* system, MPI_Comm comm )
 
 void Allocate_Grid( reax_system *system, MPI_Comm comm )
 {
-    int i, j, k, l;
+    int i, j, k;
     grid *g;
     grid_cell *gc;
     int total;
 
     g = &( system->my_grid );
-    total = g->ncells [0] * g->ncells[1] * g->ncells[2];
+    total = g->ncells[0] * g->ncells[1] * g->ncells[2];
 
     /* allocate gcell reordering space */
     g->order = (ivec*) scalloc( g->total + 1, sizeof(ivec), "g:order" );
 
     /* allocate the gcells for the new grid */
     g->max_nbrs = (2 * g->vlist_span[0] + 1) * (2 * g->vlist_span[1] + 1) *
-                  (2 * g->vlist_span[2] + 1) + 3;
+            (2 * g->vlist_span[2] + 1) + 3;
 
-    //SUDHIR
-    /*
-    g->cells = (grid_cell***)
-    scalloc(g->ncells[0], sizeof(grid_cell**), "gcells");
-    for( i = 0; i < g->ncells[0]; i++ ) {
-    g->cells[i] = (grid_cell**)
-      scalloc(g->ncells[1], sizeof(grid_cell*),"gcells[i]");
-
-    for( j = 0; j < g->ncells[1]; ++j ) {
-      g->cells[i][j] = (grid_cell*)
-    scalloc(g->ncells[2], sizeof(grid_cell), "gcells[i][j]");
-
-      for( k = 0; k < g->ncells[2]; k++ ) {
-    gc = &(g->cells[i][j][k]);
-    gc->top = gc->mark = gc->str = gc->end = 0;
-    gc->nbrs = (grid_cell**)
-      scalloc(g->max_nbrs, sizeof(grid_cell*), "g:nbrs");
-    gc->nbrs_x = (ivec*) scalloc(g->max_nbrs, sizeof(ivec), "g:nbrs_x");
-    gc->nbrs_cp = (rvec*) scalloc(g->max_nbrs, sizeof(rvec), "g:nbrs_cp");
-    for( l = 0; l < g->max_nbrs; ++l )
-      gc->nbrs[l] = NULL;
-      }
-    }
-    }
-    */
+    g->cells = (grid_cell *) scalloc( total, sizeof(grid_cell), "g:gcell" );
 
-    //SUDHIR
-    g->cells = (grid_cell *)
-               scalloc (g->ncells[0] * g->ncells[1] * g->ncells[2], sizeof (grid_cell), "g:gcell" );
-    for (i = 0; i < g->ncells[0] * g->ncells[1] * g->ncells[2]; i++)
+    for (i = 0; i < total; i++)
     {
-        gc = &( g->cells[ i ]);
-        gc->top = gc->mark = 0;
-        /*
-        gc->str = gc->end = 0;
-        gc->nbrs_x = (ivec*) scalloc(g->max_nbrs, sizeof(ivec), "g:nbrs_x");
-            for (j = 0; j < g->max_nbrs; j++) {
-                gc->nbrs_x[j][0] = -1;
-                gc->nbrs_x[j][1] = -1;
-                gc->nbrs_x[j][2] = -1;
-            }
-        gc->nbrs_cp = (rvec*) scalloc(g->max_nbrs, sizeof(rvec), "g:nbrs_cp");
-        gc->nbrs = (grid_cell **)
-            scalloc(g->max_nbrs, sizeof(grid_cell*), "g:nbrs");
-        for( l = 0; l < g->max_nbrs; ++l )
-            gc->nbrs[l] = NULL;
-            */
+        gc = &( g->cells[ i ] );
+        gc->top = 0;
+        gc->mark = 0;
     }
-    //SUDHIR
-    g->str = (int *) scalloc (total, sizeof (int ), "grid:str");
-    g->end = (int *) scalloc (total, sizeof (int ), "grid:end" );
-    g->cutoff = (real *) scalloc (total, sizeof (real ), "grid:cutoff");
-    g->nbrs_x = (ivec *) scalloc (total * g->max_nbrs, sizeof (ivec ), "grid:nbrs_x");
-    g->nbrs_cp = (rvec *) scalloc (total * g->max_nbrs, sizeof (rvec ), "grid:nbrs_cp");
-    for (i = 0; i < total * g->max_nbrs; i++)
+
+    g->str = (int *) scalloc( total, sizeof(int), "grid:str" );
+    g->end = (int *) scalloc( total, sizeof(int), "grid:end" );
+    g->cutoff = (real *) scalloc( total, sizeof(real), "grid:cutoff" );
+    g->nbrs_x = (ivec *) scalloc( total * g->max_nbrs, sizeof(ivec), "grid:nbrs_x" );
+    g->nbrs_cp = (rvec *) scalloc( total * g->max_nbrs, sizeof(rvec), "grid:nbrs_cp" );
+    for ( i = 0; i < total * g->max_nbrs; i++ )
     {
-        g->nbrs_x [i][0] = g->nbrs_x [i][1] = g->nbrs_x [i][2] =  -1;
+        g->nbrs_x[i][0] = -1;
+        g->nbrs_x[i][1] = -1;
+        g->nbrs_x[i][2] = -1;
     }
-    g->rel_box = (ivec *) scalloc (total, sizeof (ivec), "grid:rel_box");
-
-    //fprintf (stderr, "nbrs_cp allocated : %d \n", g->max_nbrs * total );
+    g->rel_box = (ivec *) scalloc( total, sizeof (ivec), "grid:rel_box" );
 
     /* allocate atom id storage in gcells */
     g->max_atoms = Estimate_GCell_Population( system, comm );
+
     /* space for storing atom id's is required only for native cells */
     for ( i = g->native_str[0]; i < g->native_end[0]; ++i )
+    {
         for ( j = g->native_str[1]; j < g->native_end[1]; ++j )
+        {
             for ( k = g->native_str[2]; k < g->native_end[2]; ++k )
-                //g->cells[i][j][k].atoms = (int*) scalloc(g->max_atoms, sizeof(int), "g:atoms");
-                //SUDHIR
-                g->cells[ index_grid_3d (i, j, k, g) ].atoms = (int*) scalloc(g->max_atoms, sizeof(int), "g:atoms");
+            {
+                g->cells[ index_grid_3d(i, j, k, g) ].atoms =
+                    (int*) scalloc( g->max_atoms, sizeof(int), "g:atoms" );
+            }
+        }
+    }
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d-allocated %dx%dx%d grid: nbrs=%d atoms=%d space=%dMB\n",
              system->my_rank, g->ncells[0], g->ncells[1], g->ncells[2],
@@ -748,18 +637,13 @@ void Deallocate_Grid( grid *g )
         {
             for ( k = 0; k < g->ncells[2]; k++ )
             {
-                //SUDHIR
-                //gc = &(g->cells[i][j][k]);
-                gc = &(g->cells[ index_grid_3d (i, j, k, g)] );
-                //sfree( gc->nbrs, "g:nbrs" );
-                //sfree( gc->nbrs_x, "g:nbrs_x" );
-                //sfree( gc->nbrs_cp, "g:nbrs_cp" );
-                if (gc->atoms != NULL )
+                gc = &(g->cells[ index_grid_3d(i, j, k, g)] );
+                if ( gc->atoms != NULL )
+                {
                     sfree( gc->atoms, "g:atoms" );
+                }
             }
-            //sfree( g->cells[i][j], "g:cells[i][j]" );
         }
-        //sfree( g->cells[i], "g:cells[i]" );
     }
     sfree( g->cells, "g:cells" );
 }
@@ -774,17 +658,17 @@ void Deallocate_Grid( grid *g )
    the largest space by far is required for the 2nd comm operation above.
    buffers are void*, type cast to the correct pointer type to access
    the allocated buffers */
-int  Allocate_MPI_Buffers( mpi_datatypes *mpi_data, int est_recv,
-                           neighbor_proc *my_nbrs, char *msg )
+void Allocate_MPI_Buffers( mpi_datatypes *mpi_data, int est_recv,
+        neighbor_proc *my_nbrs, char *msg )
 {
     int i;
-    mpi_out_data  *mpi_buf;
+    mpi_out_data *mpi_buf;
 
     /* in buffers */
     mpi_data->in1_buffer = (void*)
-                           scalloc( est_recv, sizeof(boundary_atom), "in1_buffer" );
+        scalloc( est_recv, sizeof(boundary_atom), "in1_buffer" );
     mpi_data->in2_buffer = (void*)
-                           scalloc( est_recv, sizeof(boundary_atom), "in2_buffer" );
+        scalloc( est_recv, sizeof(boundary_atom), "in2_buffer" );
 
     /* out buffers */
     for ( i = 0; i < MAX_NBRS; ++i )
@@ -792,12 +676,10 @@ int  Allocate_MPI_Buffers( mpi_datatypes *mpi_data, int est_recv,
         mpi_buf = &( mpi_data->out_buffers[i] );
         /* allocate storage for the neighbor processor i */
         mpi_buf->index = (int*)
-                         scalloc( my_nbrs[i].est_send, sizeof(int), "mpibuf:index" );
+            scalloc( my_nbrs[i].est_send, sizeof(int), "mpibuf:index" );
         mpi_buf->out_atoms = (void*)
-                             scalloc(my_nbrs[i].est_send, sizeof(boundary_atom), "mpibuf:out_atoms");
+            scalloc( my_nbrs[i].est_send, sizeof(boundary_atom), "mpibuf:out_atoms" );
     }
-
-    return SUCCESS;
 }
 
 
@@ -819,11 +701,11 @@ void Deallocate_MPI_Buffers( mpi_datatypes *mpi_data )
 
 
 void ReAllocate( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace, reax_list **lists,
-                 mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace, reax_list **lists,
+        mpi_datatypes *mpi_data )
 {
     int i, j, k, p;
-    int num_bonds, est_3body, nflag, Nflag, Hflag, mpi_flag, ret, total_send;
+    int num_bonds, est_3body, nflag, Nflag, Hflag, mpi_flag, total_send;
     int renbr;
     reallocate_data *realloc;
     reax_list *far_nbrs;
@@ -835,23 +717,24 @@ void ReAllocate( reax_system *system, control_params *control,
 
     realloc = &(workspace->realloc);
     g = &(system->my_grid);
+    H = &workspace->H;
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d@reallocate: n: %d, N: %d, numH: %d\n",
-             system->my_rank, system->n, system->N, system->numH );
+            system->my_rank, system->n, system->N, system->numH );
     fprintf( stderr, "p%d@reallocate: local_cap: %d, total_cap: %d, Hcap: %d\n",
-             system->my_rank, system->local_cap, system->total_cap,
-             system->Hcap);
-    fprintf( stderr, "p%d: realloc.num_far: %d\n",
-             system->my_rank, realloc->num_far );
+            system->my_rank, system->local_cap, system->total_cap,
+            system->Hcap);
+    fprintf( stderr, "p%d: realloc.far_nbrs: %d\n",
+            system->my_rank, realloc->far_nbrs );
     fprintf( stderr, "p%d: realloc.H: %d, realloc.Htop: %d\n",
-             system->my_rank, realloc->H, realloc->Htop );
+            system->my_rank, realloc->H, realloc->Htop );
     fprintf( stderr, "p%d: realloc.Hbonds: %d, realloc.num_hbonds: %d\n",
-             system->my_rank, realloc->hbonds, realloc->num_hbonds );
+            system->my_rank, realloc->hbonds, realloc->num_hbonds );
     fprintf( stderr, "p%d: realloc.bonds: %d, num_bonds: %d\n",
-             system->my_rank, realloc->bonds, realloc->num_bonds );
+            system->my_rank, realloc->bonds, realloc->num_bonds );
     fprintf( stderr, "p%d: realloc.num_3body: %d\n",
-             system->my_rank, realloc->num_3body );
+            system->my_rank, realloc->num_3body );
 #endif
 
     // IMPORTANT: LOOSE ZONES CHECKS ARE DISABLED FOR NOW BY &&'ing with 0!!!
@@ -876,91 +759,78 @@ void ReAllocate( reax_system *system, control_params *control,
         /* system */
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: reallocating system and workspace -"\
-                 "n=%d  N=%d  local_cap=%d  total_cap=%d\n",
-                 system->my_rank, system->n, system->N,
-                 system->local_cap, system->total_cap );
+                "n=%d  N=%d  local_cap=%d  total_cap=%d\n",
+                system->my_rank, system->n, system->N,
+                system->local_cap, system->total_cap );
 #endif
-        ret = Allocate_System( system, system->local_cap, system->total_cap, msg );
-        if ( ret != SUCCESS )
-        {
-            fprintf( stderr, "not enough space for atom_list: total_cap=%d",
-                     system->total_cap );
-            fprintf( stderr, "terminating...\n" );
-            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        }
+
+        Allocate_System( system, system->local_cap, system->total_cap, msg );
 
         /* workspace */
         DeAllocate_Workspace( control, workspace );
-        ret = Allocate_Workspace( system, control, workspace, system->local_cap,
-                                  system->total_cap, msg );
-        if ( ret != SUCCESS )
-        {
-            fprintf( stderr, "no space for workspace: local_cap=%d total_cap=%d",
-                     system->local_cap, system->total_cap );
-            fprintf( stderr, "terminating...\n" );
-            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        }
+        Allocate_Workspace( system, control, workspace, system->local_cap,
+                system->total_cap, msg );
     }
 
-
     renbr = (data->step - data->prev_steps) % control->reneighbor == 0;
     /* far neighbors */
     if ( renbr )
     {
         far_nbrs = *lists + FAR_NBRS;
 
-        if ( Nflag || realloc->num_far >= far_nbrs->num_intrs * DANGER_ZONE )
+        if ( Nflag || realloc->far_nbrs >= far_nbrs->num_intrs * DANGER_ZONE )
         {
-            if ( realloc->num_far > far_nbrs->num_intrs )
-            {
-                fprintf( stderr, "step%d-ran out of space on far_nbrs: top=%d, max=%d",
-                         data->step, realloc->num_far, far_nbrs->num_intrs );
-                MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-            }
+//            if ( realloc->far_nbrs > far_nbrs->num_intrs )
+//            {
+//                fprintf( stderr, "[ERROR] step%d-ran out of space on far_nbrs: top=%d, max=%d",
+//                         data->step, realloc->far_nbrs, far_nbrs->num_intrs );
+//                MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
+//            }
+
 #if defined(DEBUG_FOCUS)
-            fprintf( stderr, "p%d: reallocating far_nbrs: num_fars=%d, space=%dMB\n",
-                     system->my_rank, (int)(realloc->num_far * SAFE_ZONE),
-                     (int)(realloc->num_far * SAFE_ZONE * sizeof(far_neighbor_data) /
+            fprintf( stderr, "p%d: reallocating far_nbrs: far_nbrs=%d, space=%dMB\n",
+                     system->my_rank, (int)(realloc->far_nbrs * SAFE_ZONE),
+                     (int)(realloc->far_nbrs * SAFE_ZONE * sizeof(far_neighbor_data) /
                            (1024 * 1024)) );
 #endif
-            Reallocate_Neighbor_List( far_nbrs, system->total_cap,
-                                      realloc->num_far * SAFE_ZONE );
-            realloc->num_far = 0;
+
+            Reallocate_Neighbor_List( far_nbrs, system->total_cap, realloc->far_nbrs * SAFE_ZONE );
+            realloc->far_nbrs = FALSE;
         }
     }
 
-
-    /* qeq coef matrix */
-    //MATRIX CHANGES
-    H = &workspace->H;
+    /* charge coef matrix */
     if ( nflag || realloc->Htop >= H->m * DANGER_ZONE )
     {
-        if ( realloc->Htop > H->m )
-        {
-            fprintf( stderr,
-                     "step%d - ran out of space on H matrix: Htop=%d, max = %d",
-                     data->step, realloc->Htop, H->m );
-            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        }
+//        if ( realloc->Htop > H->m )
+//        {
+//            fprintf( stderr,
+//                     "step%d - ran out of space on H matrix: Htop=%d, max = %d",
+//                     data->step, realloc->Htop, H->m );
+//            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
+//        }
+
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: reallocating H matrix: Htop=%d, space=%dMB\n",
-                 system->my_rank, (int)(realloc->Htop * SAFE_ZONE),
-                 (int)(realloc->Htop * SAFE_ZONE * sizeof(sparse_matrix_entry) /
-                       (1024 * 1024)) );
+                system->my_rank, (int)(realloc->Htop * SAFE_ZONE),
+                (int)(realloc->Htop * SAFE_ZONE * sizeof(sparse_matrix_entry) /
+                (1024 * 1024)) );
 #endif
-        Reallocate_Matrix( &(workspace->H),
-                           system->local_cap, realloc->Htop * SAFE_ZONE, "H" );
+
+        Reallocate_Matrix( H, system->local_cap,
+                realloc->Htop * SAFE_ZONE, "H" );
         //Deallocate_Matrix( workspace->L );
         //Deallocate_Matrix( workspace->U );
 
         //MATRIX-CHANGES
         //workspace->L = NULL;
         //workspace->U = NULL;
+
         realloc->Htop = 0;
     }
 
     /* hydrogen bonds list */
-    if ( control->hbond_cut > 0 )
+    if ( control->hbond_cut > 0.0 )
     {
         Hflag = 0;
         if ( system->numH >= DANGER_ZONE * system->Hcap ||
@@ -974,12 +844,14 @@ void ReAllocate( reax_system *system, control_params *control,
 
         if ( Hflag || realloc->hbonds )
         {
-            ret = Reallocate_HBonds_List( system, (*lists) + HBONDS );
+            Reallocate_HBonds_List( system, (*lists) + HBONDS );
             realloc->hbonds = 0;
+
 #if defined(DEBUG_FOCUS)
-            fprintf(stderr, "p%d: reallocating hbonds: total_hbonds=%d space=%dMB\n",
-                    system->my_rank, ret, (int)(ret * sizeof(hbond_data) / (1024 * 1024)));
+            fprintf( stderr, "p%d: reallocating hbonds: total_hbonds=%d space=%dMB\n",
+                    system->my_rank, ((*lists) + HBONDS)->num_intrs, /* ret is gone; num_intrs presumably holds the new total -- confirm */
+                    (int)(((*lists) + HBONDS)->num_intrs * sizeof(hbond_data) / (1024 * 1024)) );
 #endif
         }
     }
 
@@ -990,6 +862,7 @@ void ReAllocate( reax_system *system, control_params *control,
         Reallocate_Bonds_List( system, (*lists) + BONDS, &num_bonds, &est_3body );
         realloc->bonds = 0;
         realloc->num_3body = MAX( realloc->num_3body, est_3body );
+
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: reallocating bonds: total_bonds=%d, space=%dMB\n",
                  system->my_rank, num_bonds,
@@ -1000,385 +873,47 @@ void ReAllocate( reax_system *system, control_params *control,
     /* 3-body list */
     if ( realloc->num_3body > 0 )
     {
+
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: reallocating 3body list: num_3body=%d, space=%dMB\n",
-                 system->my_rank, realloc->num_3body,
-                 (int)(realloc->num_3body * sizeof(three_body_interaction_data) /
-                       (1024 * 1024)) );
+                system->my_rank, realloc->num_3body,
+                (int)(realloc->num_3body * sizeof(three_body_interaction_data) /
+                (1024 * 1024)) );
 #endif
-        Delete_List( (*lists) + THREE_BODIES);
+
+        Delete_List( (*lists) + THREE_BODIES );
 
         if ( num_bonds == -1 )
+        {
             num_bonds = ((*lists) + BONDS)->num_intrs;
+        }
 
         realloc->num_3body = MAX( realloc->num_3body * SAFE_ZONE, MIN_3BODIES );
 
-        if ( !Make_List( num_bonds, realloc->num_3body,
-                         TYP_THREE_BODY, (*lists) + THREE_BODIES) )
-        {
-            fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-            MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-        }
+        Make_List( num_bonds, realloc->num_3body, TYP_THREE_BODY, (*lists) + THREE_BODIES );
         realloc->num_3body = -1;
     }
 
     /* grid */
     if ( renbr && realloc->gcell_atoms > -1 )
     {
 #if defined(DEBUG_FOCUS)
-        fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
+        fprintf( stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms );
 #endif
+
         for ( i = g->native_str[0]; i < g->native_end[0]; i++ )
+        {
             for ( j = g->native_str[1]; j < g->native_end[1]; j++ )
+            {
                 for ( k = g->native_str[2]; k < g->native_end[2]; k++ )
                 {
-                    // reallocate g->atoms
-                    // SUDHIR
-                    //sfree( g->cells[i][j][k].atoms, "g:atoms" );
-                    sfree( g->cells[ index_grid_3d (i, j, k, g) ].atoms, "g:atoms" );
-                    g->cells[ index_grid_3d (i, j, k, g) ].atoms = (int*)
-                            scalloc(realloc->gcell_atoms, sizeof(int), "g:atoms");
+                    /* reallocate g->atoms */
+                    sfree( g->cells[ index_grid_3d(i, j, k, g) ].atoms, "g:atoms" );
+                    g->cells[ index_grid_3d(i, j, k, g) ].atoms = (int*)
+                            scalloc( realloc->gcell_atoms, sizeof(int), "g:atoms" );
                 }
-        realloc->gcell_atoms = -1;
-    }
-
-    /* mpi buffers */
-    // we have to be at a renbring step -
-    // to ensure correct values at mpi_buffers for update_boundary_positions
-    if ( !renbr )
-        mpi_flag = 0;
-    // check whether in_buffer capacity is enough
-    else if ( system->max_recved >= system->est_recv * 0.90 )
-        mpi_flag = 1;
-    else
-    {
-        // otherwise check individual outgoing buffers
-        mpi_flag = 0;
-        for ( p = 0; p < MAX_NBRS; ++p )
-        {
-            nbr_pr   = &( system->my_nbrs[p] );
-            nbr_data = &( mpi_data->out_buffers[p] );
-            if ( nbr_data->cnt >= nbr_pr->est_send * 0.90 )
-            {
-                mpi_flag = 1;
-                break;
-            }
-        }
-    }
-
-    if ( mpi_flag )
-    {
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: reallocating mpi_buf: old_recv=%d\n",
-                 system->my_rank, system->est_recv );
-        for ( p = 0; p < MAX_NBRS; ++p )
-            fprintf( stderr, "p%d: nbr%d old_send=%d\n",
-                     system->my_rank, p, system->my_nbrs[p].est_send );
-#endif
-        /* update mpi buffer estimates based on last comm */
-        system->est_recv = MAX( system->max_recved * SAFER_ZONE, MIN_SEND );
-        system->est_trans =
-            (system->est_recv * sizeof(boundary_atom)) / sizeof(mpi_atom);
-        total_send = 0;
-        for ( p = 0; p < MAX_NBRS; ++p )
-        {
-            nbr_pr   = &( system->my_nbrs[p] );
-            nbr_data = &( mpi_data->out_buffers[p] );
-            nbr_pr->est_send = MAX( nbr_data->cnt * SAFER_ZONE, MIN_SEND );
-            total_send += nbr_pr->est_send;
-        }
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: reallocating mpi_buf: recv=%d send=%d total=%dMB\n",
-                 system->my_rank, system->est_recv, total_send,
-                 (int)((system->est_recv + total_send)*sizeof(boundary_atom) /
-                       (1024 * 1024)));
-        for ( p = 0; p < MAX_NBRS; ++p )
-            fprintf( stderr, "p%d: nbr%d new_send=%d\n",
-                     system->my_rank, p, system->my_nbrs[p].est_send );
-#endif
-
-        /* reallocate mpi buffers */
-        Deallocate_MPI_Buffers( mpi_data );
-        ret = Allocate_MPI_Buffers( mpi_data, system->est_recv,
-                                    system->my_nbrs, msg );
-        if ( ret != SUCCESS )
-        {
-            fprintf( stderr, "%s", msg );
-            fprintf( stderr, "terminating...\n" );
-            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        }
-    }
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d: reallocate done\n",
-             system->my_rank, data->step );
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-}
-
-#ifdef HAVE_CUDA
-void Cuda_ReAllocate( reax_system *system, control_params *control,
-                      simulation_data *data, storage *workspace, reax_list **lists,
-                      mpi_datatypes *mpi_data )
-{
-    int i, j, k, p;
-    int num_bonds, est_3body, nflag, Nflag, Hflag, mpi_flag, ret, total_send;
-    int renbr;
-    reallocate_data *realloc;
-    reax_list *far_nbrs;
-    sparse_matrix *H;
-    grid *g;
-    neighbor_proc *nbr_pr;
-    mpi_out_data *nbr_data;
-    char msg[200];
-
-    realloc = &(dev_workspace->realloc);
-    g = &(system->my_grid);
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d@reallocate: n: %d, N: %d, numH: %d\n",
-             system->my_rank, system->n, system->N, system->numH );
-    fprintf( stderr, "p%d@reallocate: local_cap: %d, total_cap: %d, Hcap: %d\n",
-             system->my_rank, system->local_cap, system->total_cap,
-             system->Hcap);
-    fprintf( stderr, "p%d: realloc.num_far: %d\n",
-             system->my_rank, realloc->num_far );
-    fprintf( stderr, "p%d: realloc.H: %d, realloc.Htop: %d\n",
-             system->my_rank, realloc->H, realloc->Htop );
-    fprintf( stderr, "p%d: realloc.Hbonds: %d, realloc.num_hbonds: %d\n",
-             system->my_rank, realloc->hbonds, realloc->num_hbonds );
-    fprintf( stderr, "p%d: realloc.bonds: %d, num_bonds: %d\n",
-             system->my_rank, realloc->bonds, realloc->num_bonds );
-    fprintf( stderr, "p%d: realloc.num_3body: %d\n",
-             system->my_rank, realloc->num_3body );
-#endif
-
-    // IMPORTANT: LOOSE ZONES CHECKS ARE DISABLED FOR NOW BY &&'ing with 0!!!
-    nflag = 0;
-    if ( system->n >= DANGER_ZONE * system->local_cap ||
-            (0 && system->n <= LOOSE_ZONE * system->local_cap) )
-    {
-        nflag = 1;
-        system->local_cap = (int)(system->n * SAFE_ZONE);
-    }
-
-    Nflag = 0;
-    if ( system->N >= DANGER_ZONE * system->total_cap ||
-            (0 && system->N <= LOOSE_ZONE * system->total_cap) )
-    {
-        Nflag = 1;
-        system->total_cap = (int)(system->N * SAFE_ZONE);
-    }
-
-    if ( Nflag )
-    {
-        /* system */
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: reallocating system and workspace -"\
-                 "n=%d  N=%d  local_cap=%d  total_cap=%d\n",
-                 system->my_rank, system->n, system->N,
-                 system->local_cap, system->total_cap );
-#endif
-        fprintf (stderr, "p:%d -  *** Allocating System *** \n", system->my_rank);
-        //MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-
-        ret = dev_realloc_system ( system, system->local_cap, system->total_cap, msg );
-        if ( ret != SUCCESS )
-        {
-            fprintf( stderr, "not enough space for atom_list: total_cap=%d",
-                     system->total_cap );
-            fprintf( stderr, "terminating...\n" );
-            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        }
-
-        /* workspace */
-        dev_dealloc_workspace( control, workspace );
-        ret = dev_alloc_workspace ( system, control, workspace, system->local_cap,
-                                    system->total_cap, msg );
-        if ( ret != SUCCESS )
-        {
-            fprintf( stderr, "no space for workspace: local_cap=%d total_cap=%d",
-                     system->local_cap, system->total_cap );
-            fprintf( stderr, "terminating...\n" );
-            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        }
-    }
-
-
-    renbr = (data->step - data->prev_steps) % control->reneighbor == 0;
-    /* far neighbors */
-    if ( renbr )
-    {
-        far_nbrs = *dev_lists + FAR_NBRS;
-
-        if ( Nflag || realloc->num_far >= far_nbrs->num_intrs * DANGER_ZONE )
-        {
-            if ( realloc->num_far > far_nbrs->num_intrs )
-            {
-                fprintf( stderr, "step%d-ran out of space on far_nbrs: top=%d, max=%d",
-                         data->step, realloc->num_far, far_nbrs->num_intrs );
-                MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
             }
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "p%d: reallocating far_nbrs: num_fars=%d, space=%dMB\n",
-                     system->my_rank, (int)(realloc->num_far * SAFE_ZONE),
-                     (int)(realloc->num_far * SAFE_ZONE * sizeof(far_neighbor_data) /
-                           (1024 * 1024)) );
-#endif
-
-            fprintf (stderr, "p:%d - *** Reallocating Far Nbrs *** \n", system->my_rank);
-            //MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-
-            //No Need to reindex neighbors - this is taken care in the integrate function
-            Cuda_Reallocate_Neighbor_List( far_nbrs, system->total_cap, realloc->num_far * SAFER_ZONE );
-            realloc->num_far = 0;
-        }
-    }
-
-
-    /* qeq coef matrix */
-    //MATRIX CHANGES
-    H = &dev_workspace->H;
-    //if( nflag || realloc->Htop >= system->max_sparse_entries * DANGER_ZONE ) {
-    //if( realloc->Htop > system->max_sparse_entries ) {
-    if ( nflag || realloc->Htop * DANGER_ZONE >= system->max_sparse_entries )
-    {
-        if ( system->max_sparse_entries > realloc->Htop)
-        {
-            fprintf( stderr,
-                     "step%d - ran out of space on H matrix: Htop=%d, max = %d",
-                     data->step, realloc->Htop, system->max_sparse_entries );
-            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        }
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: reallocating H matrix: Htop=%d, space=%dMB\n",
-                 system->my_rank, (int)(realloc->Htop * SAFE_ZONE),
-                 (int)(realloc->Htop * SAFE_ZONE * sizeof(sparse_matrix_entry) /
-                       (1024 * 1024)) );
-#endif
-        //Reallocate_Matrix( &(workspace->H),
-        //     system->local_cap, realloc->Htop*SAFE_ZONE, "H" );
-        fprintf (stderr, "p:%d - *** Reallocating Sparse Matrix *** \n", system->my_rank);
-        //MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-
-        //TODO - CARVER FIX
-        //TODO - CARVER FIX
-        //TODO - CARVER FIX
-        //TODO - CARVER FIX
-        //TODO - CARVER FIX
-
-        //MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        dev_dealloc_matrix (&dev_workspace->H);
-        //dev_alloc_matrix ( &(dev_workspace->H), system->total_cap,
-        //     system->N * system->max_sparse_entries);
-
-        //TODO -- MOVER THIS TO CARVER
-        //TODO -- MOVER THIS TO CARVER
-        //TODO -- MOVER THIS TO CARVER
-        system->max_sparse_entries = realloc->Htop * SAFE_ZONE;
-        //TODO -- MOVER THIS TO CARVER
-        //TODO -- MOVER THIS TO CARVER
-        //TODO -- MOVER THIS TO CARVER
-        dev_alloc_matrix ( &(dev_workspace->H), system->total_cap,
-                           system->total_cap * system->max_sparse_entries);
-
-        //TODO - CARVER FIX
-        //TODO - CARVER FIX
-        //TODO - CARVER FIX
-        //TODO - CARVER FIX
-
-        //Deallocate_Matrix( workspace->L );
-        //Deallocate_Matrix( workspace->U );
-
-        //MATRIX-CHANGES
-        //workspace->L = NULL;
-        //workspace->U = NULL;
-        realloc->Htop = 0;
-    }
-
-    /* hydrogen bonds list */
-    // FIX - 4 - Added additional check here for hydrogen Bond fix
-    if (( control->hbond_cut > 0 )  && (system->numH > 0))
-    {
-
-        if ( Nflag || realloc->hbonds)
-        {
-
-            fprintf (stderr, "p:%d - *** Reallocating Hbonds *** Step:%d\n", system->my_rank, data->step);
-            //MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-
-            ret = Cuda_Reallocate_HBonds_List( system->total_cap, realloc->num_hbonds, (*lists) + HBONDS );
-            realloc->hbonds = 0;
-#if defined(DEBUG_FOCUS)
-            fprintf(stderr, "p%d: reallocating hbonds: total_hbonds=%d space=%dMB\n",
-                    system->my_rank, ret, (int)(ret * sizeof(hbond_data) / (1024 * 1024)));
-#endif
         }
-    }
-
-    /* bonds list */
-    num_bonds = est_3body = -1;
-    if ( Nflag || realloc->bonds )
-    {
-
-        fprintf (stderr, "p:%d - *** Reallocating Bonds *** Step:%d \n", system->my_rank, data->step);
-        //MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-
-        Cuda_Reallocate_Bonds_List( system->total_cap, realloc->num_bonds, (*lists) + BONDS);
-        realloc->bonds = 0;
-
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: reallocating bonds: total_bonds=%d, space=%dMB\n",
-                 system->my_rank, num_bonds,
-                 (int)(num_bonds * sizeof(bond_data) / (1024 * 1024)) );
-#endif
-    }
-
-//  /* 3-body list */
-//  if( realloc->num_3body > 0 ) {
-//  #if defined(DEBUG_FOCUS)
-//    fprintf( stderr, "p%d: reallocating 3body list: num_3body=%d, space=%dMB\n",
-//       system->my_rank, realloc->num_3body,
-//       (int)(realloc->num_3body * sizeof(three_body_interaction_data) /
-//         (1024*1024)) );
-//  #endif
-
-//    if( num_bonds == -1 )
-//      num_bonds = ((*lists)+BONDS)->num_intrs;
-
-//    realloc->num_3body = MAX( realloc->num_3body*SAFE_ZONE, MIN_3BODIES );
-//   fprintf (stderr, "p:%d - *** Reallocating Three Body *** \n", system->my_rank);
-//      MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-
-//   realloc->num_3body = -1;
-//  }
-
-    /* grid */
-    if ( renbr && realloc->gcell_atoms > -1 )
-    {
-        fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
-#if defined(DEBUG_FOCUS)
-        fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
-#endif
-        for ( i = g->native_str[0]; i < g->native_end[0]; i++ )
-            for ( j = g->native_str[1]; j < g->native_end[1]; j++ )
-                for ( k = g->native_str[2]; k < g->native_end[2]; k++ )
-                {
-                    // reallocate g->atoms
-                    // SUDHIR
-                    //sfree( g->cells[i][j][k].atoms, "g:atoms" );
-                    sfree( g->cells[ index_grid_3d (i, j, k, g) ].atoms, "g:atoms" );
-                    g->cells[ index_grid_3d (i, j, k, g) ].atoms = (int*)
-                            scalloc(realloc->gcell_atoms, sizeof(int), "g:atoms");
-                }
-        //TODO
-        //do the same thing for the device here.
-        fprintf (stderr, "p:%d - *** Reallocating Grid Cell Atoms *** Step:%d\n", system->my_rank, data->step);
-        //MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-
-        //FIX - 1 - Tested the reallocation logic
-        //dev_dealloc_grid_cell_atoms (system);
-        //dev_alloc_grid_cell_atoms (system, realloc->gcell_atoms);
         realloc->gcell_atoms = -1;
     }
 
@@ -1386,35 +921,42 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
     // we have to be at a renbring step -
     // to ensure correct values at mpi_buffers for update_boundary_positions
     if ( !renbr )
-        mpi_flag = 0;
+    {
+        mpi_flag = FALSE;
+    }
     // check whether in_buffer capacity is enough
     else if ( system->max_recved >= system->est_recv * 0.90 )
-        mpi_flag = 1;
+    {
+        mpi_flag = TRUE;
+    }
     else
     {
         // otherwise check individual outgoing buffers
-        mpi_flag = 0;
+        mpi_flag = FALSE;
         for ( p = 0; p < MAX_NBRS; ++p )
         {
-            nbr_pr   = &( system->my_nbrs[p] );
+            nbr_pr = &( system->my_nbrs[p] );
             nbr_data = &( mpi_data->out_buffers[p] );
             if ( nbr_data->cnt >= nbr_pr->est_send * 0.90 )
             {
-                mpi_flag = 1;
+                mpi_flag = TRUE;
                 break;
             }
         }
     }
 
-    if ( mpi_flag )
+    if ( mpi_flag == TRUE )
     {
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: reallocating mpi_buf: old_recv=%d\n",
-                 system->my_rank, system->est_recv );
+                system->my_rank, system->est_recv );
         for ( p = 0; p < MAX_NBRS; ++p )
+        {
             fprintf( stderr, "p%d: nbr%d old_send=%d\n",
-                     system->my_rank, p, system->my_nbrs[p].est_send );
+                    system->my_rank, p, system->my_nbrs[p].est_send );
+        }
 #endif
+
         /* update mpi buffer estimates based on last comm */
         system->est_recv = MAX( system->max_recved * SAFER_ZONE, MIN_SEND );
         system->est_trans =
@@ -1422,37 +964,32 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
         total_send = 0;
         for ( p = 0; p < MAX_NBRS; ++p )
         {
-            nbr_pr   = &( system->my_nbrs[p] );
+            nbr_pr = &( system->my_nbrs[p] );
             nbr_data = &( mpi_data->out_buffers[p] );
             nbr_pr->est_send = MAX( nbr_data->cnt * SAFER_ZONE, MIN_SEND );
             total_send += nbr_pr->est_send;
         }
+
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: reallocating mpi_buf: recv=%d send=%d total=%dMB\n",
-                 system->my_rank, system->est_recv, total_send,
-                 (int)((system->est_recv + total_send)*sizeof(boundary_atom) /
-                       (1024 * 1024)));
+                system->my_rank, system->est_recv, total_send,
+                (int)((system->est_recv + total_send)*sizeof(boundary_atom) /
+                    (1024 * 1024)));
         for ( p = 0; p < MAX_NBRS; ++p )
+        {
             fprintf( stderr, "p%d: nbr%d new_send=%d\n",
-                     system->my_rank, p, system->my_nbrs[p].est_send );
+                    system->my_rank, p, system->my_nbrs[p].est_send );
+        }
 #endif
 
         /* reallocate mpi buffers */
         Deallocate_MPI_Buffers( mpi_data );
-        ret = Allocate_MPI_Buffers( mpi_data, system->est_recv,
-                                    system->my_nbrs, msg );
-        if ( ret != SUCCESS )
-        {
-            fprintf( stderr, "%s", msg );
-            fprintf( stderr, "terminating...\n" );
-            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        }
+        Allocate_MPI_Buffers( mpi_data, system->est_recv, system->my_nbrs, msg );
     }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d @ step%d: reallocate done\n",
-             system->my_rank, data->step );
+            system->my_rank, data->step );
     MPI_Barrier( MPI_COMM_WORLD );
 #endif
 }
-#endif
diff --git a/PG-PuReMD/src/allocate.h b/PG-PuReMD/src/allocate.h
index 2b81c315a66d7c6788f78a37967664a7f5967a76..a28764532a4ca6a6951705aed4af999f93483e02 100644
--- a/PG-PuReMD/src/allocate.h
+++ b/PG-PuReMD/src/allocate.h
@@ -23,29 +23,39 @@
 #define __ALLOCATE_H_
 
 #include "reax_types.h"
+
+
+#ifdef __cplusplus
+extern "C" { /* give the declarations below C linkage when included from C++ */
+#endif
+
 int PreAllocate_Space( reax_system*, control_params*, storage* );
 
-void reax_atom_Copy( reax_atom*, reax_atom* );
-int  Allocate_System( reax_system*, int, int, char* );
+void Allocate_System( reax_system*, int, int, char* );
 
-int  Allocate_Workspace( reax_system*, control_params*, storage*,
-                         int, int, char* );
+void Allocate_Workspace( reax_system*, control_params*, storage*,
+        int, int, char* );
 
 void Allocate_Grid( reax_system*, MPI_Comm );
+
 void Deallocate_Grid( grid* );
 
-int  Allocate_MPI_Buffers( mpi_datatypes*, int, neighbor_proc*, char* );
+void Allocate_MPI_Buffers( mpi_datatypes*, int, neighbor_proc*, char* );
 
-//int Allocate_Matrix( sparse_matrix**, int, int );
-int Allocate_Matrix( sparse_matrix*, int, int );
+void Allocate_Matrix( sparse_matrix*, int, int );
 
 int Allocate_HBond_List( int, int, int*, int*, reax_list* );
 
 int Allocate_Bond_List( int, int*, reax_list* );
 
+void Deallocate_MPI_Buffers( mpi_datatypes * );
+
 void ReAllocate( reax_system*, control_params*, simulation_data*, storage*,
-                 reax_list**, mpi_datatypes* );
+        reax_list**, mpi_datatypes* );
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
 
-void Cuda_ReAllocate( reax_system*, control_params*, simulation_data*, storage*,
-                      reax_list**, mpi_datatypes* );
 #endif
diff --git a/PG-PuReMD/src/analyze.c b/PG-PuReMD/src/analyze.c
index 283d7e470f3b0cce6b72d507f6855c9200507212..0f47ba4836065ac4eba00d214a3d9410ff862e11 100644
--- a/PG-PuReMD/src/analyze.c
+++ b/PG-PuReMD/src/analyze.c
@@ -19,7 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "analyze.h"
+
 #include "box.h"
 #include "list.h"
 #include "vector.h"
diff --git a/PG-PuReMD/src/analyze.h b/PG-PuReMD/src/analyze.h
index e470334136d710a20843f3920f469591ca7a1892..a772dcb2fb0152359ba9778a2592cba191ed07e8 100644
--- a/PG-PuReMD/src/analyze.h
+++ b/PG-PuReMD/src/analyze.h
@@ -24,7 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" { /* give the declarations below C linkage when included from C++ */
+#endif
+
 void Analysis( reax_system*, control_params*, simulation_data*, storage*,
-               reax_list**, output_controls*, mpi_datatypes* );
+        reax_list**, output_controls*, mpi_datatypes* );
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/basic_comm.c b/PG-PuReMD/src/basic_comm.c
index adb216fafcf2165bb0bce0a203fe62c1ec66f472..cfb43ca7908685de8dc5a4a11dfdaa8b8ef82919 100644
--- a/PG-PuReMD/src/basic_comm.c
+++ b/PG-PuReMD/src/basic_comm.c
@@ -20,12 +20,13 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
+
 #if defined(PURE_REAX)
-#include "basic_comm.h"
-#include "vector.h"
+  #include "basic_comm.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_basic_comm.h"
-#include "reax_vector.h"
+  #include "reax_basic_comm.h"
+  #include "reax_vector.h"
 #endif
 
 
@@ -50,7 +51,7 @@ void rvec_packer( void *dummy, mpi_out_data *out_buf )
 
     for ( i = 0; i < out_buf->cnt; ++i )
     {
-        memcpy( out[i], buf[ out_buf->index[i] ], sizeof(rvec) );
+        memcpy( out + i, buf + out_buf->index[i], sizeof(rvec) );
     }
 }
 
@@ -63,7 +64,7 @@ void rvec2_packer( void *dummy, mpi_out_data *out_buf )
 
     for ( i = 0; i < out_buf->cnt; ++i )
     {
-        memcpy( out[i], buf[ out_buf->index[i] ], sizeof(rvec2) );
+        memcpy( out + i, buf + out_buf->index[i], sizeof(rvec2) );
     }
 }
 
@@ -81,6 +82,7 @@ void Dist( reax_system* system, mpi_datatypes *mpi_data, void *buf,
 #if defined(DEBUG)
     fprintf( stderr, "p%d dist: entered\n", system->my_rank );
 #endif
+
     comm = mpi_data->comm_mesh3D;
     out_bufs = mpi_data->out_buffers;
 
@@ -154,6 +156,7 @@ void rvec_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
     for ( i = 0; i < out_buf->cnt; ++i )
     {
         rvec_Add( buf[ out_buf->index[i] ], in[i] );
+
 #if defined(DEBUG)
         fprintf( stderr, "rvec_unpacker: cnt=%d  i =%d  index[i]=%d\n",
                 out_buf->cnt, i, out_buf->index[i] );
@@ -320,13 +323,13 @@ void Coll_ids_at_Master( reax_system *system, storage *workspace, mpi_datatypes
     if ( system->my_rank == MASTER_NODE )
     {
         workspace->displs[0] = 0;
-        for ( i = 1; i < system->wsize; ++i )
+        for ( i = 1; i < system->nprocs; ++i )
         {
             workspace->displs[i] = workspace->displs[i - 1] + workspace->rcounts[i - 1];
         }
     }
 
-    id_list = (int*) malloc( system->n * sizeof(int) );
+    id_list = (int*) smalloc( system->n * sizeof(int), "Coll_ids_at_Master::id_list" );
     for ( i = 0; i < system->n; ++i )
     {
         id_list[i] = system->my_atoms[i].orig_id;
@@ -336,7 +339,7 @@ void Coll_ids_at_Master( reax_system *system, storage *workspace, mpi_datatypes
             workspace->rcounts, workspace->displs, MPI_INT, MASTER_NODE,
             MPI_COMM_WORLD );
 
-    free( id_list );
+    sfree( id_list, "Coll_ids_at_Master::id_list" );
 
 #if defined(DEBUG)
     if ( system->my_rank == MASTER_NODE )
diff --git a/PG-PuReMD/src/basic_comm.h b/PG-PuReMD/src/basic_comm.h
index e1effc50db3ed53d33e7da83013cccc2cdbcf8bc..4d8f1c34deb5c29988e87320747f54252d409425 100644
--- a/PG-PuReMD/src/basic_comm.h
+++ b/PG-PuReMD/src/basic_comm.h
@@ -24,33 +24,43 @@
 
 #include "reax_types.h"
 
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 void real_packer( void*, mpi_out_data* );
+
 void rvec_packer( void*, mpi_out_data* );
+
 void rvec2_packer( void*, mpi_out_data* );
+
 void Dist(reax_system*, mpi_datatypes*, void*, MPI_Datatype, int, dist_packer);
 
 void real_unpacker( void*, void*, mpi_out_data* );
+
 void rvec_unpacker( void*, void*, mpi_out_data* );
+
 void rvec2_unpacker( void*, void*, mpi_out_data* );
+
 void Coll( reax_system*, mpi_datatypes*, void*, MPI_Datatype,
-           int, coll_unpacker );
+        int, coll_unpacker );
 
 real Parallel_Norm( real*, int, MPI_Comm );
+
 real Parallel_Dot( real*, real*, int, MPI_Comm );
+
 real Parallel_Vector_Acc( real*, int, MPI_Comm );
 
+#if defined(TEST_FORCES)
+void Coll_ids_at_Master( reax_system*, storage*, mpi_datatypes* );
+
+void Coll_rvecs_at_Master( reax_system*, storage*, mpi_datatypes*, rvec* );
+#endif /* TEST_FORCES */
 
 #ifdef __cplusplus
 }
 #endif
 
-#if defined(TEST_FORCES)
-void Coll_ids_at_Master( reax_system*, storage*, mpi_datatypes* );
-void Coll_rvecs_at_Master( reax_system*, storage*, mpi_datatypes*, rvec* );
-#endif
 
 #endif
diff --git a/PG-PuReMD/src/bond_orders.c b/PG-PuReMD/src/bond_orders.c
index 6906e8e19980e254603e785fa0732b1a57f165fe..da23e0025e58da12db09944e9004f1358cb2d1ed 100644
--- a/PG-PuReMD/src/bond_orders.c
+++ b/PG-PuReMD/src/bond_orders.c
@@ -20,21 +20,24 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
-#include "bond_orders.h"
-#include "list.h"
-#include "vector.h"
-#include "io_tools.h"
+  #include "bond_orders.h"
+  #include "list.h"
+  #include "vector.h"
+  #include "io_tools.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_bond_orders.h"
-#include "reax_list.h"
-#include "reax_vector.h"
+  #include "reax_bond_orders.h"
+  #include "reax_list.h"
+  #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
+
 #ifdef TEST_FORCES
 void Get_dBO( reax_system *system, reax_list **lists,
-              int i, int pj, real C, rvec *v )
+        int i, int pj, real C, rvec *v )
 {
     reax_list *bonds = (*lists) + BONDS;
     reax_list *dBOs = (*lists) + DBOS;
@@ -45,13 +48,15 @@ void Get_dBO( reax_system *system, reax_list **lists,
     end_pj = End_Index(pj, dBOs);
 
     for ( k = start_pj; k < end_pj; ++k )
+    {
         rvec_Scale( v[dBOs->select.dbo_list[k].wrt],
-                    C, dBOs->select.dbo_list[k].dBO );
+                C, dBOs->select.dbo_list[k].dBO );
+    }
 }
 
 
 void Get_dBOpinpi2( reax_system *system, reax_list **lists,
-                    int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 )
+        int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 )
 {
     reax_list *bonds = (*lists) + BONDS;
     reax_list *dBOs = (*lists) + DBOS;
@@ -72,7 +77,7 @@ void Get_dBOpinpi2( reax_system *system, reax_list **lists,
 
 
 void Add_dBO( reax_system *system, reax_list **lists,
-              int i, int pj, real C, rvec *v )
+        int i, int pj, real C, rvec *v )
 {
     reax_list *bonds = (*lists) + BONDS;
     reax_list *dBOs = (*lists) + DBOS;
@@ -84,14 +89,16 @@ void Add_dBO( reax_system *system, reax_list **lists,
     //fprintf( stderr, "i=%d j=%d start=%d end=%d\n", i, pj, start_pj, end_pj );
 
     for ( k = start_pj; k < end_pj; ++k )
+    {
         rvec_ScaledAdd( v[dBOs->select.dbo_list[k].wrt],
-                        C, dBOs->select.dbo_list[k].dBO );
+                C, dBOs->select.dbo_list[k].dBO );
+    }
 
 }
 
 
 void Add_dBOpinpi2( reax_system *system, reax_list **lists,
-                    int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 )
+        int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 )
 {
     reax_list *bonds = (*lists) + BONDS;
     reax_list *dBOs = (*lists) + DBOS;
@@ -112,7 +119,7 @@ void Add_dBOpinpi2( reax_system *system, reax_list **lists,
 
 
 void Add_dBO_to_Forces( reax_system *system, reax_list **lists,
-                        int i, int pj, real C )
+        int i, int pj, real C )
 {
     reax_list *bonds = (*lists) + BONDS;
     reax_list *dBOs = (*lists) + DBOS;
@@ -123,13 +130,15 @@ void Add_dBO_to_Forces( reax_system *system, reax_list **lists,
     end_pj = End_Index(pj, dBOs);
 
     for ( k = start_pj; k < end_pj; ++k )
+    {
         rvec_ScaledAdd( system->my_atoms[dBOs->select.dbo_list[k].wrt].f,
-                        C, dBOs->select.dbo_list[k].dBO );
+                C, dBOs->select.dbo_list[k].dBO );
+    }
 }
 
 
 void Add_dBOpinpi2_to_Forces( reax_system *system, reax_list **lists,
-                              int i, int pj, real Cpi, real Cpi2 )
+        int i, int pj, real Cpi, real Cpi2 )
 {
     reax_list *bonds = (*lists) + BONDS;
     reax_list *dBOs = (*lists) + DBOS;
@@ -157,8 +166,10 @@ void Add_dDelta( reax_system *system, reax_list **lists, int i, real C, rvec *v
     int k;
 
     for ( k = start; k < end; ++k )
+    {
         rvec_ScaledAdd( v[dDeltas->select.dDelta_list[k].wrt],
-                        C, dDeltas->select.dDelta_list[k].dVal );
+                C, dDeltas->select.dDelta_list[k].dVal );
+    }
 }
 
 
@@ -171,14 +182,15 @@ void Add_dDelta_to_Forces( reax_system *system, reax_list **lists,
     int k;
 
     for ( k = start; k < end; ++k )
+    {
         rvec_ScaledAdd( system->my_atoms[dDeltas->select.dDelta_list[k].wrt].f,
-                        C, dDeltas->select.dDelta_list[k].dVal );
+                C, dDeltas->select.dDelta_list[k].dVal );
+    }
 }
 
 
 
-void Calculate_dBO( int i, int pj,
-                    storage *workspace, reax_list **lists, int *top )
+void Calculate_dBO( int i, int pj, storage *workspace, reax_list **lists, int *top )
 {
     /* Initializations */
     reax_list *bonds, *dBOs;
@@ -392,9 +404,8 @@ void Calculate_dBO( int i, int pj,
 #endif
 
 
-
 void Add_dBond_to_Forces_NPT( int i, int pj, simulation_data *data,
-                              storage *workspace, reax_list **lists )
+        storage *workspace, reax_list **lists )
 {
     reax_list *bonds = (*lists) + BONDS;
     bond_data *nbr_j, *nbr_k;
@@ -478,7 +489,6 @@ void Add_dBond_to_Forces_NPT( int i, int pj, simulation_data *data,
     rvec_Add( workspace->f[i], temp );
     /* ext pressure due to i is dropped, counting force on j will be enough */
 
-
     /******************************************************
      * forces and pressure related to atom j               *
      * first neighbors of atom j                           *
@@ -548,9 +558,7 @@ void Add_dBond_to_Forces_NPT( int i, int pj, simulation_data *data,
 }
 
 
-
-void Add_dBond_to_Forces( int i, int pj,
-                          storage *workspace, reax_list **lists )
+void Add_dBond_to_Forces( int i, int pj, storage *workspace, reax_list **lists )
 {
     reax_list *bonds = (*lists) + BONDS;
     bond_data *nbr_j, *nbr_k;
@@ -683,21 +691,30 @@ int BOp( storage *workspace, reax_list *bonds, real bo_cut,
         C12 = twbp->p_bo1 * POW( nbr_pj->d / twbp->r_s, twbp->p_bo2 );
         BO_s = (1.0 + bo_cut) * EXP( C12 );
     }
-    else BO_s = C12 = 0.0;
+    else
+    {
+        BO_s = C12 = 0.0;
+    }
 
     if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
     {
         C34 = twbp->p_bo3 * POW( nbr_pj->d / twbp->r_p, twbp->p_bo4 );
         BO_pi = EXP( C34 );
     }
-    else BO_pi = C34 = 0.0;
+    else
+    {
+        BO_pi = C34 = 0.0;
+    }
 
     if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
     {
         C56 = twbp->p_bo5 * POW( nbr_pj->d / twbp->r_pp, twbp->p_bo6 );
         BO_pi2 = EXP( C56 );
     }
-    else BO_pi2 = C56 = 0.0;
+    else
+    {
+        BO_pi2 = C56 = 0.0;
+    }
 
     /* Initially BO values are the uncorrected ones, page 1 */
     BO = BO_s + BO_pi + BO_pi2;
@@ -794,10 +811,10 @@ int BOp( storage *workspace, reax_list *bonds, real bo_cut,
           bo_ij->dln_BOp_pi2[0], bo_ij->dln_BOp_pi2[1],
           bo_ij->dln_BOp_pi2[2] );*/
 
-        return 1;
+        return TRUE;
     }
 
-    return 0;
+    return FALSE;
 }
 
 
@@ -808,7 +825,7 @@ int compare_bonds( const void *p1, const void *p2 )
 
 
 void BO( reax_system *system, control_params *control, simulation_data *data,
-         storage *workspace, reax_list **lists, output_controls *out_control )
+        storage *workspace, reax_list **lists, output_controls *out_control )
 {
     int i, j, pj, type_i, type_j;
     int start_i, end_i, sym_index, num_bonds;
@@ -871,6 +888,7 @@ void BO( reax_system *system, control_params *control, simulation_data *data,
         Deltap_boc_i = workspace->Deltap_boc[i];
         start_i = Start_Index(i, bonds);
         end_i = End_Index(i, bonds);
+
         // fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n",
         //       i+1, Deltap_i, Deltap_boc_i, start_i, end_i );
 
@@ -881,10 +899,6 @@ void BO( reax_system *system, control_params *control, simulation_data *data,
             bo_ij = &( bonds->select.bond_list[pj].bo_data );
             // fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
 
-            //TODO
-            //TODO
-            //TODO
-            //TODO
             //TODO
             //if( i < j || workspace->bond_mark[j] > 3 ) {
             if ( i < j )
@@ -954,7 +968,7 @@ void BO( reax_system *system, control_params *control, simulation_data *data,
                         exp_p2j = EXP( -p_boc2 * Deltap_j );
 
                         f2 = exp_p1i + exp_p1j;
-                        f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i  + exp_p2j ) );
+                        f3 = -1.0 / p_boc2 * LOG( 0.5 * ( exp_p2i  + exp_p2j ) );
                         f1 = 0.5 * ( ( val_i + f2 ) / ( val_i + f2 + f3 ) +
                                      ( val_j + f2 ) / ( val_j + f2 + f3 ) );
 
@@ -1068,17 +1082,24 @@ void BO( reax_system *system, control_params *control, simulation_data *data,
 
                 /* neglect bonds that are < 1e-10 */
                 if ( bo_ij->BO < 1e-10 )
+                {
                     bo_ij->BO = 0.0;
+                }
                 if ( bo_ij->BO_s < 1e-10 )
+                {
                     bo_ij->BO_s = 0.0;
+                }
                 if ( bo_ij->BO_pi < 1e-10 )
+                {
                     bo_ij->BO_pi = 0.0;
+                }
                 if ( bo_ij->BO_pi2 < 1e-10 )
+                {
                     bo_ij->BO_pi2 = 0.0;
+                }
 
                 workspace->total_bond_order[i] += bo_ij->BO; //now keeps total_BO
 
-
                 /* fprintf( stderr, "%d %d\t%g %g %g %g\n"
                    "Cdbo:\t%g %g %g\n"
                    "Cdbopi:\t%g %g %g %g\n"
@@ -1179,10 +1200,10 @@ void BO( reax_system *system, control_params *control, simulation_data *data,
         workspace->Delta[j] = workspace->total_bond_order[j] - sbp_j->valency;
         workspace->Delta_e[j] = workspace->total_bond_order[j] - sbp_j->valency_e;
         workspace->Delta_boc[j] = workspace->total_bond_order[j] -
-                                  sbp_j->valency_boc;
+                sbp_j->valency_boc;
 
         workspace->vlpex[j] = workspace->Delta_e[j] -
-                              2.0 * (int)(workspace->Delta_e[j] / 2.0);
+                2.0 * (int)(workspace->Delta_e[j] / 2.0);
         explp1 = EXP(-p_lp1 * SQR(2.0 + workspace->vlpex[j]));
         workspace->nlp[j] = explp1 - (int)(workspace->Delta_e[j] / 2.0);
         workspace->Delta_lp[j] = sbp_j->nlp_opt - workspace->nlp[j];
@@ -1190,7 +1211,7 @@ void BO( reax_system *system, control_params *control, simulation_data *data,
         /* Adri uses different dDelta_lp values than the ones in notes... */
         workspace->dDelta_lp[j] = workspace->Clp[j];
         //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
-        //((fabs(workspace->Delta_e[j]/2.0 -
+        //((FABS(workspace->Delta_e[j]/2.0 -
         //       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
 
         if ( sbp_j->mass > 21.0 )
@@ -1250,7 +1271,7 @@ int Locate_Symmetric_Bond( reax_list *bonds, int i, int j )
 }
 
 
-inline void Copy_Bond_Order_Data( bond_order_data *dest, bond_order_data *src )
+static inline void Copy_Bond_Order_Data( bond_order_data *dest, bond_order_data *src )
 {
     dest->BO = src->BO;
     dest->BO_s = src->BO_s;
diff --git a/PG-PuReMD/src/bond_orders.h b/PG-PuReMD/src/bond_orders.h
index 1975e20b6320a003b08527fae665dbd0bbc3c2e4..8cfa2e18715abd0997a1cabf82e044a8a3213bbb 100644
--- a/PG-PuReMD/src/bond_orders.h
+++ b/PG-PuReMD/src/bond_orders.h
@@ -24,6 +24,7 @@
 
 #include "reax_types.h"
 
+
 typedef struct
 {
     real C1dbo, C2dbo, C3dbo;
@@ -32,28 +33,45 @@ typedef struct
     real C1dDelta, C2dDelta, C3dDelta;
 } dbond_coefficients;
 
+
+#ifdef __cplusplus
+extern "C" { /* give the declarations below C linkage when included from C++ */
+#endif
+
 #ifdef TEST_FORCES
 void Get_dBO( reax_system*, reax_list**, int, int, real, rvec* );
+
 void Get_dBOpinpi2( reax_system*, reax_list**,
-                    int, int, real, real, rvec*, rvec* );
+        int, int, real, real, rvec*, rvec* );
 
 void Add_dBO( reax_system*, reax_list**, int, int, real, rvec* );
+
 void Add_dBOpinpi2( reax_system*, reax_list**,
-                    int, int, real, real, rvec*, rvec* );
+        int, int, real, real, rvec*, rvec* );
 
 void Add_dBO_to_Forces( reax_system*, reax_list**, int, int, real );
+
 void Add_dBOpinpi2_to_Forces( reax_system*, reax_list**,
-                              int, int, real, real );
+        int, int, real, real );
 
 void Add_dDelta( reax_system*, reax_list**, int, real, rvec* );
+
 void Add_dDelta_to_Forces( reax_system *, reax_list**, int, real );
 #endif
 
 void Add_dBond_to_Forces( int, int, storage*, reax_list** );
-void Add_dBond_to_Forces_NPT( int, int, simulation_data*,
-                              storage*, reax_list** );
-int BOp(storage*, reax_list*, real, int, int, far_neighbor_data*,
-        single_body_parameters*, single_body_parameters*, two_body_parameters*);
+
+void Add_dBond_to_Forces_NPT( int, int, simulation_data*, storage*, reax_list** );
+
+int BOp( storage*, reax_list*, real, int, int, far_neighbor_data*,
+        single_body_parameters*, single_body_parameters*, two_body_parameters* );
+
 void BO( reax_system*, control_params*, simulation_data*,
-         storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/bonds.c b/PG-PuReMD/src/bonds.c
index 9c2839eb63e2d722d531914bf6d02136f505d29e..8fb160ecbb0862ba5c7571b89d8106ec5d811678 100644
--- a/PG-PuReMD/src/bonds.c
+++ b/PG-PuReMD/src/bonds.c
@@ -20,25 +20,27 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
-#include "bonds.h"
-#include "bond_orders.h"
-#include "list.h"
-#include "tool_box.h"
-#include "vector.h"
+  #include "bonds.h"
+  #include "bond_orders.h"
+  #include "list.h"
+  #include "tool_box.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_bonds.h"
-#include "reax_bond_orders.h"
-#include "reax_list.h"
-#include "reax_tool_box.h"
-#include "reax_vector.h"
+  #include "reax_bonds.h"
+  #include "reax_bond_orders.h"
+  #include "reax_list.h"
+  #include "reax_tool_box.h"
+  #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 void Bonds( reax_system *system, control_params *control,
-            simulation_data *data, storage *workspace, reax_list **lists,
-            output_controls *out_control )
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control )
 {
     int i, j, pj, natoms;
     int start_i, end_i;
diff --git a/PG-PuReMD/src/bonds.h b/PG-PuReMD/src/bonds.h
index 2aa3c1f93731d35eab6210471c463053c809767f..89090386a8f2044591f2324de56fbb2d4c23c051 100644
--- a/PG-PuReMD/src/bonds.h
+++ b/PG-PuReMD/src/bonds.h
@@ -24,6 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" { /* give the declarations below C linkage when included from C++ */
+#endif
+
 void Bonds( reax_system*, control_params*, simulation_data*,
-            storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/box.c b/PG-PuReMD/src/box.c
index 7d1782c93fee26b9731ebca60b6ce9d781fad2a7..525f24e5cc43f1b50f7bcdaee912b634e2ecfc71 100644
--- a/PG-PuReMD/src/box.c
+++ b/PG-PuReMD/src/box.c
@@ -19,14 +19,17 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "box.h"
+
 #include "comm_tools.h"
 #include "io_tools.h"
 #include "system_props.h"
 #include "vector.h"
 
 
-void Make_Consistent(simulation_box* box)
+void Make_Consistent( simulation_box* box )
 {
     real one_vol;
 
@@ -148,10 +151,10 @@ void Setup_Big_Box( real a, real b, real c, real alpha, real beta, real gamma,
         exit( INVALID_INPUT );
     }
 
-    c_alpha = cos(DEG2RAD(alpha));
-    c_beta  = cos(DEG2RAD(beta));
-    c_gamma = cos(DEG2RAD(gamma));
-    s_gamma = sin(DEG2RAD(gamma));
+    c_alpha = COS(DEG2RAD(alpha));
+    c_beta  = COS(DEG2RAD(beta));
+    c_gamma = COS(DEG2RAD(gamma));
+    s_gamma = SIN(DEG2RAD(gamma));
     zi = (c_alpha - c_beta * c_gamma) / s_gamma;
 
     rvec_MakeZero( box->min );
@@ -165,9 +168,10 @@ void Setup_Big_Box( real a, real b, real c, real alpha, real beta, real gamma,
     box->box[2][0] = c * c_beta;
     box->box[2][1] = c * zi;
     box->box[2][2] = c * SQRT(1.0 - SQR(c_beta) - SQR(zi));
+
 #if defined(DEBUG)
     fprintf( stderr, "box is %8.2f x %8.2f x %8.2f\n",
-             box->box[0][0], box->box[1][1], box->box[2][2] );
+            box->box[0][0], box->box[1][1], box->box[2][2] );
 #endif
 
     Make_Consistent( box );
@@ -211,30 +215,32 @@ void Setup_My_Box( reax_system *system, control_params *control )
 }
 
 
-
 /* setup my extended box -- my box together with the ghost regions */
 void Setup_My_Ext_Box( reax_system *system, control_params *control )
 {
-    int             d;
-    ivec            native_gcells, ghost_gcells;
-    rvec            gcell_len;
+    int d;
+    ivec native_gcells, ghost_gcells;
+    rvec gcell_len;
     simulation_box *big_box, *my_box, *my_ext_box;
     boundary_cutoff *bc;
 
-    big_box    = &(system->big_box);
-    my_box     = &(system->my_box);
+    big_box = &(system->big_box);
+    my_box = &(system->my_box);
     my_ext_box = &(system->my_ext_box);
-    bc         = &(system->bndry_cuts);
+    bc = &(system->bndry_cuts);
     rtensor_MakeZero( my_ext_box->box );
 
     for ( d = 0; d < 3; ++d )
     {
         /* estimate the number of native cells */
         native_gcells[d] = (int)(my_box->box_norms[d] / (control->vlist_cut / 2));
-        if ( native_gcells[d] == 0 ) native_gcells[d] = 1;
+        if ( native_gcells[d] == 0 )
+        {
+            native_gcells[d] = 1;
+        }
 
         gcell_len[d] = my_box->box_norms[d] / native_gcells[d];
-        ghost_gcells[d] = (int) ceil(bc->ghost_cutoff / gcell_len[d]);
+        ghost_gcells[d] = (int) CEIL(bc->ghost_cutoff / gcell_len[d]);
 
         /* extend my box with the ghost regions */
         my_ext_box->min[d] = my_box->min[d] - ghost_gcells[d] * gcell_len[d];
@@ -264,31 +270,34 @@ void Setup_Boundary_Cutoffs( reax_system *system, control_params *control )
     fprintf( stderr, "ghost_hbond: %8.3f\n", bc->ghost_hbond );
     fprintf( stderr, "ghost_bond: %8.3f\n", bc->ghost_bond );
     fprintf( stderr, "ghost_cutoff: %8.3f\n", bc->ghost_cutoff );
-#endif //DEBUG
+#endif
 }
 
 
 void Setup_Environment( reax_system *system, control_params *control,
-                        mpi_datatypes *mpi_data )
+        mpi_datatypes *mpi_data )
 {
     ivec periodic = {1, 1, 1};
+#if defined(DEBUG_FOCUS)
     char temp[100] = "";
+#endif
 
     /* initialize communicator - 3D mesh with wrap-arounds = 3D torus */
     MPI_Cart_create( MPI_COMM_WORLD, 3, control->procs_by_dim, periodic, 1,
-                     &(mpi_data->comm_mesh3D) );
-    MPI_Comm_rank  ( mpi_data->comm_mesh3D, &(system->my_rank) );
+            &(mpi_data->comm_mesh3D) );
+    MPI_Comm_rank( mpi_data->comm_mesh3D, &(system->my_rank) );
     MPI_Cart_coords( mpi_data->comm_mesh3D, system->my_rank, 3,
-                     system->my_coords );
+            system->my_coords );
 
     Setup_Boundary_Cutoffs( system, control );
     Setup_My_Box( system, control );
     Setup_My_Ext_Box( system, control );
     Setup_Comm( system, control, mpi_data );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d coord: %d %d %d\n",
-             system->my_rank,
-             system->my_coords[0], system->my_coords[1], system->my_coords[2] );
+            system->my_rank, system->my_coords[0],
+            system->my_coords[1], system->my_coords[2] );
     sprintf( temp, "p%d big_box", system->my_rank );
     Print_Box( &(system->big_box), temp, stderr );
     sprintf( temp, "p%d my_box", system->my_rank );
@@ -296,20 +305,19 @@ void Setup_Environment( reax_system *system, control_params *control,
     sprintf( temp, "p%d ext_box", system->my_rank );
     Print_Box( &(system->my_ext_box), temp, stderr );
     MPI_Barrier( MPI_COMM_WORLD );
-#endif
 
-#if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d: parallel environment initialized\n", system->my_rank);
+    fprintf( stderr, "p%d: parallel environment initialized\n",
+            system->my_rank );
 #endif
 }
 
 
 void Scale_Box( reax_system *system, control_params *control,
-                simulation_data *data, mpi_datatypes *mpi_data )
+        simulation_data *data, mpi_datatypes *mpi_data )
 {
     int i, d;
     real dt, lambda;
-    rvec mu;
+    rvec mu = {0.0, 0.0, 0.0};
     reax_atom *atom;
 
     dt = control->dt;
@@ -321,11 +329,16 @@ void Scale_Box( reax_system *system, control_params *control,
                      1. / 3 );
 
         if ( mu[0] < MIN_dV )
+        {
             mu[0] = MIN_dV;
+        }
         else if ( mu[0] > MAX_dV )
+        {
             mu[0] = MAX_dV;
+        }
 
-        mu[2] = mu[1] = mu[0];
+        mu[1] = mu[0];
+        mu[2] = mu[1];
     }
     else if ( control->ensemble == sNPT )
     {
@@ -335,21 +348,28 @@ void Scale_Box( reax_system *system, control_params *control,
                         1. / 3 );
 
             if ( mu[d] < MIN_dV )
+            {
                 mu[d] = MIN_dV;
+            }
             else if ( mu[d] > MAX_dV )
+            {
                 mu[d] = MAX_dV;
+            }
         }
     }
 
     /* temperature scaler */
     lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
     if ( lambda < MIN_dT )
+    {
         lambda = MIN_dT;
+    }
     else if (lambda > MAX_dT )
+    {
         lambda = MAX_dT;
+    }
     lambda = SQRT( lambda );
 
-
     /* Scale velocities and positions at t+dt */
     for ( i = 0; i < system->n; ++i )
     {
@@ -360,7 +380,10 @@ void Scale_Box( reax_system *system, control_params *control,
         atom->x[2] = mu[2] * atom->x[2];
     }
     Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-    // fprintf( stderr, "damping - " );
+
+#if defined(DEBUG)
+    fprintf( stderr, "damping - " );
+#endif
 
     /* update box & grid */
     system->big_box.box[0][0] *= mu[0];
@@ -387,7 +410,9 @@ real Metric_Product( rvec x1, rvec x2, simulation_box* box )
     {
         tmp = 0.0;
         for ( j = 0; j < 3; j++ )
+        {
             tmp += box->g[i][j] * x2[j];
+        }
         dist += x1[i] * tmp;
     }
 
diff --git a/PG-PuReMD/src/box.h b/PG-PuReMD/src/box.h
index 841e367993662e1ad6f5eaa90c0f6d4659dd756d..fa92c72275d2acfad2258a09cc288e0203a8f5bf 100644
--- a/PG-PuReMD/src/box.h
+++ b/PG-PuReMD/src/box.h
@@ -24,30 +24,53 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Make_Consistent( simulation_box* );
+
 /* initializes simulation boxes */
 void Setup_Big_Box( real, real, real, real, real, real, simulation_box* );
+
 void Init_Box( rtensor, simulation_box* );
-//void Setup_My_Box( reax_system*, control_params* );
-//void Setup_My_Ext_Box( reax_system*, control_params* );
+
+void Setup_My_Box( reax_system*, control_params* );
+
+void Setup_My_Ext_Box( reax_system*, control_params* );
+
 void Setup_Environment( reax_system*, control_params*, mpi_datatypes* );
 
 /* scales simulation box for NPT ensembles */
 void Scale_Box( reax_system*, control_params*,
-                simulation_data*, mpi_datatypes* );
+        simulation_data*, mpi_datatypes* );
 
 /* applies transformation to/from Cartesian/ Triclinic coordinates */
 /* use -1 flag for Cartesian -> Triclinic and +1 for otherway */
-// void Transform( rvec, simulation_box*, char, rvec );
-// void Distance_on_T3_Gen( rvec, rvec, simulation_box*, rvec );
-// void Inc_on_T3_Gen( rvec, rvec, simulation_box* );
-// int Get_Nbr_Box( simulation_box*, int, int, int );
-// rvec Get_Nbr_Box_Press( simulation_box*, int, int, int );
-// void Inc_Nbr_Box_Press( simulation_box*, int, int, int, rvec );
+//void Transform( rvec, simulation_box*, char, rvec );
+
+//void Distance_on_T3_Gen( rvec, rvec, simulation_box*, rvec );
+
+//void Inc_on_T3_Gen( rvec, rvec, simulation_box* );
+
+//int Get_Nbr_Box( simulation_box*, int, int, int );
+
+//rvec Get_Nbr_Box_Press( simulation_box*, int, int, int );
+
+//void Inc_Nbr_Box_Press( simulation_box*, int, int, int, rvec );
 
 /* these functions assume that the coordinates are in triclinic system
    this function returns cartesian norm but triclinic distance vector */
 //real Sq_Distance_on_T3( rvec, rvec, simulation_box*, rvec );
+
 //void Inc_on_T3( rvec, rvec, simulation_box* );
+
 //real Metric_Product( rvec, rvec, simulation_box* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/center_mass.cu b/PG-PuReMD/src/center_mass.cu
deleted file mode 100644
index 725cafbb7c79e6fc8cb7dcf275c478114a2a09f1..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/center_mass.cu
+++ /dev/null
@@ -1,551 +0,0 @@
-#include "center_mass.h"
-#include "vector.h"
-#include "cuda_shuffle.h"
-
-CUDA_GLOBAL void center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_xcm, 
-        rvec *res_vcm, 
-        rvec *res_amcm, 
-        size_t n)
-{
-    extern __shared__ rvec xcm[];
-    extern __shared__ rvec vcm[];
-    extern __shared__ rvec amcm[];
-
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    //unsigned int xcm_id = threadIdx.x;
-    unsigned int vcm_id = blockDim.x;
-    unsigned int amcm_id = 2 *(blockDim.x);
-
-    unsigned int index = 0;
-    rvec tmp;
-    real m;
-
-    rvec_MakeZero (xcm [threadIdx.x]);
-    rvec_MakeZero (vcm [vcm_id + threadIdx.x]);
-    rvec_MakeZero (amcm[amcm_id + threadIdx.x]);
-    rvec_MakeZero (tmp);
-
-    if (i < n){
-        m = sbp [ atoms[i].type ].mass;
-        rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x);
-        rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v);
-        rvec_Cross (tmp, atoms[i].x, atoms [i].v);
-        rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp);
-    }
-    __syncthreads ();
-
-    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
-
-        if ((threadIdx.x < offset)) {
-            index = threadIdx.x + offset;
-            rvec_Add (xcm [threadIdx.x], xcm[index]);
-            rvec_Add (vcm [vcm_id  + threadIdx.x], vcm[vcm_id + index]);
-            rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]);
-        } 
-        __syncthreads ();
-    }
-
-    if ((threadIdx.x == 0)){
-        rvec_Copy (res_xcm[blockIdx.x], xcm[0]);
-        rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]);
-        rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]);
-    }
-}
-
-#if defined( __SM_35__)
-CUDA_GLOBAL void center_of_mass_blocks_xcm (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_xcm,
-        size_t n)
-{
-    extern __shared__ rvec my_xcm[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int xcm_id = threadIdx.x;
-    unsigned int index = 0;
-    rvec xcm;
-    real m;
-
-    rvec_MakeZero (xcm);
-
-    if (i < n){
-        m = sbp [ atoms[i].type ].mass;
-        rvec_ScaledAdd (xcm , m, atoms [i].x);
-    }
-    __syncthreads ();
-
-    for (int z = 16; z >= 1; z /= 2){
-        xcm[0] += shfl( xcm[0], z);
-        xcm[1] += shfl( xcm[1], z);
-        xcm[2] += shfl( xcm[2], z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0)
-        rvec_Copy( my_xcm[ threadIdx.x >> 5], xcm );
-    __syncthreads ();
-
-    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
-
-        if ((threadIdx.x < offset)) {
-            index = threadIdx.x + offset;
-            rvec_Add (my_xcm [threadIdx.x], my_xcm[index]);
-        }
-        __syncthreads ();
-    }
-
-    if ((threadIdx.x == 0))
-        rvec_Copy (res_xcm[blockIdx.x], my_xcm[0]);
-}
-
-CUDA_GLOBAL void center_of_mass_blocks_vcm (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_vcm,
-        size_t n)
-{
-    extern __shared__ rvec my_vcm[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    rvec vcm;
-    real m;
-
-    rvec_MakeZero (vcm);
-
-    if (i < n){
-        m = sbp [ atoms[i].type ].mass;
-        rvec_ScaledAdd (vcm , m, atoms [i].v);
-    }
-    __syncthreads ();
-
-    for (int z = 16; z >= 1; z /= 2){
-        vcm[0] += shfl( vcm[0], z);
-        vcm[1] += shfl( vcm[1], z);
-        vcm[2] += shfl( vcm[2], z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0)
-        rvec_Copy( my_vcm[ threadIdx.x >> 5], vcm );
-    __syncthreads ();
-
-    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
-
-        if ((threadIdx.x < offset)) {
-            index = threadIdx.x + offset;
-            rvec_Add (my_vcm [threadIdx.x], my_vcm[index]);
-        }
-        __syncthreads ();
-    }
-
-    if ((threadIdx.x == 0))
-        rvec_Copy (res_vcm[blockIdx.x], my_vcm[0]);
-}
-
-CUDA_GLOBAL void center_of_mass_blocks_amcm (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_amcm,
-        size_t n)
-{
-    extern __shared__ rvec my_amcm[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    rvec amcm;
-    real m;
-    rvec tmp;
-
-    rvec_MakeZero (amcm);
-    rvec_MakeZero( tmp );
-
-    if (i < n){
-        m = sbp [ atoms[i].type ].mass;
-        rvec_Cross (tmp, atoms[i].x, atoms [i].v);
-        rvec_ScaledAdd (amcm, m, tmp);
-    }
-    __syncthreads ();
-
-    for (int z = 16; z >= 1; z /= 2){
-        amcm[0] += shfl( amcm[0], z);
-        amcm[1] += shfl( amcm[1], z);
-        amcm[2] += shfl( amcm[2], z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0)
-        rvec_Copy( my_amcm[ threadIdx.x >> 5], amcm );
-    __syncthreads ();
-
-
-    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
-
-        if ((threadIdx.x < offset)) {
-            index = threadIdx.x + offset;
-            rvec_Add (my_amcm[threadIdx.x], my_amcm[index]);
-        }
-        __syncthreads ();
-    }
-
-    if ((threadIdx.x == 0)){
-        rvec_Copy (res_amcm[blockIdx.x], my_amcm[0]);
-    }
-}
-
-#endif
-
-
-CUDA_GLOBAL void center_of_mass (rvec *xcm, 
-        rvec *vcm, 
-        rvec *amcm, 
-        rvec *res_xcm,
-        rvec *res_vcm,
-        rvec *res_amcm,
-        size_t n)
-{
-    extern __shared__ rvec sh_xcm[];
-    extern __shared__ rvec sh_vcm[];
-    extern __shared__ rvec sh_amcm[];
-
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    unsigned int xcm_id = threadIdx.x;
-    unsigned int vcm_id = blockDim.x;
-    unsigned int amcm_id = 2 * (blockDim.x);
-
-    unsigned int index = 0;
-    rvec t_xcm, t_vcm, t_amcm;
-
-    rvec_MakeZero (t_xcm);
-    rvec_MakeZero (t_vcm);
-    rvec_MakeZero (t_amcm);
-
-    if (i < n){
-        rvec_Copy ( t_xcm, xcm[threadIdx.x]);
-        rvec_Copy ( t_vcm, vcm[threadIdx.x]);
-        rvec_Copy ( t_amcm, amcm[threadIdx.x]);
-    }
-
-    rvec_Copy (sh_xcm[xcm_id], t_xcm);
-    rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm);
-    rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm);
-
-    __syncthreads ();
-
-    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
-
-        if (threadIdx.x < offset) {
-            index = threadIdx.x + offset;
-            rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]);
-            rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]);
-            rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]);
-        } 
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0){
-        rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]);
-        rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]);
-        rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]);
-    }
-}
-
-CUDA_GLOBAL void compute_center_mass (single_body_parameters *sbp, 
-        reax_atom *atoms,
-        real *results, 
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
-{
-    extern __shared__ real xx[];
-    extern __shared__ real xy[];
-    extern __shared__ real xz[];
-    extern __shared__ real yy[];
-    extern __shared__ real yz[];
-    extern __shared__ real zz[];
-
-    unsigned int xx_i = threadIdx.x;
-    unsigned int xy_i = blockDim.x;
-    unsigned int xz_i = 2 * blockDim.x;
-    unsigned int yy_i = 3 * blockDim.x;
-    unsigned int yz_i = 4 * blockDim.x;
-    unsigned int zz_i = 5 * blockDim.x;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-
-    rvec diff, xcm;
-    real m = 0;
-    rvec_MakeZero (diff);
-    xcm[0] = xcm0;
-    xcm[1] = xcm1;
-    xcm[2] = xcm2;
-
-
-    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
-        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
-
-    if (i < n){
-        m = sbp[ atoms[i].type ].mass;
-        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-        xx[ xx_i ] = diff[0] * diff[0] * m;
-        xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m;
-        xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m;
-        yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m;
-        yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m;
-        zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m;    
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
-            index = threadIdx.x + offset;
-            xx[ threadIdx.x ] += xx[ index ];
-            xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ];
-            xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ];
-            yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ];
-            yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ];
-            zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0) {
-        results [ blockIdx.x*6 ] = xx [ 0 ];
-        results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ];
-        results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ];
-        results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ];
-        results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ];
-        results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ];
-    }
-}
-
-CUDA_GLOBAL void compute_center_mass (real *input, real *output, size_t n)
-{
-    extern __shared__ real xx[];
-    extern __shared__ real xy[];
-    extern __shared__ real xz[];
-    extern __shared__ real yy[];
-    extern __shared__ real yz[];
-    extern __shared__ real zz[];
-
-    unsigned int xx_i = threadIdx.x;
-    unsigned int xy_i = blockDim.x;
-    unsigned int xz_i = 2 * blockDim.x;
-    unsigned int yy_i = 3 * blockDim.x;
-    unsigned int yz_i = 4 * blockDim.x;
-    unsigned int zz_i = 5 * blockDim.x;
-
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-
-    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
-        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
-
-    if (i < n)
-    {
-        xx [ xx_i ] = input [ threadIdx.x*6 + 0 ];
-        xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ];
-        xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ];
-        yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ];
-        yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ];
-        zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ];
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-    {
-        if (threadIdx.x < offset )
-        {
-            index = threadIdx.x + offset;
-            xx [ threadIdx.x ] += xx [ index ];
-            xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ];
-            xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ];
-            yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ];
-            yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ];
-            zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0)
-    {
-        output[0] = xx[0];
-        output[1] = xy[xy_i];
-        output[2] = xz[xz_i];
-        output[3] = xz[yy_i];
-        output[4] = xz[yz_i];
-        output[5] = xz[zz_i];
-    }
-}
-
-#if defined( __SM_35__)
-
-CUDA_GLOBAL void compute_center_mass_xx_xy (single_body_parameters *sbp,
-        reax_atom *atoms,
-        real *results,
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
-{
-    extern __shared__ real my_results_xx[];
-    extern __shared__ real my_results_xy[];
-
-    unsigned int xx_i = threadIdx.x;
-    unsigned int xy_i = blockDim.x;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    real xx = 0;
-    real xy = 0;
-
-    rvec diff, xcm;
-    real m = 0;
-    rvec_MakeZero (diff);
-    xcm[0] = xcm0;
-    xcm[1] = xcm1;
-    xcm[2] = xcm2;
-
-
-    if (i < n){
-        m = sbp[ atoms[i].type ].mass;
-        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-        xx = diff[0] * diff[0] * m;
-        xy = diff[0] * diff[1] * m;
-    }
-    __syncthreads ();
-
-    for (int z = 16; z <= 1; z++){
-        xx += shfl( xx, z);
-        xy += shfl( xy, z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0){
-        my_results_xx[threadIdx.x >> 5] = xx;    
-        my_results_xy[threadIdx.x >> 5] = xy;    
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
-            index = threadIdx.x + offset;
-            my_results_xx[ threadIdx.x ] += my_results_xx[ index ];
-            my_results_xy[ xy_i + threadIdx.x ] += my_results_xy [ xy_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0) {
-        results [ blockIdx.x*6 ] = my_results_xx [ 0 ];
-        results [ blockIdx.x*6 + 1 ] = my_results_xy [ xy_i + 0 ];
-    }
-}
-
-CUDA_GLOBAL void compute_center_mass_xz_yy (single_body_parameters *sbp,
-        reax_atom *atoms,
-        real *results,
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
-{
-    extern __shared__ real my_results_xz[];
-    extern __shared__ real my_results_yy[];
-
-    unsigned int yy_i = blockDim.x;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    real xz = 0;
-    real yy = 0;
-
-    rvec diff, xcm;
-    real m = 0;
-    rvec_MakeZero (diff);
-    xcm[0] = xcm0;
-    xcm[1] = xcm1;
-    xcm[2] = xcm2;
-
-    if (i < n){
-        m = sbp[ atoms[i].type ].mass;
-        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-        xz = diff[0] * diff[2] * m;
-        yy = diff[1] * diff[1] * m;
-    }
-    __syncthreads ();
-
-    for (int z = 16; z <= 1; z++){
-        xz += shfl( xz, z);
-        yy += shfl( yy, z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0){
-        my_results_xz[threadIdx.x >> 5] = xz;    
-        my_results_yy[threadIdx.x >> 5] = yy;    
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
-            index = threadIdx.x + offset;
-            my_results_xz[ threadIdx.x ] += my_results_xz [ index ];
-            my_results_yy[ yy_i + threadIdx.x ] += my_results_yy [ yy_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0) {
-        results [ blockIdx.x*6 + 2 ] = my_results_xz [ 0 ];
-        results [ blockIdx.x*6 + 3 ] = my_results_yy [ yy_i + 0 ];
-    }
-}
-
-CUDA_GLOBAL void compute_center_mass_yz_zz (single_body_parameters *sbp,
-        reax_atom *atoms,
-        real *results,
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
-{
-    extern __shared__ real my_results_yz[];
-    extern __shared__ real my_results_zz[];
-
-    unsigned int zz_i = blockDim.x;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    real yz = 0;
-    real zz = 0;
-
-    rvec diff, xcm;
-    real m = 0;
-    rvec_MakeZero (diff);
-    xcm[0] = xcm0;
-    xcm[1] = xcm1;
-    xcm[2] = xcm2;
-
-
-    if (i < n){
-        m = sbp[ atoms[i].type ].mass;
-        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-        yz = diff[1] * diff[2] * m;
-        zz = diff[2] * diff[2] * m;
-    }
-    __syncthreads ();
-
-    for (int z = 16; z <= 1; z++){
-        yz += shfl( yz, z);
-        zz += shfl( zz, z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0){
-        my_results_yz[threadIdx.x >> 5] = yz;    
-        my_results_zz[threadIdx.x >> 5] = zz;    
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
-            index = threadIdx.x + offset;
-            my_results_yz[ threadIdx.x ] += my_results_yz [ index ];
-            my_results_zz[ zz_i + threadIdx.x ] += my_results_zz [ zz_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0) {
-        results [ blockIdx.x*6 + 4 ] = my_results_yz [ 0 ];
-        results [ blockIdx.x*6 + 5 ] = my_results_zz [ zz_i + 0 ];
-    }
-}
-
-#endif
diff --git a/PG-PuReMD/src/center_mass.h b/PG-PuReMD/src/center_mass.h
deleted file mode 100644
index 113971ad3f467b6077783b497e8cf170e63d5318..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/center_mass.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-#ifndef __CENTER_MASS_H__
-#define __CENTER_MASS_H__
-
-#include "reax_types.h"
-#include "reax_types.h"
-
-CUDA_GLOBAL void center_of_mass_blocks (single_body_parameters *, reax_atom *,
-                                        rvec *res_xcm,
-                                        rvec *res_vcm,
-                                        rvec *res_amcm,
-                                        size_t n);
-
-#if defined(__SM_35__)
-CUDA_GLOBAL void center_of_mass_blocks_xcm (single_body_parameters *, reax_atom *,
-        rvec *res_xcm,
-        size_t n);
-CUDA_GLOBAL void center_of_mass_blocks_vcm (single_body_parameters *, reax_atom *,
-        rvec *res_vcm,
-        size_t n);
-CUDA_GLOBAL void center_of_mass_blocks_amcm (single_body_parameters *, reax_atom *,
-        rvec *res_amcm,
-        size_t n);
-#endif
-
-
-CUDA_GLOBAL void center_of_mass (rvec *xcm,
-                                 rvec *vcm,
-                                 rvec *amcm,
-                                 rvec *res_xcm,
-                                 rvec *res_vcm,
-                                 rvec *res_amcm,
-                                 size_t n);
-
-CUDA_GLOBAL void compute_center_mass (single_body_parameters *sbp,
-                                      reax_atom *atoms,
-                                      real *results,
-                                      real xcm0, real xcm1, real xcm2,
-                                      size_t n);
-
-CUDA_GLOBAL void compute_center_mass (real *input, real *output, size_t n);
-
-#if defined(__SM_35__)
-CUDA_GLOBAL void compute_center_mass_xx_xy (single_body_parameters *, reax_atom *, real *, real , real , real , size_t );
-CUDA_GLOBAL void compute_center_mass_xz_yy (single_body_parameters *, reax_atom *, real *, real , real , real , size_t );
-CUDA_GLOBAL void compute_center_mass_yz_zz (single_body_parameters *, reax_atom *, real *, real , real , real , size_t );
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/qEq.c b/PG-PuReMD/src/charges.c
similarity index 66%
rename from PG-PuReMD/src/qEq.c
rename to PG-PuReMD/src/charges.c
index 1e87788751c27e04a0330c2a09977dcf028fb2ac..791804f6485ce0837a19a64dacc3ad395b812568 100644
--- a/PG-PuReMD/src/qEq.c
+++ b/PG-PuReMD/src/charges.c
@@ -19,20 +19,16 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "qEq.h"
+#include "reax_types.h"
+
+#include "charges.h"
+
 #include "allocate.h"
 #include "basic_comm.h"
 #include "io_tools.h"
-#include "linear_solvers.h"
+#include "lin_alg.h"
 #include "tool_box.h"
 
-#ifdef HAVE_CUDA
-#include "cuda_qEq.h"
-#include "cuda_linear_solvers.h"
-
-#include "validation.h"
-#endif
-
 
 int compare_matrix_entry(const void *v1, const void *v2)
 {
@@ -106,7 +102,7 @@ int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
         {
             j = A->entries[pj].j;
             val = A->entries[pj].val;
-            if ( fabs(val) > droptol[i] )
+            if ( FABS(val) > droptol[i] )
                 ++fillin;
         }
     }
@@ -123,7 +119,7 @@ void ICHOLT( sparse_matrix *A, real *droptol,
     real val, dval;
     int *Ltop;
 
-    Ltop = (int*) malloc((A->n) * sizeof(int));
+    Ltop = (int*) smalloc( A->n * sizeof(int), "ICHOLT::Ltop" );
 
     // clear data structures
     Utop = 0;
@@ -147,7 +143,7 @@ void ICHOLT( sparse_matrix *A, real *droptol,
             //fprintf( stdout, "%d %d %24.16f\n", 6540-i, 6540-j, val );
             //fprintf( stdout, "%d %d %24.16f\n", 6540-j, 6540-i, val );
 
-            if ( fabs(val) > droptol[i] )
+            if ( FABS(val) > droptol[i] )
             {
                 k1 = tmptop - 1;
                 k2 = U->start[j] + 1;
@@ -183,7 +179,7 @@ void ICHOLT( sparse_matrix *A, real *droptol,
         //fprintf( stdout, "%d %d %24.16f\n", 6540-i, 6540-i, dval );
         for ( k1 = 0; k1 < tmptop; ++k1 )
         {
-            //if( fabs(tmp[k1].val) > droptol[i] )
+            //if( FABS(tmp[k1].val) > droptol[i] )
             dval -= SQR(tmp[k1].val);
         }
         dval = SQRT(dval);
@@ -196,7 +192,7 @@ void ICHOLT( sparse_matrix *A, real *droptol,
         for ( k1 = tmptop - 1; k1 >= 0; --k1 )
         {
             // apply the dropping rule once again
-            if ( fabs(tmp[k1].val) > droptol[i] / dval )
+            if ( FABS(tmp[k1].val) > droptol[i] / dval )
             {
                 U->entries[Utop].j = tmp[k1].j;
                 U->entries[Utop].val = tmp[k1].val;
@@ -262,56 +258,58 @@ void ICHOLT( sparse_matrix *A, real *droptol,
 
 
 void Init_MatVec( reax_system *system, simulation_data *data,
-        control_params *control,  storage *workspace, mpi_datatypes *mpi_data )
+        control_params *control, storage *workspace, mpi_datatypes *mpi_data )
 {
     int i; //, fillin;
     reax_atom *atom;
 
-    /*if( (data->step - data->prev_steps) % control->refactor == 0 ||
-        workspace->L == NULL ) {
-      //Print_Linear_System( system, control, workspace, data->step );
-      Sort_Matrix_Rows( workspace->H );
-      fprintf( stderr, "H matrix sorted\n" );
-      Calculate_Droptol( workspace->H, workspace->droptol, control->droptol );
-      fprintf( stderr, "drop tolerances calculated\n" );
-      if( workspace->L == NULL ) {
-        fillin = Estimate_LU_Fill( workspace->H, workspace->droptol );
-
-        if( Allocate_Matrix( &(workspace->L), workspace->H->cap, fillin ) == 0 ||
-      Allocate_Matrix( &(workspace->U), workspace->H->cap, fillin ) == 0 ) {
-    fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
-    MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        }
-
-        workspace->L->n = workspace->H->n;
-        workspace->U->n = workspace->H->n;
-    #if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: n=%d, fillin = %d\n",
-           system->my_rank, workspace->L->n, fillin );
-        fprintf( stderr, "p%d: allocated memory: L = U = %ldMB\n",
-                 system->my_rank,fillin*sizeof(sparse_matrix_entry)/(1024*1024) );
-    #endif
-      }
-
-      ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
-    #if defined(DEBUG_FOCUS)
-      fprintf( stderr, "p%d: icholt finished\n", system->my_rank );
-      //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
-      //Print_Sparse_Matrix2( workspace->L, fname );
-      //Print_Sparse_Matrix( U );
-    #endif
-    }*/
+//    if( (data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0 ||
+//            workspace->L == NULL )
+//    {
+////        Print_Linear_System( system, control, workspace, data->step );
+//        Sort_Matrix_Rows( workspace->H );
+//        fprintf( stderr, "H matrix sorted\n" );
+//
+//        Calculate_Droptol( workspace->H, workspace->droptol, control->droptol );
+//        fprintf( stderr, "drop tolerances calculated\n" );
+//
+//        if( workspace->L == NULL )
+//        {
+//            fillin = Estimate_LU_Fill( workspace->H, workspace->droptol );
+//
+//            if( Allocate_Matrix( &(workspace->L), workspace->H->cap, fillin ) == 0 ||
+//                    Allocate_Matrix( &(workspace->U), workspace->H->cap, fillin ) == 0 )
+//            {
+//                fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
+//                MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
+//            }
+//
+//            workspace->L->n = workspace->H->n;
+//            workspace->U->n = workspace->H->n;
+//
+//#if defined(DEBUG_FOCUS)
+//            fprintf( stderr, "p%d: n=%d, fillin = %d\n",
+//                    system->my_rank, workspace->L->n, fillin );
+//            fprintf( stderr, "p%d: allocated memory: L = U = %ldMB\n",
+//                    system->my_rank,fillin*sizeof(sparse_matrix_entry)/(1024*1024) );
+//#endif
+//        }
+//
+//      ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
+//#if defined(DEBUG_FOCUS)
+//    fprintf( stderr, "p%d: icholt finished\n", system->my_rank );
+////    sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+////    Print_Sparse_Matrix2( workspace->L, fname );
+////    Print_Sparse_Matrix( U );
+//#endif
+//    }
 
     for ( i = 0; i < system->n; ++i )
     {
         atom = &( system->my_atoms[i] );
 
-        /* init pre-conditioner for H and init solution vectors */
+        /* initialize diagonal inverse preconditioner vectors */
         workspace->Hdia_inv[i] = 1. / system->reax_param.sbp[ atom->type ].eta;
-        workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
-        workspace->b_t[i] = -1.0;
-        workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
-        workspace->b[i][1] = -1.0;
 
         /* linear extrapolation for s and for t */
         // newQEq: no extrapolation!
@@ -330,7 +328,80 @@ void Init_MatVec( reax_system *system, simulation_data *data,
         workspace->x[i][0] = 4 * (atom->s[0] + atom->s[2]) - (6 * atom->s[1] + atom->s[3]);
         //workspace->x[i][1] = 4*(atom->t[0]+atom->t[2])-(6*atom->t[1]+atom->t[3]);
 
-        // fprintf(stderr, "i=%d s=%f t=%f\n", i, workspace->s[i], workspace->t[i]);
+//        fprintf(stderr, "i=%d s=%f t=%f\n", i, workspace->s[i], workspace->t[i]);
+    }
+
+    /* initialize solution vectors for linear solves in charge method */
+    switch ( control->charge_method )
+    {
+        case QEQ_CM:
+            /* every local atom gets entries in b_s, b_t and both columns of b */
+            for ( i = 0; i < system->n; ++i )
+            {
+                atom = &( system->my_atoms[i] );
+
+                workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
+                workspace->b_t[i] = -1.0;
+                workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
+                workspace->b[i][1] = -1.0;
+            }
+            break;
+
+        case EE_CM:
+            /* only b_s and the first column of b are filled for local atoms */
+            for ( i = 0; i < system->n; ++i )
+            {
+                atom = &( system->my_atoms[i] );
+
+                workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
+            }
+
+            /* rank 0 alone writes control->cm_q_net at index system->n */
+            if ( system->my_rank == 0 )
+            {
+                workspace->b_s[system->n] = control->cm_q_net;
+                workspace->b[system->n][0] = control->cm_q_net;
+            }
+            break;
+
+        case ACKS2_CM:
+            /* only b_s and the first column of b are filled for local atoms */
+            for ( i = 0; i < system->n; ++i )
+            {
+                atom = &( system->my_atoms[i] );
+
+                workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
+            }
+
+            /* rank 0 alone writes control->cm_q_net at index system->n */
+            if ( system->my_rank == 0 )
+            {
+                workspace->b_s[system->n] = control->cm_q_net;
+                workspace->b[system->n][0] = control->cm_q_net;
+            }
+
+            /* entries n+1 .. N_cm-1 are zeroed; no per-atom data is needed,
+             * so my_atoms is deliberately not indexed past the local atoms */
+            for ( i = system->n + 1; i < system->N_cm; ++i )
+            {
+                workspace->b_s[i] = 0.0;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i][0] = 0.0;
+            }
+            break;
+
+        default:
+            fprintf( stderr, "Unknown charge method type. Terminating...\n" );
+            /* abort through MPI; a bare exit() would skip MPI_Finalize */
+            MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+            break;
     }
 }
 
@@ -338,14 +409,14 @@ void Init_MatVec( reax_system *system, simulation_data *data,
 void Calculate_Charges( reax_system *system, storage *workspace,
         mpi_datatypes *mpi_data )
 {
-    int        i, scale;
-    real       u;//, s_sum, t_sum;
-    rvec2      my_sum, all_sum;
+    int i, scale;
+    real u;//, s_sum, t_sum;
+    rvec2 my_sum, all_sum;
     reax_atom *atom;
     real *q;
 
     scale = sizeof(real) / sizeof(void);
-    q = (real*) malloc(system->N * sizeof(real));
+    q = (real*) smalloc( system->N * sizeof(real), "Calculate_Charges::q" );
 
     //s_sum = Parallel_Vector_Acc(workspace->s, system->n, mpi_data->world);
     //t_sum = Parallel_Vector_Acc(workspace->t, system->n, mpi_data->world);
@@ -355,11 +426,19 @@ void Calculate_Charges( reax_system *system, storage *workspace,
         my_sum[0] += workspace->x[i][0];
         my_sum[1] += workspace->x[i][1];
     }
-    fprintf (stderr, "Host : my_sum[0]: %f and %f \n", my_sum[0], my_sum[1]);
+
+#if defined(DEBUG)
+    fprintf( stderr, "Host : my_sum[0]: %f and %f \n", my_sum[0], my_sum[1] );
+#endif
+
     MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );
 
     u = all_sum[0] / all_sum[1];
-    fprintf (stderr, "Host : u: %f \n", u);
+
+#if defined(DEBUG)
+    fprintf( stderr, "Host : u: %f \n", u );
+#endif
+
     for ( i = 0; i < system->n; ++i )
     {
         atom = &( system->my_atoms[i] );
@@ -388,51 +467,18 @@ void Calculate_Charges( reax_system *system, storage *workspace,
         system->my_atoms[i].q = q[i];
     }
 
-    free(q);
-}
-
-
-#ifdef HAVE_CUDA
-void Cuda_Calculate_Charges( reax_system *system, storage *workspace,
-                             mpi_datatypes *mpi_data )
-{
-    int        i, scale;
-    real       u;//, s_sum, t_sum;
-    rvec2      my_sum, all_sum;
-    reax_atom *atom;
-    real *q;
-    my_sum [0] = my_sum[1] = 0.0;
-
-    scale = sizeof(real) / sizeof(void);
-    q =  (real *) host_scratch;
-    memset( q, 0, system->N * sizeof (real));
-
-    cuda_charges_x( system, my_sum );
-    //fprintf (stderr, "Device: my_sum[0]: %f and %f \n", my_sum[0], my_sum[1]);
-
-    MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );
-
-    u = all_sum[0] / all_sum[1];
-    //fprintf (stderr, "Device: u: %f \n", u);
-
-    cuda_charges_st( system, workspace, q, u );
-
-    Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer );
-
-    cuda_charges_updateq( system, q );
+    sfree( q, "Calculate_Charges::q" );
 }
-#endif
 
 
 void QEq( reax_system *system, control_params *control, simulation_data *data,
-          storage *workspace, output_controls *out_control,
-          mpi_datatypes *mpi_data )
+        storage *workspace, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
     int s_matvecs, t_matvecs;
 
     Init_MatVec( system, data, control, workspace, mpi_data );
 
-
     //if( data->step == 50010 ) {
     //  Print_Linear_System( system, control, workspace, data->step );
     // }
@@ -443,15 +489,15 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
 #endif
 
     //MATRIX CHANGES
-    s_matvecs = dual_CG(system, workspace, &workspace->H, workspace->b,
-                        control->q_err, workspace->x, mpi_data, out_control->log, data);
+    s_matvecs = dual_CG( system, workspace, &workspace->H, workspace->b,
+            control->cm_solver_q_err, workspace->x, mpi_data, out_control->log, data );
     t_matvecs = 0;
     //fprintf (stderr, "Host: First CG complated with iterations: %d \n", s_matvecs);
 
     //s_matvecs = CG(system, workspace, workspace->H, workspace->b_s, //newQEq sCG
-    // control->q_err, workspace->s, mpi_data, out_control->log );
+    // control->cm_solver_q_err, workspace->s, mpi_data, out_control->log );
     //s_matvecs = PCG( system, workspace, workspace->H, workspace->b_s,
-    //   control->q_err, workspace->L, workspace->U, workspace->s,
+    //   control->cm_solver_q_err, workspace->L, workspace->U, workspace->s,
     //   mpi_data, out_control->log );
 
 #if defined(DEBUG)
@@ -459,67 +505,16 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
 #endif
 
     //t_matvecs = CG(system, workspace, workspace->H, workspace->b_t, //newQEq sCG
-    // control->q_err, workspace->t, mpi_data, out_control->log );
+    // control->cm_solver_q_err, workspace->t, mpi_data, out_control->log );
     //t_matvecs = PCG( system, workspace, workspace->H, workspace->b_t,
-    //   control->q_err, workspace->L, workspace->U, workspace->t,
+    //   control->cm_solver_q_err, workspace->L, workspace->U, workspace->t,
     //   mpi_data, out_control->log );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: second CG completed\n", system->my_rank );
 #endif
 
     Calculate_Charges( system, workspace, mpi_data );
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: computed charges\n", system->my_rank );
-    //Print_Charges( system );
-#endif
-
-#if defined(LOG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        data->timing.s_matvecs += s_matvecs;
-        data->timing.t_matvecs += t_matvecs;
-    }
-#endif
-}
-
-
-#ifdef HAVE_CUDA
-void Cuda_QEq( reax_system *system, control_params *control, simulation_data
-        *data, storage *workspace, output_controls *out_control, mpi_datatypes
-        *mpi_data )
-{
-    int s_matvecs, t_matvecs;
-
-    Cuda_Init_MatVec( system, workspace );
-
-    //if (data->step > 0) {
-    //    compare_rvec2 (workspace->b, dev_workspace->b, system->n, "b");
-    //    compare_rvec2 (workspace->x, dev_workspace->x, system->n, "x");
-    // compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
-    // compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
-    //}
-
-//#ifdef __CUDA_DEBUG__
-//  Init_MatVec( system, data, control, workspace, mpi_data );
-//#endif
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
-    //Print_Linear_System( system, control, workspace, data->step );
-#endif
-
-    //MATRIX CHANGES
-    s_matvecs = Cuda_dual_CG(system, workspace, &dev_workspace->H,
-            dev_workspace->b, control->q_err, dev_workspace->x, mpi_data,
-            out_control->log, data);
-    t_matvecs = 0;
-    //fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs);
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: first CG completed\n", system->my_rank );
-#endif
-
-    Cuda_Calculate_Charges( system, workspace, mpi_data );
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d: computed charges\n", system->my_rank );
@@ -534,4 +529,3 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data
     }
 #endif
 }
-#endif
diff --git a/PG-PuReMD/src/qEq.h b/PG-PuReMD/src/charges.h
similarity index 87%
rename from PG-PuReMD/src/qEq.h
rename to PG-PuReMD/src/charges.h
index 4559b59549540c614a7b0d1ac004fddc489b7c71..08af5641406e9cb7d57d260701dbf6df702e4e47 100644
--- a/PG-PuReMD/src/qEq.h
+++ b/PG-PuReMD/src/charges.h
@@ -19,15 +19,22 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __QEq_H_
-#define __QEq_H_
+#ifndef __CHARGES_H_
+#define __CHARGES_H_
 
 #include "reax_types.h"
 
+/* Declarations below get C linkage when this header is compiled as C++. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Charge solve entry point: sets up the linear system, runs the dual CG solver, then computes atomic charges. */
 void QEq( reax_system*, control_params*, simulation_data*,
         storage*, output_controls*, mpi_datatypes* );
 
-void Cuda_QEq( reax_system*, control_params*, simulation_data*,
-        storage*, output_controls*, mpi_datatypes* );
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/comm_tools.c b/PG-PuReMD/src/comm_tools.c
index c34d01f868065c016a93dda9c781af31eb16a7ec..9e8978dde4bbcf73bcd689322c2001b0d24f10d5 100644
--- a/PG-PuReMD/src/comm_tools.c
+++ b/PG-PuReMD/src/comm_tools.c
@@ -19,25 +19,46 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "comm_tools.h"
+
 #include "grid.h"
 #include "reset_tools.h"
 #include "tool_box.h"
 #include "vector.h"
 
 
+/* Does nothing when code is MPI_SUCCESS; otherwise prints the MPI error text
+ * and msg (the caller's tag for the failed call) to stderr and aborts the job. */
+void Check_MPI_Error( int code, const char * msg )
+{
+    char text[MPI_MAX_ERROR_STRING];
+    int text_len;
+    if ( code == MPI_SUCCESS )
+    {
+        return;
+    }
+    MPI_Error_string( code, text, &text_len );
+    fprintf( stderr, "[ERROR] MPI error code %d, from %s\n", code, msg );
+    fprintf( stderr, "    [INFO] MPI error message: %s\n", text );
+    MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR );
+}
+
+
 void Setup_Comm( reax_system* system, control_params* control,
-                 mpi_datatypes *mpi_data )
+        mpi_datatypes *mpi_data )
 {
     int i, d;
     real bndry_cut;
     neighbor_proc *nbr_pr;
     simulation_box *my_box;
     ivec nbr_coords;
-    ivec r[6] = {{ -1, 0, 0}, { +1, 0, 0}, // -x, +x
-        {0, -1, 0}, {0, +1, 0}, // -y, +y
-        {0, 0, -1}, {0, 0, +1}
-    };// -z, +z
+    ivec r[6] = {
+        { -1, 0, 0}, { 1, 0, 0}, // -x, +x
+        {0, -1, 0}, {0, 1, 0}, // -y, +y
+        {0, 0, -1}, {0, 0, 1}, // -z, +z
+    };
     my_box = &(system->my_box);
     bndry_cut = system->bndry_cuts.ghost_cutoff;
 
@@ -70,11 +91,17 @@ void Setup_Comm( reax_system* system, control_params* control,
 
             /* determine if it is a periodic neighbor */
             if ( nbr_coords[d] < 0 )
+            {
                 nbr_pr->prdc[d] = -1;
+            }
             else if ( nbr_coords[d] >= control->procs_by_dim[d] )
+            {
                 nbr_pr->prdc[d] = +1;
+            }
             else
+            {
                 nbr_pr->prdc[d] = 0;
+            }
         }
 
 #if defined(DEBUG_FOCUS)
@@ -92,10 +119,11 @@ void Update_Comm( reax_system* system )
     real bndry_cut;
     neighbor_proc *nbr_pr;
     simulation_box *my_box;
-    ivec r[6] = {{ -1, 0, 0}, { +1, 0, 0}, // -x, +x
-        {0, -1, 0}, {0, +1, 0}, // -y, +y
-        {0, 0, -1}, {0, 0, +1}
-    };// -z, +z
+    ivec r[6] = {
+        { -1, 0, 0}, { 1, 0, 0}, // -x, +x
+        {0, -1, 0}, {0, 1, 0}, // -y, +y
+        {0, 0, -1}, {0, 0, 1}, // -z, +z
+    };
     my_box = &(system->my_box);
     bndry_cut = system->bndry_cuts.ghost_cutoff;
 
@@ -105,6 +133,7 @@ void Update_Comm( reax_system* system )
         nbr_pr = &(system->my_nbrs[i]);
 
         for ( d = 0; d < 3; ++d )
+        {
             /* determine the boundary area with this nbr */
             if ( r[i][d] < 0 )
             {
@@ -120,6 +149,7 @@ void Update_Comm( reax_system* system )
             {
                 nbr_pr->bndry_min[d] = nbr_pr->bndry_max[d] = NEG_INF;
             }
+        }
     }
 }
 
@@ -129,12 +159,12 @@ void Update_Comm( reax_system* system )
 /***************** PACK & UNPACK ATOMS *********************/
 void Pack_MPI_Atom( mpi_atom *matm, reax_atom *ratm, int i )
 {
-    matm->orig_id  = ratm->orig_id;
+    matm->orig_id = ratm->orig_id;
     matm->imprt_id = i;
-    matm->type     = ratm->type;
+    matm->type = ratm->type;
     matm->num_bonds = ratm->num_bonds;
     matm->num_hbonds = ratm->num_hbonds;
-    strcpy( matm->name, ratm->name );
+    strncpy( matm->name, ratm->name, MAX_ATOM_NAME_LEN );
     rvec_Copy( matm->x, ratm->x );
     rvec_Copy( matm->v, ratm->v );
     rvec_Copy( matm->f_old, ratm->f_old );
@@ -145,12 +175,12 @@ void Pack_MPI_Atom( mpi_atom *matm, reax_atom *ratm, int i )
 
 void Unpack_MPI_Atom( reax_atom *ratm, mpi_atom *matm )
 {
-    ratm->orig_id  = matm->orig_id;
+    ratm->orig_id = matm->orig_id;
     ratm->imprt_id = matm->imprt_id;
-    ratm->type     = matm->type;
+    ratm->type = matm->type;
     ratm->num_bonds = matm->num_bonds;
     ratm->num_hbonds = matm->num_hbonds;
-    strcpy( ratm->name, matm->name );
+    strncpy( ratm->name, matm->name, MAX_ATOM_NAME_LEN );
     rvec_Copy( ratm->x, matm->x );
     rvec_Copy( ratm->v, matm->v );
     rvec_Copy( ratm->f_old, matm->f_old );
@@ -161,7 +191,7 @@ void Unpack_MPI_Atom( reax_atom *ratm, mpi_atom *matm )
 
 /*********************** SORTER **************************/
 void Sort_Transfer_Atoms( reax_system *system, int start, int end,
-                          int dim, mpi_out_data *out_bufs )
+        int dim, mpi_out_data *out_bufs )
 {
     int i, d, out_cnt;
     reax_atom *atoms;
@@ -170,8 +200,9 @@ void Sort_Transfer_Atoms( reax_system *system, int start, int end,
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d sort_transfers: start=%d end=%d dim=%d starting...\n",
-             system->my_rank, start, end, dim );
+            system->my_rank, start, end, dim );
 #endif
+
     atoms = system->my_atoms;
     my_box = &( system->my_box );
 
@@ -179,6 +210,7 @@ void Sort_Transfer_Atoms( reax_system *system, int start, int end,
     for ( i = start; i < end; ++i )
     {
         for ( d = dim; d < 3; ++d )
+        {
             if ( atoms[i].x[d] < my_box->min[d] )
             {
                 out_cnt = out_bufs[2 * d].cnt++;
@@ -195,29 +227,32 @@ void Sort_Transfer_Atoms( reax_system *system, int start, int end,
                 atoms[i].orig_id = -1;
                 break;
             }
+        }
     }
 
 #if defined(DEBUG_FOCUS)
     for ( d = 2 * dim; d < 2 * dim + 2; ++d )
+    {
         if ( out_bufs[d].cnt )
         {
             fprintf( stderr, "p%d to p%d(nbr%d) # of transfers = %d\n",
-                     system->my_rank, system->my_nbrs[d].rank, d, out_bufs[d].cnt );
+                    system->my_rank, system->my_nbrs[d].rank, d, out_bufs[d].cnt );
             out_buf = out_bufs[d].out_atoms;
             for ( i = 0; i < out_bufs[d].cnt; ++i )
                 fprintf( stderr, "p%d to p%d: transfer atom%d [%.3f %.3f %.3f]\n",
-                         system->my_rank, system->my_nbrs[d].rank, out_buf[i].imprt_id,
-                         out_buf[i].x[0],  out_buf[i].x[1],  out_buf[i].x[2] );
+                        system->my_rank, system->my_nbrs[d].rank, out_buf[i].imprt_id,
+                        out_buf[i].x[0],  out_buf[i].x[1],  out_buf[i].x[2] );
         }
     //fprintf( stderr, "p%d sort_transfers: start=%d end=%d dim=%d done!\n",
     //   system->my_rank, start, end, dim );
+    }
 #endif
 }
 
 
 /*********************** UNPACKER **************************/
 void Unpack_Transfer_Message( reax_system *system, int end, void *dummy,
-                              int cnt, neighbor_proc *nbr, int dim )
+        int cnt, neighbor_proc *nbr, int dim )
 {
     int i;
     real dx;
@@ -226,14 +261,18 @@ void Unpack_Transfer_Message( reax_system *system, int end, void *dummy,
 
     dest = system->my_atoms + end;
     for ( i = 0; i < cnt; ++i )
+    {
         Unpack_MPI_Atom( dest + i, src + i );
+    }
 
     /* adjust coordinates of recved atoms if nbr is a periodic one */
     if ( nbr->prdc[dim] )
     {
         dx = nbr->prdc[dim] * system->big_box.box_norms[dim];
         for ( i = 0; i < cnt; ++i )
+        {
             dest[i].x[dim] += dx;
+        }
     }
 }
 
@@ -243,9 +282,9 @@ void Unpack_Transfer_Message( reax_system *system, int end, void *dummy,
 /************ PACK & UNPACK BOUNDARY ATOMS **************/
 void Pack_Boundary_Atom( boundary_atom *matm, reax_atom *ratm, int i )
 {
-    matm->orig_id  = ratm->orig_id;
+    matm->orig_id = ratm->orig_id;
     matm->imprt_id = i;
-    matm->type     = ratm->type;
+    matm->type = ratm->type;
     matm->num_bonds = ratm->num_bonds;
     matm->num_hbonds = ratm->num_hbonds;
     rvec_Copy( matm->x, ratm->x );
@@ -266,24 +305,25 @@ void Unpack_Boundary_Atom( reax_atom *ratm, boundary_atom *matm )
 
 /*********************** SORTER **************************/
 void Sort_Boundary_Atoms( reax_system *system, int start, int end,
-                          int dim, mpi_out_data *out_bufs )
+        int dim, mpi_out_data *out_bufs )
 {
     int i, d, p, out_cnt;
     reax_atom *atoms;
-    simulation_box *my_box;
     boundary_atom *out_buf;
     neighbor_proc *nbr_pr;
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d sort_exchange: start=%d end=%d dim=%d starting...\n",
-             system->my_rank, start, end, dim );
+            system->my_rank, start, end, dim );
 #endif
+
     atoms = system->my_atoms;
-    my_box = &( system->my_box );
 
     /* place each atom into the appropriate outgoing list */
     for ( i = start; i < end; ++i )
+    {
         for ( d = dim; d < 3; ++d )
+        {
             for ( p = 2 * d; p < 2 * d + 2; ++p )
             {
                 nbr_pr = &( system->my_nbrs[p] );
@@ -296,12 +336,16 @@ void Sort_Boundary_Atoms( reax_system *system, int start, int end,
                     Pack_Boundary_Atom( out_buf + out_cnt, atoms + i, i );
                 }
             }
+        }
+    }
 
 #if defined(DEBUG_FOCUS)
     for ( i = 2 * dim; i < 2 * dim + 2; ++i )
+    {
         fprintf( stderr, "p%d to p%d(nbr%d) # of exchanges to send = %d\n",
                  system->my_rank, system->my_nbrs[i].rank, i,
                  out_bufs[i].cnt );
+    }
     fprintf( stderr, "p%d sort_exchange: start=%d end=%d dim=%d done!\n",
              system->my_rank, start, end, dim );
 #endif
@@ -309,20 +353,19 @@ void Sort_Boundary_Atoms( reax_system *system, int start, int end,
 
 
 void Estimate_Boundary_Atoms( reax_system *system, int start, int end,
-                              int d, mpi_out_data *out_bufs )
+        int d, mpi_out_data *out_bufs )
 {
     int i, p, out_cnt;
     reax_atom *atoms;
-    simulation_box *my_box;
     boundary_atom *out_buf;
     neighbor_proc *nbr1, *nbr2, *nbr_pr;
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d estimate_exchange: start=%d end=%d dim=%d starting.\n",
-             system->my_rank, start, end, d );
+            system->my_rank, start, end, d );
 #endif
+
     atoms = system->my_atoms;
-    my_box = &( system->my_box );
     nbr1 = &(system->my_nbrs[2 * d]);
     nbr2 = &(system->my_nbrs[2 * d + 1]);
     nbr1->est_send = 0;
@@ -332,17 +375,22 @@ void Estimate_Boundary_Atoms( reax_system *system, int start, int end,
     for ( i = 0; i < end; ++i )
     {
         if ( nbr1->bndry_min[d] <= atoms[i].x[d] && atoms[i].x[d] < nbr1->bndry_max[d] )
+        {
             nbr1->est_send++;
+        }
         if ( nbr2->bndry_min[d] <= atoms[i].x[d] && atoms[i].x[d] < nbr2->bndry_max[d] )
+        {
             nbr2->est_send++;
+        }
     }
 
     /* estimate the space based on the count above */
     nbr1->est_send = MAX( MIN_SEND, nbr1->est_send * SAFER_ZONE );
     nbr2->est_send = MAX( MIN_SEND, nbr2->est_send * SAFER_ZONE );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d estimate_exchange: end=%d dim=%d est1=%d est2=%d\n",
-             system->my_rank, end, d, nbr1->est_send, nbr2->est_send );
+            system->my_rank, end, d, nbr1->est_send, nbr2->est_send );
 #endif
 
     /* allocate the estimated space */
@@ -350,51 +398,77 @@ void Estimate_Boundary_Atoms( reax_system *system, int start, int end,
     {
         nbr_pr = &( system->my_nbrs[p] );
         out_bufs[p].index = (int*)
-                            scalloc( nbr_pr->est_send, sizeof(int), "mpibuf:index" );
+            scalloc( nbr_pr->est_send, sizeof(int), "mpibuf:index" );
         out_bufs[p].out_atoms = (void*)
-                                scalloc( nbr_pr->est_send, sizeof(boundary_atom), "mpibuf:out_atoms" );
+            scalloc( nbr_pr->est_send, sizeof(boundary_atom), "mpibuf:out_atoms" );
     }
 
     /* sort the atoms to their outgoing buffers */
     for ( i = 0; i < end; ++i )
+    {
+        /* check if atom is outbound to another processor
+         * in either direction of the dimension under consideration */
         for ( p = 2 * d; p < 2 * d + 2; ++p )
         {
             nbr_pr = &( system->my_nbrs[p] );
             if ( nbr_pr->bndry_min[d] <= atoms[i].x[d] &&
                     atoms[i].x[d] < nbr_pr->bndry_max[d] )
             {
-                out_cnt = out_bufs[p].cnt++;
+                out_cnt = out_bufs[p].cnt;
                 out_bufs[p].index[out_cnt] = i;
                 out_buf = (boundary_atom *)out_bufs[p].out_atoms;
                 Pack_Boundary_Atom( out_buf + out_cnt, atoms + i, i );
+                ++out_bufs[p].cnt;
             }
         }
+    }
 
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d estimate_exchange: end=%d dim=%d done!\n",
-             system->my_rank, end, d );
+    for ( p = 2 * d; p < 2 * d + 2; ++p )
+    {
+        for ( i = 0; i < out_bufs[p].cnt; ++i )
+        {
+            fprintf( stderr, "p%d: out_bufs[%d].index[%d] = %d\n",
+                    system->my_rank, p, i, out_bufs[p].index[i] );
+            fprintf( stderr, "  p%d: atom %6d, x[0] = %10.4f, x[1] = %10.4f, x[2] = %10.4f\n",
+                    system->my_rank,
+                    ((boundary_atom *)(out_bufs[p].out_atoms))[i].orig_id,
+                    ((boundary_atom *)(out_bufs[p].out_atoms))[i].x[0],
+                    ((boundary_atom *)(out_bufs[p].out_atoms))[i].x[1],
+                    ((boundary_atom *)(out_bufs[p].out_atoms))[i].x[2] );
+        }
+    }
+
+fprintf( stderr, "p%d estimate_exchange: end=%d dim=%d done!\n",
+         system->my_rank, end, d );
 #endif
 }
 
 
 void Estimate_Init_Storage( int me, neighbor_proc *nbr1, neighbor_proc *nbr2,
-                            int d, int *max, int *nrecv,
-                            void **in1, void **in2, MPI_Comm comm )
+        int d, int *max, int *nrecv, void **in1, void **in2, MPI_Comm comm )
 {
     MPI_Request req1, req2;
     MPI_Status stat1, stat2;
-    int new_max;
+    int new_max, ret;
 
     /* first exchange the estimates, then allocate buffers */
-    MPI_Irecv( &nbr1->est_recv, 1, MPI_INT, nbr1->rank, 2 * d + 1, comm, &req1 );
-    MPI_Irecv( &nbr2->est_recv, 1, MPI_INT, nbr2->rank, 2 * d, comm, &req2 );
-    MPI_Send( &nbr1->est_send, 1, MPI_INT, nbr1->rank, 2 * d, comm );
-    MPI_Send( &nbr2->est_send, 1, MPI_INT, nbr2->rank, 2 * d + 1, comm );
-    MPI_Wait( &req1, &stat1 );
-    MPI_Wait( &req2, &stat2 );
+    ret = MPI_Irecv( &nbr1->est_recv, 1, MPI_INT, nbr1->rank, 2 * d + 1, comm, &req1 );
+    Check_MPI_Error( ret, "Estimate_Init_Storage::MPI_Irecv::nbr1" );
+    ret = MPI_Irecv( &nbr2->est_recv, 1, MPI_INT, nbr2->rank, 2 * d, comm, &req2 );
+    Check_MPI_Error( ret, "Estimate_Init_Storage::MPI_Irecv::nbr2" );
+    ret = MPI_Send( &nbr1->est_send, 1, MPI_INT, nbr1->rank, 2 * d, comm );
+    Check_MPI_Error( ret, "Estimate_Init_Storage::MPI_Send::nbr1" );
+    ret = MPI_Send( &nbr2->est_send, 1, MPI_INT, nbr2->rank, 2 * d + 1, comm );
+    Check_MPI_Error( ret, "Estimate_Init_Storage::MPI_Send::nbr2" );
+    ret = MPI_Wait( &req1, &stat1 );
+    Check_MPI_Error( ret, "Estimate_Init_Storage::MPI_Wait::nbr1" );
+    ret = MPI_Wait( &req2, &stat2 );
+    Check_MPI_Error( ret, "Estimate_Init_Storage::MPI_Wait::nbr2" );
     nrecv[2 * d] = nbr1->est_recv;
     nrecv[2 * d + 1] = nbr2->est_recv;
     new_max = MAX( nbr1->est_recv, nbr2->est_recv );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d-p%d(nbr%d) est_send=%d est_recv=%d\n",
              me, nbr1->rank, 2 * d, nbr1->est_send, nbr1->est_recv );
@@ -406,8 +480,14 @@ void Estimate_Init_Storage( int me, neighbor_proc *nbr1, neighbor_proc *nbr2,
     if ( new_max > *max )
     {
         *max = new_max;
-        if (*in1) sfree( *in1, "in1" );
-        if (*in2) sfree( *in2, "in2" );
+        if ( *in1 != NULL )
+        {
+            sfree( *in1, "in1" );
+        }
+        if ( *in2 != NULL )
+        {
+            sfree( *in2, "in2" );
+        }
         *in1 = (void *) smalloc( new_max * sizeof(boundary_atom), "in1" );
         *in2 = (void *) smalloc( new_max * sizeof(boundary_atom), "in2" );
     }
@@ -416,7 +496,7 @@ void Estimate_Init_Storage( int me, neighbor_proc *nbr1, neighbor_proc *nbr2,
 
 /*********************** UNPACKER **************************/
 void Unpack_Exchange_Message( reax_system *system, int end, void *dummy,
-                              int cnt, neighbor_proc *nbr, int dim )
+        int cnt, neighbor_proc *nbr, int dim )
 {
     int i;
     real dx;
@@ -425,30 +505,52 @@ void Unpack_Exchange_Message( reax_system *system, int end, void *dummy,
 
     dest = system->my_atoms + end;
     for ( i = 0; i < cnt; ++i )
+    {
         Unpack_Boundary_Atom( dest + i, src + i );
+    }
+
+#if defined(DEBUG_FOCUS)
+    for ( i = end; i < end + cnt; ++i )
+    {
+        fprintf( stderr, "UNPACK p%d: d = %d, atom %d, x[0] = %10.4f, x[1] = %10.4f, x[2] = %10.4f\n",
+              system->my_rank, dim, i,
+              system->my_atoms[i].x[0],
+              system->my_atoms[i].x[1],
+              system->my_atoms[i].x[2] );
+    }
+#endif
 
     /* record the atoms recv'd from this nbr */
     nbr->atoms_str = end;
     nbr->atoms_cnt = cnt;
     /* update est_recv */
-    nbr->est_recv = MAX( cnt * SAFER_ZONE, MIN_SEND );
+    nbr->est_recv = MAX( (int)(cnt * SAFER_ZONE), MIN_SEND );
 
     /* update max_recv to make sure that we reallocate at the right time */
     if ( cnt > system->max_recved )
+    {
         system->max_recved = cnt;
+    }
 
     /* adjust coordinates of recved atoms if nbr is a periodic one */
     if ( nbr->prdc[dim] )
     {
         dx = nbr->prdc[dim] * system->big_box.box_norms[dim];
+#if defined(DEBUG_FOCUS)
+            fprintf( stderr, "UNPACK p%d: dim = %d, dx = %f\n",
+                    system->my_rank, dim, dx );
+#endif
+
         for ( i = 0; i < cnt; ++i )
+        {
             dest[i].x[dim] += dx;
+        }
     }
 }
 
 
 void Unpack_Estimate_Message( reax_system *system, int end, void *dummy,
-                              int cnt, neighbor_proc *nbr, int dim )
+        int cnt, neighbor_proc *nbr, int dim )
 {
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d-p%d unpack_estimate: end=%d cnt=%d - unpacking\n",
@@ -456,7 +558,7 @@ void Unpack_Estimate_Message( reax_system *system, int end, void *dummy,
 #endif
 
     system->my_atoms = (reax_atom*)
-                       realloc( system->my_atoms, (end + cnt) * sizeof(reax_atom) );
+            srealloc( system->my_atoms, (end + cnt) * sizeof(reax_atom), "system:my_atoms" );
 
     Unpack_Exchange_Message( system, end, dummy, cnt, nbr, dim );
 
@@ -471,7 +573,7 @@ void Unpack_Estimate_Message( reax_system *system, int end, void *dummy,
 
 /**************** PACK POSITION UPDATES *******************/
 void Sort_Position_Updates( reax_system *system, int start, int end,
-                            int dim, mpi_out_data *out_bufs )
+        int dim, mpi_out_data *out_bufs )
 {
     int i, p;
     reax_atom *atoms;
@@ -483,13 +585,15 @@ void Sort_Position_Updates( reax_system *system, int start, int end,
     {
         out = (rvec*) out_bufs[p].out_atoms;
         for ( i = 0; i < out_bufs[p].cnt; ++i )
+        {
             memcpy( out[i], atoms[ out_bufs[p].index[i] ].x, sizeof(rvec) );
+        }
     }
 }
 
 /*************** UNPACK POSITION UPDATES ******************/
 void Unpack_Position_Updates( reax_system *system, int end, void *dummy,
-                              int cnt, neighbor_proc *nbr, int dim )
+        int cnt, neighbor_proc *nbr, int dim )
 {
     int i, start;
     reax_atom *atoms;
@@ -500,33 +604,44 @@ void Unpack_Position_Updates( reax_system *system, int end, void *dummy,
     start = nbr->atoms_str;
 
     for ( i = 0; i < cnt; ++i )
+    {
         memcpy( atoms[start + i].x, src[i], sizeof(rvec) );
+    }
 
     /* adjust coordinates of recved atoms if nbr is a periodic one */
     if ( nbr->prdc[dim] )
     {
         dx = nbr->prdc[dim] * system->big_box.box_norms[dim];
         for ( i = 0; i < cnt; ++i )
+        {
             atoms[start + i].x[dim] += dx;
+        }
     }
 }
 
 
 int SendRecv( reax_system* system, mpi_datatypes *mpi_data, MPI_Datatype type,
-              int* nrecv, message_sorter sort_func, unpacker unpack, int clr )
+        int* nrecv, message_sorter sort_func, unpacker unpack, int clr )
 {
-    int d, cnt, start, end, max, est_flag;
+    int d, cnt, start, end, max, est_flag, ret;
     mpi_out_data *out_bufs;
     void *in1, *in2;
     MPI_Comm comm;
     MPI_Request req1, req2;
     MPI_Status stat1, stat2;
     neighbor_proc *nbr1, *nbr2;
+#if defined(DEBUG_FOCUS)
+    int i, p;
+#endif
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d sendrecv: entered\n", system->my_rank );
 #endif
-    if ( clr ) Reset_Out_Buffers( mpi_data->out_buffers, system->num_nbrs );
+
+    if ( clr == TRUE )
+    {
+        Reset_Out_Buffers( mpi_data->out_buffers, system->num_nbrs );
+    }
     comm = mpi_data->comm_mesh3D;
     in1 = mpi_data->in1_buffer;
     in2 = mpi_data->in2_buffer;
@@ -534,7 +649,8 @@ int SendRecv( reax_system* system, mpi_datatypes *mpi_data, MPI_Datatype type,
     start = 0;
     end = system->n;
     max = 0;
-    est_flag = (mpi_data->in1_buffer == NULL) || (mpi_data->in2_buffer == NULL);
+    est_flag = (mpi_data->in1_buffer == NULL) || (mpi_data->in2_buffer == NULL) ?
+        TRUE : FALSE;
 
     for ( d = 0; d < 3; ++d )
     {
@@ -544,34 +660,69 @@ int SendRecv( reax_system* system, mpi_datatypes *mpi_data, MPI_Datatype type,
         nbr2 = &(system->my_nbrs[2 * d + 1]);
 
         /* for estimates in1_buffer & in2_buffer will be NULL */
-        if ( est_flag )
+        if ( est_flag == TRUE )
+        {
             Estimate_Init_Storage( system->my_rank, nbr1, nbr2, d,
-                                   &max, nrecv, &in1, &in2, comm );
+                    &max, nrecv, &in1, &in2, comm );
+        }
+
+#if defined(DEBUG_FOCUS)
+        for ( p = 2 * d; p < 2 * d + 2; ++p )
+        {
+            for ( i = 0; i < out_bufs[p].cnt; ++i )
+            {
+                fprintf( stderr, "p%d: out_bufs[%d].index[%d] = %d\n",
+                        system->my_rank, p, i, out_bufs[p].index[i] );
+                fprintf( stderr, "  p%d: atom %6d, x[0] = %10.4f, x[1] = %10.4f, x[2] = %10.4f\n",
+                        system->my_rank,
+                        ((boundary_atom *)(out_bufs[p].out_atoms))[i].orig_id,
+                        ((boundary_atom *)(out_bufs[p].out_atoms))[i].x[0],
+                        ((boundary_atom *)(out_bufs[p].out_atoms))[i].x[1],
+                        ((boundary_atom *)(out_bufs[p].out_atoms))[i].x[2] );
+            }
+        }
+#endif
 
         /* initiate recvs */
-        MPI_Irecv( in1, nrecv[2 * d], type, nbr1->rank, 2 * d + 1, comm, &req1 );
-        MPI_Irecv( in2, nrecv[2 * d + 1], type, nbr2->rank, 2 * d, comm, &req2 );
+        ret = MPI_Irecv( in1, nrecv[2 * d], type, nbr1->rank, 2 * d + 1, comm, &req1 );
+        Check_MPI_Error( ret, "SendRecv::MPI_Irecv::nbr1" );
+        ret = MPI_Irecv( in2, nrecv[2 * d + 1], type, nbr2->rank, 2 * d, comm, &req2 );
+        Check_MPI_Error( ret, "SendRecv::MPI_Irecv::nbr2" );
 
         /* send both messages in dimension d */
-        MPI_Send( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt, type,
-                  nbr1->rank, 2 * d, comm );
-        MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt, type,
-                  nbr2->rank, 2 * d + 1, comm );
+        ret = MPI_Send( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt, type,
+                nbr1->rank, 2 * d, comm );
+        Check_MPI_Error( ret, "SendRecv::MPI_Send::nbr1" );
+        ret = MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt, type,
+                nbr2->rank, 2 * d + 1, comm );
+        Check_MPI_Error( ret, "SendRecv::MPI_Send::nbr2" );
 
         /* recv and unpack atoms from nbr1 in dimension d */
-        MPI_Wait( &req1, &stat1 );
-        MPI_Get_count( &stat1, type, &cnt );
+        ret = MPI_Wait( &req1, &stat1 );
+        Check_MPI_Error( ret, "SendRecv::MPI_Wait::nbr1" );
+        ret = MPI_Get_count( &stat1, type, &cnt );
+        Check_MPI_Error( ret, "SendRecv::MPI_Count::nbr1" );
         unpack( system, end, in1, cnt, nbr1, d );
         end += cnt;
 
+#if defined(DEBUG)
+        fprintf( stderr, "p%d: nbr1: d = %d, end = %d\n", system->my_rank, d, end );
+#endif
+
         /* recv and unpack atoms from nbr2 in dimension d */
-        MPI_Wait( &req2, &stat2 );
-        MPI_Get_count( &stat2, type, &cnt );
+        ret = MPI_Wait( &req2, &stat2 );
+        Check_MPI_Error( ret, "SendRecv::MPI_Wait::nbr2" );
+        ret = MPI_Get_count( &stat2, type, &cnt );
+        Check_MPI_Error( ret, "SendRecv::MPI_Count::nbr2" );
         unpack( system, end, in2, cnt, nbr2, d );
         end += cnt;
+
+#if defined(DEBUG)
+        fprintf( stderr, "p%d: nbr2: d = %d, end = %d\n", system->my_rank, d, end );
+#endif
     }
 
-    if ( est_flag )
+    if ( est_flag == TRUE )
     {
         system->est_recv = max;
         system->est_trans = (max * sizeof(boundary_atom)) / sizeof(mpi_atom);
@@ -584,51 +735,71 @@ int SendRecv( reax_system* system, mpi_datatypes *mpi_data, MPI_Datatype type,
 
 
 void Comm_Atoms( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace, reax_list **lists,
-                 mpi_datatypes *mpi_data, int renbr )
+        simulation_data *data, storage *workspace, reax_list **lists,
+        mpi_datatypes *mpi_data, int renbr )
 {
     int i;
     int nrecv[MAX_NBRS];
+
 #if defined(LOG_PERFORMANCE)
     real t_start = 0, t_elapsed = 0;
 
     if ( system->my_rank == MASTER_NODE )
+    {
         t_start = Get_Time( );
+    }
 #endif
 
-    if ( renbr )
+    if ( renbr == TRUE )
     {
-        for ( i = 0; i < MAX_NBRS; ++i ) nrecv[i] = system->est_trans;
+        /* transfer ownership of atoms */
+        for ( i = 0; i < MAX_NBRS; ++i )
+        {
+            nrecv[i] = system->est_trans;
+        }
         system->n = SendRecv( system, mpi_data, mpi_data->mpi_atom_type, nrecv,
-                              Sort_Transfer_Atoms, Unpack_Transfer_Message, 1 );
+                Sort_Transfer_Atoms, Unpack_Transfer_Message, TRUE );
         Bin_My_Atoms( system, &(workspace->realloc) );
         Reorder_My_Atoms( system, workspace );
+
 #if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d updated local atoms, n=%d\n",
-                 system->my_rank, system->n );
+        fprintf( stderr, "p%d, step %d: updated local atoms, n=%d\n",
+                 system->my_rank, data->step, system->n );
         MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
-        for ( i = 0; i < MAX_NBRS; ++i ) nrecv[i] = system->my_nbrs[i].est_recv;
-        system->N = SendRecv(system, mpi_data, mpi_data->boundary_atom_type, nrecv,
-                             Sort_Boundary_Atoms, Unpack_Exchange_Message, 1);
+        /* exchange ghost region info with neighbors */
+        for ( i = 0; i < MAX_NBRS; ++i )
+        {
+            nrecv[i] = system->my_nbrs[i].est_recv;
+        }
+        system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv,
+                Sort_Boundary_Atoms, Unpack_Exchange_Message, TRUE );
+
 #if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: exchanged boundary atoms, N=%d\n",
-                 system->my_rank, system->N );
+        fprintf( stderr, "p%d, step %d: exchanged boundary atoms, N=%d\n",
+                 system->my_rank, data->step, system->N );
         for ( i = 0; i < MAX_NBRS; ++i )
+        {
             fprintf( stderr, "p%d: nbr%d(p%d) str=%d cnt=%d end=%d\n",
-                     system->my_rank, i, system->my_nbrs[i].rank,
-                     system->my_nbrs[i].atoms_str,  system->my_nbrs[i].atoms_cnt,
-                     system->my_nbrs[i].atoms_str + system->my_nbrs[i].atoms_cnt );
+                    system->my_rank, i, system->my_nbrs[i].rank,
+                    system->my_nbrs[i].atoms_str,  system->my_nbrs[i].atoms_cnt,
+                    system->my_nbrs[i].atoms_str + system->my_nbrs[i].atoms_cnt );
+        }
         MPI_Barrier( MPI_COMM_WORLD );
 #endif
+
         Bin_Boundary_Atoms( system );
     }
     else
     {
-        for ( i = 0; i < MAX_NBRS; ++i ) nrecv[i] = system->my_nbrs[i].atoms_cnt;
+        for ( i = 0; i < MAX_NBRS; ++i )
+        {
+            nrecv[i] = system->my_nbrs[i].atoms_cnt;
+        }
         SendRecv( system, mpi_data, mpi_data->mpi_rvec, nrecv,
-                  Sort_Position_Updates, Unpack_Position_Updates, 0 );
+                Sort_Position_Updates, Unpack_Position_Updates, FALSE );
+
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: updated positions\n", system->my_rank );
         MPI_Barrier( MPI_COMM_WORLD );
@@ -642,8 +813,10 @@ void Comm_Atoms( reax_system *system, control_params *control,
         data->timing.comm += t_elapsed;
     }
 #endif
+
 #if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d @ renbr=%d: comm_atoms done\n", system->my_rank, renbr);
+    fprintf( stderr, "p%d @ renbr=%d: comm_atoms done\n",
+            system->my_rank, renbr );
     fprintf( stderr, "p%d: system->n = %d, system->N = %d\n",
              system->my_rank, system->n, system->N );
     //Print_My_Ext_Atoms( system );
diff --git a/PG-PuReMD/src/comm_tools.h b/PG-PuReMD/src/comm_tools.h
index 48b676ebbe67cbd8fb17af717fd7da5eac96ffc8..a0e8d7e5428f193c96ecf15debc544baf48f20ef 100644
--- a/PG-PuReMD/src/comm_tools.h
+++ b/PG-PuReMD/src/comm_tools.h
@@ -24,20 +24,36 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Check_MPI_Error( int, const char * );
+
 void Setup_Comm( reax_system*, control_params*, mpi_datatypes* );
+
 void Update_Comm( reax_system* );
 
 void Sort_Boundary_Atoms( reax_system*, int, int, int, mpi_out_data* );
+
 void Estimate_Boundary_Atoms( reax_system*, int, int, int, mpi_out_data* );
+
 void Unpack_Exchange_Message( reax_system*, int, void*, int,
-                              neighbor_proc*, int );
+        neighbor_proc*, int );
+
 void Unpack_Estimate_Message( reax_system*, int, void*, int,
-                              neighbor_proc*, int );
+        neighbor_proc*, int );
 
 int SendRecv( reax_system*, mpi_datatypes*_data, MPI_Datatype, int*,
-              message_sorter, unpacker, int );
+        message_sorter, unpacker, int );
 
 void Comm_Atoms( reax_system*, control_params*, simulation_data*, storage*,
-                 reax_list**, mpi_datatypes*, int );
+        reax_list**, mpi_datatypes*, int );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/control.c b/PG-PuReMD/src/control.c
index a35e4b8733bfd2c355568a3607eb30f84b8c2d7c..5cb8ce6f1099fde86343d81d8c7b147d41bc4864 100644
--- a/PG-PuReMD/src/control.c
+++ b/PG-PuReMD/src/control.c
@@ -20,32 +20,34 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
+
 #if defined(PURE_REAX)
-#include "control.h"
-#include "tool_box.h"
+  #include "control.h"
+  #include "tool_box.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_control.h"
-#include "reax_tool_box.h"
+  #include "reax_control.h"
+  #include "reax_tool_box.h"
 #endif
 
 
 char Read_Control_File( char *control_file, control_params* control,
-                        output_controls *out_control )
+        output_controls *out_control )
 {
     FILE *fp;
     char *s, **tmp;
-    int   c, i, ival;
-    real  val;
+    int c, i, ival;
+    real val;
 
     /* open control file */
     if ( (fp = fopen( control_file, "r" ) ) == NULL )
     {
-        fprintf( stderr, "error opening the control file! terminating...\n" );
+        fprintf( stderr, "[ERROR] cannot open the control file (%s)! terminating...\n",
+              control_file );
         MPI_Abort( MPI_COMM_WORLD,  FILE_NOT_FOUND );
     }
 
     /* assign default values */
-    strcpy( control->sim_name, "simulate" );
+    strcpy( control->sim_name, "default.sim" );
     control->ensemble        = NVE;
     control->nsteps          = 0;
     control->dt              = 0.25;
@@ -54,8 +56,9 @@ char Read_Control_File( char *control_file, control_params* control,
     control->procs_by_dim[1] = 1;
     control->procs_by_dim[2] = 1;
     control->geo_format = 1;
-    control->gpus_per_node = 1;   //hpcc
+    control->gpus_per_node = 1;
 
+    control->random_vel = 0;
     control->restart          = 0;
     out_control->restart_format = WRITE_BINARY;
     out_control->restart_freq = 0;
@@ -66,18 +69,29 @@ char Read_Control_File( char *control_file, control_params* control,
     out_control->energy_update_freq = 0;
 
     control->reneighbor = 1;
-    control->vlist_cut = control->nonb_cut;
     control->bond_cut = 5.0;
+    control->vlist_cut = control->nonb_cut;
     control->bg_cut = 0.3;
     control->thb_cut = 0.001;
     control->hbond_cut = 0.0;
 
     control->tabulate = 0;
 
-    control->qeq_freq = 1;
-    control->q_err = 1e-6;
-    control->refactor = 100;
-    control->droptol = 1e-2;;
+    control->charge_method = QEQ_CM;
+    control->charge_freq = 1;
+    control->cm_q_net = 0.0;
+    control->cm_solver_type = GMRES_S;
+    control->cm_solver_max_iters = 100;
+    control->cm_solver_restart = 50;
+    control->cm_solver_q_err = 0.000001;
+    control->cm_domain_sparsify_enabled = FALSE;
+    control->cm_domain_sparsity = 1.0;
+    control->cm_solver_pre_comp_type = DIAG_PC;
+    control->cm_solver_pre_comp_sweeps = 3;
+    control->cm_solver_pre_comp_refactor = 100;
+    control->cm_solver_pre_comp_droptol = 0.01;
+    control->cm_solver_pre_app_type = TRI_SOLVE_PA;
+    control->cm_solver_pre_app_jacobi_iters = 50;
 
     control->T_init = 0.;
     control->T_final = 300.;
@@ -108,15 +122,16 @@ char Read_Control_File( char *control_file, control_params* control,
     control->restrict_type = 0;
 
     /* memory allocations */
-    s = (char*) malloc(sizeof(char) * MAX_LINE);
-    tmp = (char**) malloc(sizeof(char*)*MAX_TOKENS);
+    s = (char*) smalloc( sizeof(char) * MAX_LINE, "Read_Control_File::s" );
+    tmp = (char**) smalloc( sizeof(char*) * MAX_TOKENS, "Read_Control_File::tmp" );
     for (i = 0; i < MAX_TOKENS; i++)
-        tmp[i] = (char*) malloc(sizeof(char) * MAX_LINE);
+    {
+        tmp[i] = (char*) smalloc( sizeof(char) * MAX_LINE, "Read_Control_File::tmp[i]" );
+    }
 
     /* read control parameters file */
-    while (!feof(fp))
+    while( fgets( s, MAX_LINE, fp ) )
     {
-        fgets( s, MAX_LINE, fp );
         c = Tokenize( s, &tmp );
         //fprintf( stderr, "%s\n", s );
 
@@ -139,10 +154,10 @@ char Read_Control_File( char *control_file, control_params* control,
             val = atof(tmp[1]);
             control->dt = val * 1.e-3;  // convert dt from fs to ps!
         }
-        else if (strcmp(tmp[0], "gpus_per_node") == 0)      //hpcc
+        else if ( strcmp(tmp[0], "gpus_per_node") == 0 )
         {
-            val = atoi(tmp[1]);
-            control->gpus_per_node = val;
+            ival = atoi(tmp[1]);
+            control->gpus_per_node = ival;
         }
         else if ( strcmp(tmp[0], "proc_by_dim") == 0 )
         {
@@ -154,7 +169,7 @@ char Read_Control_File( char *control_file, control_params* control,
             control->procs_by_dim[2] = ival;
 
             control->nprocs = control->procs_by_dim[0] * control->procs_by_dim[1] *
-                              control->procs_by_dim[2];
+                    control->procs_by_dim[2];
         }
         //else if( strcmp(tmp[0], "restart") == 0 ) {
         //  ival = atoi(tmp[1]);
@@ -243,25 +258,79 @@ char Read_Control_File( char *control_file, control_params* control,
             ival = atoi( tmp[1] );
             control->tabulate = ival;
         }
-        else if ( strcmp(tmp[0], "qeq_freq") == 0 )
+        else if ( strcmp(tmp[0], "charge_method") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->charge_method = ival;
+        }
+        else if ( strcmp(tmp[0], "charge_freq") == 0 )
         {
             ival = atoi( tmp[1] );
-            control->qeq_freq = ival;
+            control->charge_freq = ival;
         }
-        else if ( strcmp(tmp[0], "q_err") == 0 )
+        else if ( strcmp(tmp[0], "cm_q_net") == 0 )
         {
             val = atof( tmp[1] );
-            control->q_err = val;
+            control->cm_q_net = val;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_type") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_type = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_max_iters") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_max_iters = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_restart") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_restart = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_q_err") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->cm_solver_q_err = val;
+        }
+        else if ( strcmp(tmp[0], "cm_domain_sparsity") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->cm_domain_sparsity = val;
+            if ( val < 1.0 )
+            {
+                control->cm_domain_sparsify_enabled = TRUE;
+            }
+        }
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_type") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_pre_comp_type = ival;
         }
-        else if ( strcmp(tmp[0], "ilu_refactor") == 0 )
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_refactor") == 0 )
         {
             ival = atoi( tmp[1] );
-            control->refactor = ival;
+            control->cm_solver_pre_comp_refactor = ival;
         }
-        else if ( strcmp(tmp[0], "ilu_droptol") == 0 )
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_droptol") == 0 )
         {
             val = atof( tmp[1] );
-            control->droptol = val;
+            control->cm_solver_pre_comp_droptol = val;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_sweeps") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_pre_comp_sweeps = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_pre_app_type") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_pre_app_type = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_pre_app_jacobi_iters") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_pre_app_jacobi_iters = ival;
         }
         else if ( strcmp(tmp[0], "temp_init") == 0 )
         {
@@ -269,7 +338,9 @@ char Read_Control_File( char *control_file, control_params* control,
             control->T_init = val;
 
             if ( control->T_init < 0.1 )
+            {
                 control->T_init = 0.1;
+            }
         }
         else if ( strcmp(tmp[0], "temp_final") == 0 )
         {
@@ -277,7 +348,9 @@ char Read_Control_File( char *control_file, control_params* control,
             control->T_final = val;
 
             if ( control->T_final < 0.1 )
+            {
                 control->T_final = 0.1;
+            }
         }
         else if ( strcmp(tmp[0], "t_mass") == 0 )
         {
@@ -401,7 +474,9 @@ char Read_Control_File( char *control_file, control_params* control,
         {
             control->num_ignored = atoi(tmp[1]);
             for ( i = 0; i < control->num_ignored; ++i )
+            {
                 control->ignore[atoi(tmp[i + 2])] = 1;
+            }
         }
         else if ( strcmp(tmp[0], "dipole_anal") == 0 )
         {
@@ -430,25 +505,34 @@ char Read_Control_File( char *control_file, control_params* control,
         }
         else
         {
-            fprintf( stderr, "WARNING: unknown parameter %s\n", tmp[0] );
-            MPI_Abort( MPI_COMM_WORLD, 15 );
+            fprintf( stderr, "[WARNING] unknown parameter %s\n", tmp[0] );
+            MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
         }
     }
 
+    if ( ferror( fp ) )
+    {
+        fprintf( stderr, "[ERROR] parsing control file failed (I/O error). TERMINATING...\n" );
+        MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+    }
+
     /* determine target T */
     if ( control->T_mode == 0 )
+    {
         control->T = control->T_final;
-    else control->T = control->T_init;
+    }
+    else
+    {
+        control->T = control->T_init;
+    }
 
     /* free memory allocations at the top */
     for ( i = 0; i < MAX_TOKENS; i++ )
-        free( tmp[i] );
-    free( tmp );
-    free( s );
-
-    // fprintf( stderr,"%d %d %10.5f %d %10.5f %10.5f\n",
-    //   control->ensemble, control->nsteps, control->dt,
-    //   control->tabulate, control->T, control->P );
+    {
+        sfree( tmp[i], "Read_Control_File::tmp[i]" );
+    }
+    sfree( tmp, "Read_Control_File::tmp" );
+    sfree( s, "Read_Control_File::s" );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "control file read\n" );
diff --git a/PG-PuReMD/src/control.h b/PG-PuReMD/src/control.h
index c6c6ce6c739181243b436a614b29fd28be07dfb5..24cf045176f7a3d82fdb7c71255debbe7b712326 100644
--- a/PG-PuReMD/src/control.h
+++ b/PG-PuReMD/src/control.h
@@ -24,6 +24,16 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 char Read_Control_File( char*, control_params*, output_controls* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/cub b/PG-PuReMD/src/cub
new file mode 160000
index 0000000000000000000000000000000000000000..01347a797c620618d09e7d2d90bce4be4c42513e
--- /dev/null
+++ b/PG-PuReMD/src/cub
@@ -0,0 +1 @@
+Subproject commit 01347a797c620618d09e7d2d90bce4be4c42513e
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5df47e6f4fd55744b44b190d011f2e8a5b04f617
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -0,0 +1,853 @@
+
+#include "cuda_allocate.h"
+
+#include "cuda_allocate.h"
+#include "cuda_forces.h"
+#include "cuda_list.h"
+#include "cuda_neighbors.h"
+#include "cuda_utils.h"
+
+#include "../allocate.h"
+#include "../index_utils.h"
+#include "../tool_box.h"
+#include "../vector.h"
+
+extern "C"
+{
+
+/* Allocate a device copy of the control parameters (pointer kept in control->d_control_params) and fill it from *control */
+void dev_alloc_control( control_params *control )
+{
+    cuda_malloc( (void **)&control->d_control_params,
+            sizeof(control_params), TRUE, "control_params" );
+    copy_host_device( control, control->d_control_params,
+            sizeof(control_params), cudaMemcpyHostToDevice, "control_params" );
+}
+
+/* Kernel: set the three components of each of the first N entries of nbrs to -1, one entry per thread */
+CUDA_GLOBAL void Init_Nbrs( ivec *nbrs, int N )
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    /* threads mapped past the end of the array have nothing to write */
+    if ( tid < N )
+    {
+        for ( int d = 0; d < 3; ++d )
+        {
+            nbrs[tid][d] = -1;
+        }
+    }
+}
+
+/* Mirror the grid metadata of system->my_grid into system->d_my_grid and allocate the per-cell device arrays (str, end, cutoff, nbrs_x, nbrs_cp, rel_box) */
+void dev_alloc_grid( reax_system *system )
+{
+    int total;
+//    grid_cell local_cell;
+    grid *host = &system->my_grid;
+    grid *device = &system->d_my_grid;
+//    ivec *nbrs_x = (ivec *) scratch;
+
+    total = host->ncells[0] * host->ncells[1] * host->ncells[2];
+    ivec_Copy( device->ncells, host->ncells );
+    rvec_Copy( device->cell_len, host->cell_len );
+    rvec_Copy( device->inv_len, host->inv_len );
+
+    ivec_Copy( device->bond_span, host->bond_span );
+    ivec_Copy( device->nonb_span, host->nonb_span );
+    ivec_Copy( device->vlist_span, host->vlist_span );
+
+    ivec_Copy( device->native_cells, host->native_cells );
+    ivec_Copy( device->native_str, host->native_str );
+    ivec_Copy( device->native_end, host->native_end );
+
+    device->ghost_cut = host->ghost_cut;
+    ivec_Copy( device->ghost_span, host->ghost_span );
+    ivec_Copy( device->ghost_nonb_span, host->ghost_nonb_span );
+    ivec_Copy( device->ghost_hbond_span, host->ghost_hbond_span );
+    ivec_Copy( device->ghost_bond_span, host->ghost_bond_span );
+
+    cuda_malloc( (void **) &device->str, sizeof(int) * total, TRUE,
+            "dev_alloc_grid::grid->str" );
+    cuda_malloc( (void **) &device->end, sizeof(int) * total, TRUE,
+            "dev_alloc_grid::grid->end" );
+    cuda_malloc( (void **) &device->cutoff, sizeof(real) * total, TRUE,
+            "dev_alloc_grid::grid->cutoff" );
+
+    cuda_malloc( (void **) &device->nbrs_x, sizeof(ivec) * total * host->max_nbrs,
+            TRUE, "dev_alloc_grid::grid->nbrs_x" );
+    cuda_malloc( (void **) &device->nbrs_cp, sizeof(rvec) * total * host->max_nbrs,
+            TRUE, "dev_alloc_grid::grid->nbrs_cp" );
+    cuda_malloc( (void **) &device->rel_box, sizeof(ivec) * total,
+            TRUE, "dev_alloc_grid::grid->rel_box" );
+
+//    int block_size = 512;
+//    int blocks = (host->max_nbrs) / block_size + ((host->max_nbrs) % block_size == 0 ? 0 : 1); 
+//
+//    Init_Nbrs <<< blocks, block_size >>>
+//        ( nbrs_x, host->max_nbrs );
+//    cudaThreadSynchronize( );
+//    cudaCheckError( );
+//
+//    cuda_malloc( (void **)& device->cells, sizeof(grid_cell) * total,
+//            TRUE, "grid:cells");
+//    fprintf( stderr, " Device cells address --> %ld \n", device->cells );
+//    cuda_malloc( (void **) &device->order,
+//            sizeof(ivec) * (host->total + 1), TRUE, "grid:order" );
+//
+//    local_cell.top = local_cell.mark = local_cell.str = local_cell.end = 0;
+//    fprintf( stderr, "Total cells to be allocated -- > %d \n", total );
+//    for (int i = 0; i < total; i++)
+//    {
+//        //fprintf( stderr, "Address of the local atom -> %ld  \n", &local_cell );
+//
+//        cuda_malloc( (void **) &local_cell.atoms, sizeof(int) * host->max_atoms,
+//                TRUE, "alloc:grid:cells:atoms" );
+//        //fprintf( stderr, "Allocated address of the atoms --> %ld  (%d)\n", local_cell.atoms, host->max_atoms );
+//
+//        cuda_malloc( (void **) &local_cell.nbrs_x, sizeof(ivec) * host->max_nbrs,
+//                TRUE, "alloc:grid:cells:nbrs_x" );
+//        copy_device( local_cell.nbrs_x, nbrs_x, host->max_nbrs * sizeof(ivec), "grid:nbrs_x" );
+//        //fprintf( stderr, "Allocated address of the nbrs_x--> %ld \n", local_cell.nbrs_x );
+//
+//        cuda_malloc( (void **) &local_cell.nbrs_cp, sizeof(rvec) * host->max_nbrs,
+//                TRUE, "alloc:grid:cells:nbrs_cp" );
+//        //fprintf( stderr, "Allocated address of the nbrs_cp--> %ld \n", local_cell.nbrs_cp );
+//
+//        //cuda_malloc( (void **) &local_cell.nbrs, sizeof(grid_cell *) * host->max_nbrs,
+//        //                TRUE, "alloc:grid:cells:nbrs" );
+//        //fprintf( stderr, "Allocated address of the nbrs--> %ld \n", local_cell.nbrs );
+//
+//        copy_host_device( &local_cell, &device->cells[i], sizeof(grid_cell),
+//                cudaMemcpyHostToDevice, "grid:cell-alloc" );
+//    }
+}
+
+/* For every grid cell, fetch its grid_cell descriptor from device->cells to the host and free the atoms array it points to */
+void dev_dealloc_grid_cell_atoms( reax_system *system )
+{
+    int total;
+    grid_cell local_cell;
+    grid *host = &system->my_grid;
+    grid *device = &system->d_my_grid;
+
+    total = host->ncells[0] * host->ncells[1] * host->ncells[2];
+    /* NOTE(review): the allocation of device->cells in dev_alloc_grid is commented out -- confirm cells is valid here */
+    for (int i = 0; i < total; i++)
+    {
+        copy_host_device( &local_cell, &device->cells[i], 
+                sizeof(grid_cell), cudaMemcpyDeviceToHost,
+                "dev_dealloc_grid_cell_atoms::grid" );
+        cuda_free( local_cell.atoms,
+                "dev_dealloc_grid_cell_atoms::grid_cell.atoms" );
+    }
+}
+
+/* For every grid cell, allocate a device atoms array of cap ints and store the new pointer in the cell's device descriptor */
+void dev_alloc_grid_cell_atoms( reax_system *system, int cap )
+{
+    int i, total;
+    grid_cell local_cell;
+    grid *host = &system->my_grid;
+    grid *device = &system->d_my_grid;
+
+    total = host->ncells[0] * host->ncells[1] * host->ncells[2];
+    /* NOTE(review): the previous atoms pointer is overwritten, not freed; presumably dev_dealloc_grid_cell_atoms runs first -- verify against caller */
+    for (i = 0; i < total; i++)
+    {
+        copy_host_device( &local_cell, &device->cells[i],
+                sizeof(grid_cell), cudaMemcpyDeviceToHost, "grid:cell-fetch" );
+        cuda_malloc( (void **)&local_cell.atoms, sizeof(int) * cap,
+                TRUE, "realloc:grid:cells:atoms" );
+        copy_host_device( &local_cell, &device->cells[i],
+                sizeof(grid_cell), cudaMemcpyHostToDevice, "grid:cell-realloc" );
+    }
+}
+
+/* Allocate device storage for the atoms, the list bookkeeping arrays, the simulation boxes and the force field parameter tables */
+void dev_alloc_system( reax_system *system )
+{
+    /* atoms */
+    cuda_malloc( (void **) &system->d_my_atoms,
+            system->total_cap * sizeof(reax_atom),
+            TRUE, "system:d_my_atoms" );
+    cuda_malloc( (void **) &system->d_numH, sizeof(int), TRUE, "system:d_numH" );
+
+    /* list management */
+    cuda_malloc( (void **) &system->d_far_nbrs,
+            system->total_cap * sizeof(int), TRUE, "system:d_far_nbrs" );
+    cuda_malloc( (void **) &system->d_max_far_nbrs,
+            system->total_cap * sizeof(int), TRUE, "system:d_max_far_nbrs" );
+    cuda_malloc( (void **) &system->d_total_far_nbrs,
+            sizeof(int), TRUE, "system:d_total_far_nbrs" );
+    cuda_malloc( (void **) &system->d_realloc_far_nbrs,
+            sizeof(int), TRUE, "system:d_realloc_far_nbrs" );
+
+    cuda_malloc( (void **) &system->d_bonds,
+            system->total_cap * sizeof(int), TRUE, "system:d_bonds" );
+    cuda_malloc( (void **) &system->d_max_bonds,
+            system->total_cap * sizeof(int), TRUE, "system:d_max_bonds" );
+    cuda_malloc( (void **) &system->d_total_bonds,
+            sizeof(int), TRUE, "system:d_total_bonds" );
+    cuda_malloc( (void **) &system->d_realloc_bonds,
+            sizeof(int), TRUE, "system:d_realloc_bonds" );
+
+    cuda_malloc( (void **) &system->d_hbonds,
+            system->total_cap * sizeof(int), TRUE, "system:d_hbonds" );
+    cuda_malloc( (void **) &system->d_max_hbonds,
+            system->total_cap * sizeof(int), TRUE, "system:d_max_hbonds" );
+    cuda_malloc( (void **) &system->d_total_hbonds,
+            sizeof(int), TRUE, "system:d_total_hbonds" );
+    cuda_malloc( (void **) &system->d_realloc_hbonds,
+            sizeof(int), TRUE, "system:d_realloc_hbonds" );
+
+    cuda_malloc( (void **) &system->d_cm_entries,
+            system->total_cap * sizeof(int), TRUE, "system:d_cm_entries" );
+    cuda_malloc( (void **) &system->d_max_cm_entries,
+            system->total_cap * sizeof(int), TRUE, "system:d_max_cm_entries" );
+    cuda_malloc( (void **) &system->d_total_cm_entries,
+            sizeof(int), TRUE, "system:d_total_cm_entries" );
+    cuda_malloc( (void **) &system->d_realloc_cm_entries,
+            sizeof(int), TRUE, "system:d_realloc_cm_entries" );
+
+    cuda_malloc( (void **) &system->d_total_thbodies,
+            sizeof(int), TRUE, "system:d_total_thbodies" );
+
+    /* simulation boxes */
+    cuda_malloc( (void **) &system->d_big_box,
+            sizeof(simulation_box), TRUE, "system:d_big_box" );
+    cuda_malloc( (void **) &system->d_my_box,
+            sizeof(simulation_box), TRUE, "system:d_my_box" );
+    cuda_malloc( (void **) &system->d_my_ext_box,
+            sizeof(simulation_box), TRUE, "d_my_ext_box" );
+
+    /* interaction parameters: table sizes are integer powers of the atom type count, kept in size_t arithmetic */
+    const size_t ntypes = (size_t) system->reax_param.num_atom_types;
+    cuda_malloc( (void **) &system->reax_param.d_sbp,
+            ntypes * sizeof(single_body_parameters), TRUE, "system:d_sbp" );
+
+    cuda_malloc( (void **) &system->reax_param.d_tbp,
+            ntypes * ntypes * sizeof(two_body_parameters),
+            TRUE, "system:d_tbp" );
+
+    cuda_malloc( (void **) &system->reax_param.d_thbp,
+            ntypes * ntypes * ntypes * sizeof(three_body_header),
+            TRUE, "system:d_thbp" );
+
+    cuda_malloc( (void **) &system->reax_param.d_hbp,
+            ntypes * ntypes * ntypes * sizeof(hbond_parameters),
+            TRUE, "system:d_hbp" );
+
+    cuda_malloc( (void **) &system->reax_param.d_fbp,
+            ntypes * ntypes * ntypes * ntypes * sizeof(four_body_header),
+            TRUE, "system:d_fbp" );
+
+    cuda_malloc( (void **) &system->reax_param.d_gp.l,
+            system->reax_param.gp.n_global * sizeof(real), TRUE, "system:d_gp.l" );
+
+    system->reax_param.d_gp.n_global = 0;
+    system->reax_param.d_gp.vdw_type = 0;
+}
+
+/* Grow the device atom array and the total_cap-sized list bookkeeping arrays from old_total_cap to total_cap entries, preserving contents; msg is unused */
+void dev_realloc_system( reax_system *system, int old_total_cap, int total_cap, char *msg )
+{
+    int *temp;
+    reax_atom *temp_atom;
+
+    temp = (int *) scratch;
+    temp_atom = (reax_atom*) scratch;
+    /* NOTE(review): assumes scratch holds at least old_total_cap * sizeof(reax_atom) bytes -- confirm */
+    /* each array below: stage old contents in scratch, free, reallocate at total_cap, copy old contents back */
+    copy_device( temp_atom, system->d_my_atoms, old_total_cap * sizeof(reax_atom),
+            "dev_realloc_system::temp" );
+    cuda_free( system->d_my_atoms, "system::d_my_atoms" );
+    cuda_malloc( (void **) &system->d_my_atoms, sizeof(reax_atom) * total_cap,
+            TRUE, "system::d_my_atoms" );
+    copy_device( system->d_my_atoms, temp_atom, old_total_cap * sizeof(reax_atom),
+            "dev_realloc_system::temp" );
+
+    copy_device( temp, system->d_far_nbrs, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+    cuda_free( system->d_far_nbrs, "system::d_far_nbrs" );
+    cuda_malloc( (void **) &system->d_far_nbrs,
+            total_cap * sizeof(int), TRUE, "system::d_far_nbrs" );
+    copy_device( system->d_far_nbrs, temp, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+
+    copy_device( temp, system->d_max_far_nbrs, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+    cuda_free( system->d_max_far_nbrs, "system::d_max_far_nbrs" );
+    cuda_malloc( (void **) &system->d_max_far_nbrs,
+            total_cap * sizeof(int), TRUE, "system::d_max_far_nbrs" );
+    copy_device( system->d_max_far_nbrs, temp, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+
+    copy_device( temp, system->d_bonds, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+    cuda_free( system->d_bonds, "system::d_bonds" );
+    cuda_malloc( (void **) &system->d_bonds,
+            total_cap * sizeof(int), TRUE, "system::d_bonds" );
+    copy_device( system->d_bonds, temp, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+
+    copy_device( temp, system->d_max_bonds, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+    cuda_free( system->d_max_bonds, "system::d_max_bonds" );
+    cuda_malloc( (void **) &system->d_max_bonds,
+            total_cap * sizeof(int), TRUE, "system::d_max_bonds" );
+    copy_device( system->d_max_bonds, temp, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+
+    copy_device( temp, system->d_hbonds, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+    cuda_free( system->d_hbonds, "system::d_hbonds" );
+    cuda_malloc( (void **) &system->d_hbonds,
+            total_cap * sizeof(int), TRUE, "system::d_hbonds" );
+    copy_device( system->d_hbonds, temp, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+
+    copy_device( temp, system->d_max_hbonds, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+    cuda_free( system->d_max_hbonds, "system::d_max_hbonds" );
+    cuda_malloc( (void **) &system->d_max_hbonds,
+            total_cap * sizeof(int), TRUE, "system::d_max_hbonds" );
+    copy_device( system->d_max_hbonds, temp, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+
+    copy_device( temp, system->d_cm_entries, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+    cuda_free( system->d_cm_entries, "system::d_cm_entries" );
+    cuda_malloc( (void **) &system->d_cm_entries,
+            total_cap * sizeof(int), TRUE, "system::d_cm_entries" );
+    copy_device( system->d_cm_entries, temp, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+
+    copy_device( temp, system->d_max_cm_entries, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+    cuda_free( system->d_max_cm_entries, "system::d_max_cm_entries" );
+    cuda_malloc( (void **) &system->d_max_cm_entries,
+            total_cap * sizeof(int), TRUE, "system::d_max_cm_entries" );
+    copy_device( system->d_max_cm_entries, temp, old_total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+}
+
+/* Allocate a device buffer of sizeof(simulation_data) bytes; its pointer is stored in data->d_simulation_data (nothing is copied here) */
+void dev_alloc_simulation_data( simulation_data *data )
+{
+    cuda_malloc( (void **) &(data->d_simulation_data), sizeof(simulation_data), TRUE, "simulation_data" );
+}
+
+/* Allocates every workspace buffer through cuda_malloc; local_cap / total_cap are the element counts used for sizing. */
+void dev_alloc_workspace( reax_system *system, control_params *control,
+        storage *workspace, int local_cap, int total_cap, char *msg )
+{
+    int total_real, total_rvec, local_rvec;   /* byte sizes shared by many requests below */
+
+    workspace->allocated = TRUE;   /* dev_dealloc_workspace checks this flag before freeing */
+
+    total_real = total_cap * sizeof(real);
+    total_rvec = total_cap * sizeof(rvec);
+    local_rvec = local_cap * sizeof(rvec);
+
+    /* Notes:
+     *  - communication buffers (tmp_dbl, tmp_rvec, tmp_rvec2) are neither
+     *    allocated nor modified in this routine;
+     *  - `system` and `msg` are accepted but never used here;
+     *  - every request passes TRUE as cuda_malloc's third argument
+     *    (presumably "zero the new memory" -- confirm in cuda_utils). */
+
+    /* bond order related storage  */
+    cuda_malloc( (void **) &workspace->within_bond_box, total_cap * sizeof (int), TRUE, "skin" );
+    cuda_malloc( (void **) &workspace->total_bond_order, total_real, TRUE, "total_bo" );
+    cuda_malloc( (void **) &workspace->Deltap, total_real, TRUE, "Deltap" );
+    cuda_malloc( (void **) &workspace->Deltap_boc, total_real, TRUE, "Deltap_boc" );
+    cuda_malloc( (void **) &workspace->dDeltap_self, total_rvec, TRUE, "dDeltap_self" );
+    cuda_malloc( (void **) &workspace->Delta, total_real, TRUE, "Delta" );
+    cuda_malloc( (void **) &workspace->Delta_lp, total_real, TRUE, "Delta_lp" );
+    cuda_malloc( (void **) &workspace->Delta_lp_temp, total_real, TRUE, "Delta_lp_temp" );
+    cuda_malloc( (void **) &workspace->dDelta_lp, total_real, TRUE, "dDelta_lp" );   /* label used to read "Delta_lp_temp" */
+    cuda_malloc( (void **) &workspace->dDelta_lp_temp, total_real, TRUE, "dDelta_lp_temp" );
+    cuda_malloc( (void **) &workspace->Delta_e, total_real, TRUE, "Delta_e" );
+    cuda_malloc( (void **) &workspace->Delta_boc, total_real, TRUE, "Delta_boc" );
+    cuda_malloc( (void **) &workspace->nlp, total_real, TRUE, "nlp" );
+    cuda_malloc( (void **) &workspace->nlp_temp, total_real, TRUE, "nlp_temp" );
+    cuda_malloc( (void **) &workspace->Clp, total_real, TRUE, "Clp" );
+    cuda_malloc( (void **) &workspace->vlpex, total_real, TRUE, "vlpex" );
+    cuda_malloc( (void **) &workspace->bond_mark, total_real, TRUE, "bond_mark" );
+    cuda_malloc( (void **) &workspace->done_after, total_real, TRUE, "done_after" );
+    /* NOTE(review): bond_mark and done_after are sized as total_cap reals -- confirm that matches their element type */
+
+    /* charge matrix storage */
+    cuda_malloc( (void **) &workspace->Hdia_inv, total_cap * sizeof(real), TRUE, "Hdia_inv" );
+    cuda_malloc( (void **) &workspace->b_s, total_cap * sizeof(real), TRUE, "b_s" );
+    cuda_malloc( (void **) &workspace->b_t, total_cap * sizeof(real), TRUE, "b_t" );
+    cuda_malloc( (void **) &workspace->b_prc, total_cap * sizeof(real), TRUE, "b_prc" );
+    cuda_malloc( (void **) &workspace->b_prm, total_cap * sizeof(real), TRUE, "b_prm" );
+    cuda_malloc( (void **) &workspace->s, total_cap * sizeof(real), TRUE, "s" );
+    cuda_malloc( (void **) &workspace->t, total_cap * sizeof(real), TRUE, "t" );
+    cuda_malloc( (void **) &workspace->droptol, total_cap * sizeof(real), TRUE, "droptol" );
+    cuda_malloc( (void **) &workspace->b, total_cap * sizeof(rvec2), TRUE, "b" );
+    cuda_malloc( (void **) &workspace->x, total_cap * sizeof(rvec2), TRUE, "x" );
+
+    /* GMRES storage (sized from RESTART only, independent of the capacities) */
+    cuda_malloc( (void **) &workspace->y, (RESTART+1)*sizeof(real), TRUE, "y" );
+    cuda_malloc( (void **) &workspace->z, (RESTART+1)*sizeof(real), TRUE, "z" );
+    cuda_malloc( (void **) &workspace->g, (RESTART+1)*sizeof(real), TRUE, "g" );
+    cuda_malloc( (void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "h" );
+    cuda_malloc( (void **) &workspace->hs, (RESTART+1)*sizeof(real), TRUE, "hs" );
+    cuda_malloc( (void **) &workspace->hc, (RESTART+1)*sizeof(real), TRUE, "hc" );
+    cuda_malloc( (void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "v" );
+
+    /* CG storage */
+    cuda_malloc( (void **) &workspace->r, total_cap * sizeof(real), TRUE, "r" );
+    cuda_malloc( (void **) &workspace->d, total_cap * sizeof(real), TRUE, "d" );
+    cuda_malloc( (void **) &workspace->q, total_cap * sizeof(real), TRUE, "q" );
+    cuda_malloc( (void **) &workspace->p, total_cap * sizeof(real), TRUE, "p" );
+    cuda_malloc( (void **) &workspace->r2, total_cap * sizeof(rvec2), TRUE, "r2" );
+    cuda_malloc( (void **) &workspace->d2, total_cap * sizeof(rvec2), TRUE, "d2" );
+    cuda_malloc( (void **) &workspace->q2, total_cap * sizeof(rvec2), TRUE, "q2" );
+    cuda_malloc( (void **) &workspace->p2, total_cap * sizeof(rvec2), TRUE, "p2" );
+
+    /* integrator storage (sized by local_cap, not total_cap) */
+    cuda_malloc( (void **) &workspace->v_const, local_rvec, TRUE, "v_const" );
+
+    /* storage for analysis: only requested when the matching control option is set */
+    if( control->molecular_analysis || control->diffusion_coef )
+    {
+        cuda_malloc( (void **) &workspace->mark, local_cap * sizeof(int), TRUE, "mark" );
+        cuda_malloc( (void **) &workspace->old_mark, local_cap * sizeof(int), TRUE, "old_mark" );
+    }
+    else
+    {
+        workspace->mark = workspace->old_mark = NULL;
+    }
+
+    if( control->diffusion_coef )
+    {
+        cuda_malloc( (void **) &workspace->x_old, local_cap * sizeof(rvec), TRUE, "x_old" );
+    }
+    else
+    {
+        workspace->x_old = NULL;
+    }
+
+    /* force related storage */
+    cuda_malloc( (void **) &workspace->f, total_cap * sizeof(rvec), TRUE, "f" );
+    cuda_malloc( (void **) &workspace->CdDelta, total_cap * sizeof(rvec), TRUE, "CdDelta" );
+
+    /* Taper params (8 reals) */
+    cuda_malloc( (void **) &workspace->Tap, 8 * sizeof(real), TRUE, "Tap" );
+}
+
+/* Releases the buffers requested by dev_alloc_workspace; returns at once if workspace->allocated is FALSE. */
+void dev_dealloc_workspace( control_params *control, storage *workspace )
+{
+    if ( workspace->allocated == FALSE )
+    {
+        return;
+    }
+
+    workspace->allocated = FALSE;
+
+    /* Notes:
+     *  - communication buffers (tmp_dbl, tmp_rvec, tmp_rvec2) are not
+     *    touched in this routine;
+     *  - pointers are handed to cuda_free by value and are not reset to
+     *    NULL here, so the `allocated` flag above is what prevents a
+     *    second release through this function. */
+
+    /* bond order related storage  */
+    cuda_free( workspace->within_bond_box, "skin" );
+    cuda_free( workspace->total_bond_order, "total_bo" );
+    cuda_free( workspace->Deltap, "Deltap" );
+    cuda_free( workspace->Deltap_boc, "Deltap_boc" );
+    cuda_free( workspace->dDeltap_self, "dDeltap_self" );
+    cuda_free( workspace->Delta, "Delta" );
+    cuda_free( workspace->Delta_lp, "Delta_lp" );
+    cuda_free( workspace->Delta_lp_temp, "Delta_lp_temp" );
+    cuda_free( workspace->dDelta_lp, "dDelta_lp" );   /* label used to read "Delta_lp_temp" */
+    cuda_free( workspace->dDelta_lp_temp, "dDelta_lp_temp" );
+    cuda_free( workspace->Delta_e, "Delta_e" );
+    cuda_free( workspace->Delta_boc, "Delta_boc" );
+    cuda_free( workspace->nlp, "nlp" );
+    cuda_free( workspace->nlp_temp, "nlp_temp" );
+    cuda_free( workspace->Clp, "Clp" );
+    cuda_free( workspace->vlpex, "vlpex" );
+    cuda_free( workspace->bond_mark, "bond_mark" );
+    cuda_free( workspace->done_after, "done_after" );
+
+    /* charge matrix storage */
+    cuda_free( workspace->Hdia_inv, "Hdia_inv" );
+    cuda_free( workspace->b_s, "b_s" );
+    cuda_free( workspace->b_t, "b_t" );
+    cuda_free( workspace->b_prc, "b_prc" );
+    cuda_free( workspace->b_prm, "b_prm" );
+    cuda_free( workspace->s, "s" );
+    cuda_free( workspace->t, "t" );
+    cuda_free( workspace->droptol, "droptol" );
+    cuda_free( workspace->b, "b" );
+    cuda_free( workspace->x, "x" );
+
+    /* GMRES storage */
+    cuda_free( workspace->y, "y" );
+    cuda_free( workspace->z, "z" );
+    cuda_free( workspace->g, "g" );
+    cuda_free( workspace->h, "h" );
+    cuda_free( workspace->hs, "hs" );
+    cuda_free( workspace->hc, "hc" );
+    cuda_free( workspace->v, "v" );
+
+    /* CG storage */
+    cuda_free( workspace->r, "r" );
+    cuda_free( workspace->d, "d" );
+    cuda_free( workspace->q, "q" );
+    cuda_free( workspace->p, "p" );
+    cuda_free( workspace->r2, "r2" );
+    cuda_free( workspace->d2, "d2" );
+    cuda_free( workspace->q2, "q2" );
+    cuda_free( workspace->p2, "p2" );
+
+    /* integrator storage */
+    cuda_free( workspace->v_const, "v_const" );
+
+    /* storage for analysis: same conditions as in dev_alloc_workspace */
+    if( control->molecular_analysis || control->diffusion_coef )
+    {
+        cuda_free( workspace->mark, "mark" );
+        cuda_free( workspace->old_mark, "old_mark" );
+    }
+    else
+    {
+        workspace->mark = workspace->old_mark = NULL;
+    }
+
+    if( control->diffusion_coef )
+    {
+        cuda_free( workspace->x_old, "x_old" );
+    }
+    else
+    {
+        workspace->x_old = NULL;
+    }
+
+    /* force related storage */
+    cuda_free( workspace->f, "f" );
+    cuda_free( workspace->CdDelta, "CdDelta" );
+
+    /* Taper params */
+    cuda_free( workspace->Tap, "Tap" );
+}
+
+/* Stores n and m in H, then requests the start/end index arrays (n ints each) and m sparse_matrix_entry slots. */
+void dev_alloc_matrix( sparse_matrix *H, int n, int m )
+{
+    H->n = n;
+    H->m = m;
+    cuda_malloc( (void **) &(H->start), n * sizeof(int), TRUE, "dev_alloc_matrix::start" );
+    cuda_malloc( (void **) &(H->end), n * sizeof(int), TRUE, "dev_alloc_matrix::end" );
+    cuda_malloc( (void **) &(H->entries), m * sizeof(sparse_matrix_entry), TRUE, "dev_alloc_matrix::entries" );
+}
+
+/* Hands H's start, end and entries arrays to cuda_free; H->n, H->m and the pointers themselves are left unchanged. */
+void dev_dealloc_matrix( sparse_matrix *H )
+{
+    cuda_free( H->start, "dev_dealloc_matrix::start" );
+    cuda_free( H->end, "dev_dealloc_matrix::end" );
+    cuda_free( H->entries, "dev_dealloc_matrix::entries" );
+}
+
+/* Deletes far_nbrs and recreates it as a TYP_FAR_NEIGHBOR list from (n, num_intrs); nothing is copied over here. */
+void Cuda_Reallocate_Neighbor_List( reax_list *far_nbrs, size_t n, size_t num_intrs )
+{
+    Dev_Delete_List( far_nbrs );
+    Dev_Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs );
+}
+
+/* Deletes hbonds and recreates it as a TYP_HBOND list from (n, num_intrs); nothing is copied over here. */
+void Cuda_Reallocate_HBonds_List( reax_list *hbonds, size_t n, size_t num_intrs )
+{
+    Dev_Delete_List( hbonds );
+    Dev_Make_List( n, num_intrs, TYP_HBOND, hbonds );
+}
+
+/* Deletes bonds and recreates it as a TYP_BOND list from (n, num_intrs); nothing is copied over here. */
+void Cuda_Reallocate_Bonds_List( reax_list *bonds, size_t n, size_t num_intrs )
+{
+    Dev_Delete_List( bonds );
+    Dev_Make_List( n, num_intrs, TYP_BOND, bonds );
+}
+
+/* Deletes thbodies and recreates it as a TYP_THREE_BODY list from (n, num_intrs); nothing is copied over here. */
+void Cuda_Reallocate_Thbodies_List( reax_list *thbodies, size_t n, size_t num_intrs )
+{
+    Dev_Delete_List( thbodies );
+    Dev_Make_List( n, num_intrs, TYP_THREE_BODY, thbodies );
+
+}
+
+/* Grows capacities when atom counts near them, then rebuilds the system arrays, workspace, lists, charge matrix, host grid cells and MPI buffers that are flagged. */
+void Cuda_ReAllocate( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        mpi_datatypes *mpi_data )
+{
+    int i, j, k, p;
+    int nflag, Nflag, old_total_cap, mpi_flag, total_send;
+    int renbr;
+    reallocate_data *realloc;
+    reax_list *far_nbrs;
+    sparse_matrix *H;
+    grid *g;
+    neighbor_proc *nbr_pr;
+    mpi_out_data *nbr_data;
+    char msg[200];   /* passed on to the allocation helpers; never filled in by this function */
+
+    realloc = &(dev_workspace->realloc);   /* dev_workspace (and dev_lists below) are not parameters of this function */
+    g = &(system->my_grid);
+    H = &dev_workspace->H;
+
+    // IMPORTANT: LOOSE ZONES CHECKS ARE DISABLED FOR NOW BY &&'ing with 0!!!
+    nflag = FALSE;   /* becomes TRUE when local_cap is raised */
+    if ( system->n >= DANGER_ZONE * system->local_cap ||
+            (0 && system->n <= LOOSE_ZONE * system->local_cap) )
+    {
+        nflag = TRUE;
+        system->local_cap = (int)(system->n * SAFE_ZONE);
+    }
+
+    Nflag = FALSE;   /* becomes TRUE when total_cap is raised */
+    if ( system->N >= DANGER_ZONE * system->total_cap ||
+            (0 && system->N <= LOOSE_ZONE * system->total_cap) )
+    {
+        Nflag = TRUE;
+        old_total_cap = system->total_cap;   /* only assigned, and only read, when Nflag is TRUE */
+        system->total_cap = (int)(system->N * SAFE_ZONE);
+    }
+
+    if ( Nflag == TRUE )
+    {
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d: reallocating system and workspace -"\
+                 "n=%d  N=%d  local_cap=%d  total_cap=%d\n",
+                 system->my_rank, system->n, system->N,
+                 system->local_cap, system->total_cap );
+        fprintf( stderr, "p:%d -  *** Allocating System *** \n", system->my_rank );
+#endif
+
+        /* system */
+        dev_realloc_system( system, old_total_cap, system->total_cap, msg );
+
+        /* workspace: released and requested afresh, so previous contents are not carried over here */
+        dev_dealloc_workspace( control, workspace );
+        dev_alloc_workspace( system, control, workspace, system->local_cap,
+                system->total_cap, msg );
+    }
+
+    /* far neighbors: only handled on steps where renbr is true */
+    renbr = (data->step - data->prev_steps) % control->reneighbor == 0;
+    if ( renbr && (Nflag == TRUE || realloc->far_nbrs == TRUE) )
+    {
+        far_nbrs = *dev_lists + FAR_NBRS;
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d: reallocating far_nbrs: far_nbrs=%d, space=%dMB\n",
+                 system->my_rank, system->total_far_nbrs,
+                 (int)(system->total_far_nbrs * sizeof(far_neighbor_data) /
+                       (1024.0 * 1024.0)) );
+        fprintf( stderr, "p:%d - *** Reallocating Far Nbrs *** \n", system->my_rank );
+#endif
+
+        Cuda_Reallocate_Neighbor_List( far_nbrs, system->total_cap, system->total_far_nbrs );
+
+        Cuda_Init_Neighbor_Indices( system );
+
+        realloc->far_nbrs = FALSE;
+    }
+
+    /* charge matrix -- NOTE(review): keyed on nflag (local capacity) although H is sized with total_cap; confirm intended */
+    if ( nflag == TRUE || realloc->cm == TRUE )
+    {
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d: reallocating H matrix: Htop=%d, space=%dMB\n",
+                system->my_rank, (int)(system->total_cm_entries),
+                (int)(system->total_cm_entries * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
+#endif
+
+        dev_dealloc_matrix( H );
+        dev_alloc_matrix( H, system->total_cap, system->total_cm_entries );
+
+        Cuda_Init_Sparse_Matrix_Indices( system, H );
+
+        //Deallocate_Matrix( workspace->L );
+        //Deallocate_Matrix( workspace->U );
+        //workspace->L = NULL;
+        //workspace->U = NULL;
+
+        realloc->cm = FALSE;
+    }
+
+    /* hydrogen bonds list: skipped entirely unless hbond_cut > 0 and numH > 0 */
+    if ( control->hbond_cut > 0.0 && system->numH > 0 )
+    {
+
+        if ( Nflag == TRUE || realloc->hbonds == TRUE )
+        {
+#if defined(DEBUG_FOCUS)
+            fprintf( stderr, "p%d: reallocating hbonds: total_hbonds=%d space=%dMB\n",
+                    system->my_rank, system->total_hbonds,
+                    (int)(system->total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
+#endif
+
+            Cuda_Reallocate_HBonds_List( (*dev_lists) + HBONDS, system->total_cap, system->total_hbonds );
+
+            Cuda_Init_HBond_Indices( system );
+
+            realloc->hbonds = FALSE;
+        }
+    }
+
+    /* bonds list */
+    if ( Nflag == TRUE || realloc->bonds == TRUE )
+    {
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d: reallocating bonds: total_bonds=%d, space=%dMB\n",
+                 system->my_rank, system->total_bonds,
+                 (int)(system->total_bonds * sizeof(bond_data) / (1024 * 1024)) );
+#endif
+
+        Cuda_Reallocate_Bonds_List( (*dev_lists) + BONDS, system->total_cap, system->total_bonds );
+
+        Cuda_Init_Bond_Indices( system );
+
+        realloc->bonds = FALSE;
+    }
+
+    /* 3-body list: sized by total_thbodies_indices instead of total_cap; no index-init call follows */
+    if ( Nflag == TRUE || realloc->thbody == TRUE )
+    {
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d: reallocating thbody list: num_thbody=%d, space=%dMB\n",
+                system->my_rank, system->total_thbodies,
+                (int)(system->total_thbodies * sizeof(three_body_interaction_data) /
+                (1024*1024)) );
+#endif
+
+        Cuda_Reallocate_Thbodies_List( (*dev_lists) + THREE_BODIES,
+                system->total_thbodies_indices, system->total_thbodies );
+
+        realloc->thbody = FALSE;
+    }
+
+    /* grid: a gcell_atoms value above -1 is the requested new per-cell array length */
+    if ( renbr && realloc->gcell_atoms > -1 )
+    {
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms );
+#endif
+
+        for ( i = g->native_str[0]; i < g->native_end[0]; i++ )
+        {
+            for ( j = g->native_str[1]; j < g->native_end[1]; j++ )
+            {
+                for ( k = g->native_str[2]; k < g->native_end[2]; k++ )
+                {
+                    // reallocate g->atoms (freed, then requested again via scalloc; old contents are not copied)
+                    sfree( g->cells[ index_grid_3d(i,j,k,g) ].atoms, "g:atoms" );
+                    g->cells[ index_grid_3d(i,j,k,g) ].atoms = (int*)
+                            scalloc( realloc->gcell_atoms, sizeof(int), "g:atoms" );
+                }
+            }
+        }
+
+        //TODO
+        //do the same thing for the device here.
+        fprintf( stderr, "p:%d - *** Reallocating Grid Cell Atoms *** Step:%d\n", system->my_rank, data->step );
+        //MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
+
+        //FIX - 1 - Tested the reallocation logic
+        //dev_dealloc_grid_cell_atoms( system );
+        //dev_alloc_grid_cell_atoms( system, realloc->gcell_atoms );
+        realloc->gcell_atoms = -1;
+    }
+
+    /* mpi buffers */
+    // we have to be at a renbring step -
+    // to ensure correct values at mpi_buffers for update_boundary_positions
+    if ( !renbr )
+    {
+        mpi_flag = FALSE;
+    }
+    // check whether in_buffer capacity is enough
+    else if ( system->max_recved >= system->est_recv * 0.90 )
+    {
+        mpi_flag = TRUE;
+    }
+    else
+    {
+        // otherwise check individual outgoing buffers
+        mpi_flag = FALSE;
+        for ( p = 0; p < MAX_NBRS; ++p )
+        {
+            nbr_pr = &( system->my_nbrs[p] );
+            nbr_data = &( mpi_data->out_buffers[p] );
+
+            if ( nbr_data->cnt >= nbr_pr->est_send * 0.90 )
+            {
+                mpi_flag = TRUE;
+                break;
+            }
+        }
+    }
+
+    if ( mpi_flag == TRUE )
+    {
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d: reallocating mpi_buf: old_recv=%d\n",
+                 system->my_rank, system->est_recv );
+        for ( p = 0; p < MAX_NBRS; ++p )
+        {
+            fprintf( stderr, "p%d: nbr%d old_send=%d\n",
+                     system->my_rank, p, system->my_nbrs[p].est_send );
+        }
+#endif
+
+        /* update mpi buffer estimates based on last comm */
+        system->est_recv = MAX( system->max_recved * SAFER_ZONE, MIN_SEND );
+        system->est_trans =
+            (system->est_recv * sizeof(boundary_atom)) / sizeof(mpi_atom);
+        total_send = 0;
+        for ( p = 0; p < MAX_NBRS; ++p )
+        {
+            nbr_pr   = &( system->my_nbrs[p] );
+            nbr_data = &( mpi_data->out_buffers[p] );
+            nbr_pr->est_send = MAX( nbr_data->cnt * SAFER_ZONE, MIN_SEND );
+            total_send += nbr_pr->est_send;   /* total_send is only reported in the DEBUG_FOCUS output below */
+        }
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d: reallocating mpi_buf: recv=%d send=%d total=%dMB\n",
+                system->my_rank, system->est_recv, total_send,
+                (int)((system->est_recv + total_send)*sizeof(boundary_atom) /
+                      (1024 * 1024)));
+
+        for ( p = 0; p < MAX_NBRS; ++p )
+        {
+            fprintf( stderr, "p%d: nbr%d new_send=%d\n",
+                    system->my_rank, p, system->my_nbrs[p].est_send );
+        }
+#endif
+
+        /* reallocate mpi buffers */
+        Deallocate_MPI_Buffers( mpi_data );
+        Allocate_MPI_Buffers( mpi_data, system->est_recv, system->my_nbrs, msg );
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d: reallocate done\n",
+             system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+}
+
+
+}
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.h b/PG-PuReMD/src/cuda/cuda_allocate.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d78d93264f21ac37fc2f7b3a715bdd75c90de78
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_allocate.h
@@ -0,0 +1,41 @@
+#ifndef __CUDA_ALLOCATE_H_
+#define __CUDA_ALLOCATE_H_
+
+#include "../reax_types.h"
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
+
+void dev_alloc_system( reax_system * );
+
+void dev_alloc_grid( reax_system * );
+
+void dev_alloc_simulation_data( simulation_data * ); /* fills in d_simulation_data of the argument */
+
+void dev_alloc_workspace( reax_system *, control_params *, storage *, int, int, char * ); /* ints: local_cap, total_cap */
+
+void dev_alloc_matrix( sparse_matrix *, int, int ); /* ints: n (start/end length), m (entry count) */
+
+void dev_alloc_control( control_params * );
+
+void dev_dealloc_grid_cell_atoms( reax_system * );
+
+void dev_alloc_grid_cell_atoms( reax_system *, int );
+
+void dev_realloc_system( reax_system *, int , int , char * ); /* Cuda_ReAllocate passes old and new total_cap as the ints */
+
+void dev_dealloc_workspace( control_params *, storage * ); /* no-op when the workspace's allocated flag is FALSE */
+
+void dev_dealloc_matrix( sparse_matrix * ); /* frees start, end and entries */
+
+void Cuda_ReAllocate( reax_system*, control_params*, simulation_data*, storage*,
+        reax_list**, mpi_datatypes* );
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CUDA_ALLOCATE_H_ */
diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f16ddcbc50d4b5c9e77ec8042be99c76a4971a39
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
@@ -0,0 +1,888 @@
+
+#include "cuda_bond_orders.h"
+
+#include "cuda_list.h"
+#include "cuda_utils.h"
+#include "cuda_reduction.h"
+
+#include "../index_utils.h"
+#include "../bond_orders.h"
+
+/* Kernel: sets Deltap and Deltap_boc for one atom from its bond-order total, then resets that total to zero. */
+CUDA_GLOBAL void Cuda_Calculate_BO_init( reax_atom *my_atoms,
+        single_body_parameters *sbp, storage p_workspace, int N )
+{
+    int idx;
+    storage *ws;
+    single_body_parameters *params;
+
+    idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    /* global thread indices at or beyond N have no atom to work on */
+    if ( idx >= N )
+    {
+        return;
+    }
+
+    ws = &p_workspace;
+    params = &sbp[ my_atoms[idx].type ];
+
+    /* Deltaprime and Deltaprime_boc: the bond-order total minus the
+       valency / valency_val parameter of this atom's type */
+    ws->Deltap[idx] = ws->total_bond_order[idx] - params->valency;
+    ws->Deltap_boc[idx] = ws->total_bond_order[idx] - params->valency_val;
+    ws->total_bond_order[idx] = 0;
+}
+
+/* Kernel: for atom i, applies the bond-order corrections to each bond entry with i < j and adds the result to total_bond_order[i]. */
+CUDA_GLOBAL void Cuda_Calculate_BO( reax_atom *my_atoms, global_parameters gp, 
+        single_body_parameters *sbp, two_body_parameters *tbp, 
+        storage p_workspace, reax_list p_bonds, 
+        int num_atom_types, int N )
+{
+    int i, j, pj, type_i, type_j;
+    int start_i, end_i;
+//    int sym_index;
+    real val_i, Deltap_i, Deltap_boc_i;
+    real val_j, Deltap_j, Deltap_boc_j;
+    real f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
+    real exp_p1i, exp_p2i, exp_p1j, exp_p2j;
+    real temp, u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
+    real Cf45_ij, Cf45_ji;
+    real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
+    real p_boc1, p_boc2;
+    single_body_parameters *sbp_i;
+    two_body_parameters *twbp;
+    bond_order_data *bo_ij;
+//    bond_order_data *bo_ji;
+    storage *workspace;
+    reax_list *bonds;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= N )
+    {
+        return;
+    }
+
+    workspace = &(p_workspace);
+    bonds = &(p_bonds);
+    p_boc1 = gp.l[0];
+    p_boc2 = gp.l[1];
+    /* NOTE(review): the TEST_FORCES sections below use names (top_dbo, dBOs, pdbo, system, lists) not declared in this kernel */
+    /* Corrected Bond Order calculations */
+    //for( i = 0; i < system->N; ++i ) {
+    type_i = my_atoms[i].type;
+    sbp_i = &(sbp[type_i]);
+    val_i = sbp_i->valency;
+    Deltap_i = workspace->Deltap[i];
+    Deltap_boc_i = workspace->Deltap_boc[i];
+    start_i = Dev_Start_Index( i, bonds );   /* [start_i, end_i) is the range of bond entries visited for atom i */
+    end_i = Dev_End_Index( i, bonds );
+
+    // fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n",
+    //       i+1, Deltap_i, Deltap_boc_i, start_i, end_i );
+
+    for( pj = start_i; pj < end_i; ++pj )
+    {
+        j = bonds->select.bond_list[pj].nbr;
+        type_j = my_atoms[j].type;
+        bo_ij = &( bonds->select.bond_list[pj].bo_data );
+        // fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
+
+        //TODO
+        //if( i < j || workspace->bond_mark[j] > 3 ) {
+        if( i < j )   /* entries with i >= j are not modified by this kernel */
+        {
+            twbp = &( tbp[ index_tbp(type_i, type_j, num_atom_types)] );
+
+#ifdef TEST_FORCES
+            Set_Start_Index( pj, top_dbo, dBOs );
+            /* fprintf( stderr, "%6d%6d%12.6f%12.6f%12.6f\n", 
+               workspace->reverse_map[i], workspace->reverse_map[j],
+               twbp->ovc, twbp->v13cor, bo_ij->BO ); */
+#endif
+
+            if ( twbp->ovc < 0.001 && twbp->v13cor < 0.001 )
+            {
+                /* There is no correction to bond orders nor to derivatives
+                   of bond order prime! So we leave bond orders unchanged and
+                   set derivative of bond order coefficients such that 
+                   dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */
+                bo_ij->C1dbo = 1.000000;
+                bo_ij->C2dbo = 0.000000;
+                bo_ij->C3dbo = 0.000000;
+
+                bo_ij->C1dbopi = bo_ij->BO_pi;
+                bo_ij->C2dbopi = 0.000000;
+                bo_ij->C3dbopi = 0.000000;
+                bo_ij->C4dbopi = 0.000000;
+
+                bo_ij->C1dbopi2 = bo_ij->BO_pi2;
+                bo_ij->C2dbopi2 = 0.000000;
+                bo_ij->C3dbopi2 = 0.000000;
+                bo_ij->C4dbopi2 = 0.000000;
+
+#ifdef TEST_FORCES
+                pdbo = &(dBOs->select.dbo_list[ top_dbo ]);
+
+                // compute dBO_ij/dr_i
+                pdbo->wrt = i;
+                rvec_Copy( pdbo->dBO, bo_ij->dBOp );
+                rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi );
+                rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2);
+
+                // compute dBO_ij/dr_j
+                pdbo++;
+                pdbo->wrt = j;
+                rvec_Scale( pdbo->dBO, -1.0, bo_ij->dBOp );
+                rvec_Scale( pdbo->dBOpi, -bo_ij->BO_pi, bo_ij->dln_BOp_pi );
+                rvec_Scale(pdbo->dBOpi2, -bo_ij->BO_pi2, bo_ij->dln_BOp_pi2);
+
+                top_dbo += 2;
+#endif
+            }
+            else
+            {
+                val_j = sbp[type_j].valency;
+                Deltap_j = workspace->Deltap[j];
+                Deltap_boc_j = workspace->Deltap_boc[j];
+
+                /* on page 1 */
+                if ( twbp->ovc >= 0.001 )
+                {
+                    /* Correction for overcoordination */
+                    exp_p1i = EXP( -p_boc1 * Deltap_i );
+                    exp_p2i = EXP( -p_boc2 * Deltap_i );
+                    exp_p1j = EXP( -p_boc1 * Deltap_j );
+                    exp_p2j = EXP( -p_boc2 * Deltap_j );
+
+                    f2 = exp_p1i + exp_p1j;
+                    f3 = -1.0 / p_boc2 * LOG( 0.5 * ( exp_p2i  + exp_p2j ) );
+                    f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) +
+                            ( val_j + f2 )/( val_j + f2 + f3 ) );
+
+                    /*fprintf( stderr,"%d %d\t%g %g   j:%g %g  p_boc:%g %g\n"
+                      "\tf:%g  %g  %g, exp:%g %g %g %g\n", 
+                      i+1, j+1, 
+                      val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2,
+                      f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/
+
+                    /* Now come the derivatives */
+                    /* Bond Order pages 5-7, derivative of f1 */
+                    temp = f2 + f3;
+                    u1_ij = val_i + temp;
+                    u1_ji = val_j + temp;
+                    Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) +
+                            1.0 / SQR( u1_ji ));
+                    Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) +
+                            ( u1_ji - f3 ) / SQR( u1_ji ));
+
+                    //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + 
+                    //          Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j );
+                    Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij -
+                            ((val_i+f2) / SQR(u1_ij)) *
+                            ( -p_boc1 * exp_p1i +
+                              exp_p2i / ( exp_p2i + exp_p2j ) ) +
+                            -p_boc1 * exp_p1i / u1_ji -
+                            ((val_j+f2) / SQR(u1_ji)) *
+                            ( -p_boc1 * exp_p1i +
+                              exp_p2i / ( exp_p2i + exp_p2j ) ));
+
+                    Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j +
+                        Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j );
+
+                    //fprintf( stderr, "\tCf1:%g  %g\n", Cf1_ij, Cf1_ji );
+                }
+                else
+                {
+                    /* No overcoordination correction! */
+                    f1 = 1.0;
+                    Cf1_ij = Cf1_ji = 0.0;
+                }
+
+                if ( twbp->v13cor >= 0.001 )
+                {
+                    /* Correction for 1-3 bond orders */
+                    exp_f4 = EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
+                                Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
+                    exp_f5 = EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
+                                Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
+
+                    f4 = 1. / (1. + exp_f4);
+                    f5 = 1. / (1. + exp_f5);
+                    f4f5 = f4 * f5;
+
+                    /* Bond Order pages 8-9, derivative of f4 and f5 */
+                    Cf45_ij = -f4 * exp_f4;
+                    Cf45_ji = -f5 * exp_f5;
+                }
+                else
+                {
+                    f4 = f5 = f4f5 = 1.0;
+                    Cf45_ij = Cf45_ji = 0.0;
+                }
+
+                /* Bond Order page 10, derivative of total bond order */
+                A0_ij = f1 * f4f5;
+                A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO *
+                    (Cf45_ij + Cf45_ji);
+                A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
+                A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
+                A3_ij = A2_ij + Cf1_ij / f1;
+                A3_ji = A2_ji + Cf1_ji / f1;
+
+                /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f" 
+                  "A2_ij: %f A2_ji: %f, A3_ij: %f, A3_ji: %f\n",
+                  bo_ij->BO, 
+                  A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/
+
+                /* find corrected bond orders and their derivative coef */
+                bo_ij->BO = bo_ij->BO * A0_ij;
+                bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1;
+                bo_ij->BO_pi2 = bo_ij->BO_pi2* A0_ij *f1;
+                bo_ij->BO_s = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 );
+
+                bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
+                bo_ij->C2dbo = bo_ij->BO * A2_ij;
+                bo_ij->C3dbo = bo_ij->BO * A2_ji;
+
+                bo_ij->C1dbopi = f1*f1*f4*f5;
+                bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
+                bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
+                bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
+
+                bo_ij->C1dbopi2 = f1*f1*f4*f5;
+                bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
+                bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
+                bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji;
+
+                //CHANGE ORIGINAL
+            }
+            //CHANGE ORIGINAL
+
+            /* neglect bonds that are < 1e-10 */
+            if ( bo_ij->BO < 1e-10 )
+            {
+                bo_ij->BO = 0.0;
+            }
+            if ( bo_ij->BO_s < 1e-10 )
+            {
+                bo_ij->BO_s = 0.0;
+            }
+            if ( bo_ij->BO_pi < 1e-10 )
+            {
+                bo_ij->BO_pi = 0.0;
+            }
+            if ( bo_ij->BO_pi2 < 1e-10 )
+            {
+                bo_ij->BO_pi2 = 0.0;
+            }
+
+            workspace->total_bond_order[i] += bo_ij->BO; //now keeps total_BO
+
+            /* fprintf( stderr, "%d %d\t%g %g %g %g\n"
+               "Cdbo:\t%g %g %g\n"
+               "Cdbopi:\t%g %g %g %g\n"
+               "Cdbopi2:%g %g %g %g\n\n", 
+               i+1, j+1, 
+               bonds->select.bond_list[ pj ].d, 
+               bo_ij->BO,bo_ij->BO_pi, bo_ij->BO_pi2, 
+               bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo,
+               bo_ij->C1dbopi, bo_ij->C2dbopi, 
+               bo_ij->C3dbopi, bo_ij->C4dbopi,
+               bo_ij->C1dbopi2,bo_ij->C2dbopi2, 
+               bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */
+
+            /* fprintf( stderr, "%d %d  BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n",
+               i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 );*/
+
+#ifdef TEST_FORCES
+            Set_End_Index( pj, top_dbo, dBOs );
+            Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta );
+#endif
+
+            //CHANGE ORIGINAL
+            //}
+            //CHANGE ORIGINAL
+            /*
+               else {
+            // We only need to update bond orders from bo_ji
+            //   everything else is set in uncorrected_bo calculations
+            sym_index = bonds->select.bond_list[pj].sym_index;
+            bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data);
+            bo_ij->BO = bo_ji->BO;
+            bo_ij->BO_s = bo_ji->BO_s;
+            bo_ij->BO_pi = bo_ji->BO_pi;
+            bo_ij->BO_pi2 = bo_ji->BO_pi2;
+
+            workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO
+#ifdef TEST_FORCES
+            Add_dBO( system, lists, j, sym_index, 1.0, workspace->dDelta );
+#endif
+}
+             */
+        }
+    }
+}
+
+/* Kernel: for bond entries of atom i with i >= j, copies the bond orders of the symmetric entry and adds BO to atom i's total. */
+CUDA_GLOBAL void Cuda_Update_Uncorrected_BO( storage p_workspace,
+        reax_list p_bonds, int N )
+{
+    int i, j, pj, sym, first, last;
+    storage *workspace;
+    reax_list *bonds;
+    bond_order_data *dst, *src;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    /* global thread indices at or beyond N have no atom to work on */
+    if ( i >= N )
+    {
+        return;
+    }
+
+    workspace = &p_workspace;
+    bonds = &p_bonds;
+    first = Dev_Start_Index( i, bonds );
+    last = Dev_End_Index( i, bonds );
+
+    for ( pj = first; pj < last; ++pj )
+    {
+        j = bonds->select.bond_list[pj].nbr;
+
+        /* entries with i < j are left untouched by this kernel */
+        if ( i < j )
+        {
+            continue;
+        }
+
+        /* only the bond orders are taken over from the symmetric entry,
+           which is located through sym_index */
+        sym = bonds->select.bond_list[pj].sym_index;
+        dst = &( bonds->select.bond_list[pj].bo_data );
+        src = &( bonds->select.bond_list[sym].bo_data );
+        dst->BO = src->BO;
+        dst->BO_s = src->BO_s;
+        dst->BO_pi = src->BO_pi;
+        dst->BO_pi2 = src->BO_pi2;
+
+        /* fold the copied bond order into the running total of atom i */
+        workspace->total_bond_order[i] += dst->BO;
+    }
+}
+
+
+/* Kernel, one thread per atom j < N: from total_bond_order[j] and the
+ * single-body parameters of atom j's type, fill the helper arrays used
+ * throughout the force calculations: Delta, Delta_e, Delta_boc, vlpex,
+ * nlp, Delta_lp, Clp, dDelta_lp and the nlp_temp / Delta_lp_temp /
+ * dDelta_lp_temp variants. gp.l[15] is read as the p_lp1 parameter. */
+CUDA_GLOBAL void Cuda_Update_Workspace_After_BO( reax_atom *my_atoms,
+        global_parameters gp, single_body_parameters *sbp,
+        storage p_workspace, int N )
+{
+    int j, pairs;
+    real lp1, exp_lp;
+    single_body_parameters *params;
+    storage *ws;
+
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( j >= N )
+    {
+        return;
+    }
+
+    ws = &( p_workspace );
+    lp1 = gp.l[15];
+    params = &(sbp[ my_atoms[j].type ]);
+
+    /* deviation of the total bond order from each of the three valencies */
+    ws->Delta[j] = ws->total_bond_order[j] - params->valency;
+    ws->Delta_e[j] = ws->total_bond_order[j] - params->valency_e;
+    ws->Delta_boc[j] = ws->total_bond_order[j] - params->valency_boc;
+
+    /* lone-pair terms; the (int) truncation is shared by vlpex and nlp */
+    pairs = (int)(ws->Delta_e[j] / 2.0);
+    ws->vlpex[j] = ws->Delta_e[j] - 2.0 * pairs;
+    exp_lp = EXP(-lp1 * SQR(2.0 + ws->vlpex[j]));
+    ws->nlp[j] = exp_lp - pairs;
+    ws->Delta_lp[j] = params->nlp_opt - ws->nlp[j];
+    ws->Clp[j] = 2.0 * lp1 * exp_lp * (2.0 + ws->vlpex[j]);
+    /* Adri uses different dDelta_lp values than the ones in notes... */
+    ws->dDelta_lp[j] = ws->Clp[j];
+
+    /* types with mass above 21.0 get a lone-pair count fixed by their
+     * valencies and a zero derivative; all others reuse nlp and Clp */
+    if ( params->mass > 21.0 )
+    {
+        ws->nlp_temp[j] = 0.5 * (params->valency_e - params->valency);
+        ws->dDelta_lp_temp[j] = 0.;
+    }
+    else
+    {
+        ws->nlp_temp[j] = ws->nlp[j];
+        ws->dDelta_lp_temp[j] = ws->Clp[j];
+    }
+
+    /* same expression in both cases, evaluated on the nlp_temp chosen above */
+    ws->Delta_lp_temp[j] = params->nlp_opt - ws->nlp_temp[j];
+}
+
+
+/* Device helper for the virial (control->virial != 0) path of k_total_forces:
+ * adds the bond-order-derivative force terms of the bond at index pj in atom
+ * i's bond list and accumulates the matching pressure terms.
+ *
+ * bo_ij comes from this bond entry when i < j and from its sym_index
+ * counterpart otherwise (bo_ji the other way round). All pressure terms go to
+ * data_ext_press, the slot k_total_forces passes per thread and that
+ * Cuda_Total_Forces reduces into my_ext_press afterwards; data is unused.
+ *
+ * NOTE(review): unlike Cuda_Add_dBond_to_Forces, this zeroes tf_f on every
+ * call and its else branch walks atom j's bond list and f[j] -- confirm. */
+CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj,
+        simulation_data *data, storage *workspace, reax_list *bonds,
+        rvec data_ext_press )
+{
+    bond_data *nbr_j, *nbr_k;
+    bond_order_data *bo_ij, *bo_ji;
+    dbond_coefficients coef;
+    rvec temp, ext_press;
+    ivec rel_box;
+    int pk, k, j;
+
+    /* Initializations */
+    nbr_j = &(bonds->select.bond_list[pj]);
+    j = nbr_j->nbr;
+
+    /* orient the pair: bo_ij is this entry for i < j, otherwise the
+     * symmetric entry reached through sym_index */
+    if (i < j)
+    {
+        bo_ij = &(nbr_j->bo_data);
+        bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+    }
+    else
+    {
+        bo_ji = &(nbr_j->bo_data);
+        bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+    }
+
+    coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+
+    coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+
+    coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+
+    coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+
+    /************************************
+     * forces related to atom i          *
+     * first neighbors of atom i         *
+     ************************************/
+    if (i < j)
+    {
+        for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk )
+        {
+            nbr_k = &(bonds->select.bond_list[pk]);
+            k = nbr_k->nbr;
+
+            rvec_MakeZero( nbr_k->tf_f );
+
+            rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp );       /*2nd, dBO*/
+            rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp );/*dDelta*/
+            rvec_ScaledAdd( temp, -coef.C3dbopi, nbr_k->bo_data.dBOp ); /*3rd, dBOpi*/
+            rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd, dBOpi2*/
+
+            /* force */
+            rvec_Add( nbr_k->tf_f, temp );
+            /* pressure */
+            rvec_iMultiply( ext_press, nbr_k->rel_box, temp );
+            rvec_Add( data_ext_press, ext_press );
+
+            /* if( !ivec_isZero( nbr_k->rel_box ) )
+               fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]"
+               "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
+               i+1, system->my_atoms[i].x[0], 
+               system->my_atoms[i].x[1], system->my_atoms[i].x[2], 
+               j+1, k+1, system->my_atoms[k].x[0], 
+               system->my_atoms[k].x[1], system->my_atoms[k].x[2],
+               nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2],
+               nbr_k->rel_box[0], nbr_k->rel_box[1], nbr_k->rel_box[2],
+               temp[0], temp[1], temp[2] ); */
+        }
+
+        /* then atom i itself  */
+        rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp );                      /*1st,dBO*/
+        rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] );   /*2nd,dBO*/
+        rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp );               /*1st,dBO*/
+        rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd,dBO*/
+        rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi );        /*1st,dBOpi*/
+        rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp );              /*2nd,dBOpi*/
+        rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i]);/*3rd,dBOpi*/
+
+        rvec_ScaledAdd( temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );  /*1st,dBO_pi2*/
+        rvec_ScaledAdd( temp, coef.C2dbopi2, bo_ij->dBOp );         /*2nd,dBO_pi2*/
+        rvec_ScaledAdd( temp, coef.C3dbopi2, workspace->dDeltap_self[i] );/*3rd*/
+
+        /* force */
+        rvec_Add( workspace->f[i], temp );
+        /* ext pressure due to i is dropped, counting force on j will be enough */
+    }
+    else
+    {
+        /******************************************************
+         * forces and pressure related to atom j               * 
+         * first neighbors of atom j                           *
+         ******************************************************/
+        for( pk = Dev_Start_Index(j, bonds); pk < Dev_End_Index(j, bonds); ++pk )
+        {
+            nbr_k = &(bonds->select.bond_list[pk]);
+            k = nbr_k->nbr;
+
+            rvec_MakeZero (nbr_k->tf_f);
+
+            rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp );      /*3rd,dBO*/
+            rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp);/*dDelta*/
+            rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp); /*4th,dBOpi*/
+            rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp);/*4th,dBOpi2*/
+
+            /* force */
+            rvec_Add( nbr_k->tf_f, temp );
+            /* pressure */
+            if( k != i )
+            {
+                ivec_Sum( rel_box, nbr_k->rel_box, nbr_j->rel_box ); //rel_box(k, i)
+                rvec_iMultiply( ext_press, rel_box, temp );
+                rvec_Add( data_ext_press, ext_press );
+
+                /* if( !ivec_isZero( rel_box ) )
+                   fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]"
+                   "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
+                   i+1, j+1, system->my_atoms[j].x[0], 
+                   system->my_atoms[j].x[1], system->my_atoms[j].x[2], 
+                   k+1, system->my_atoms[k].x[0], 
+                   system->my_atoms[k].x[1], system->my_atoms[k].x[2],
+                   nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2],
+                   rel_box[0], rel_box[1], rel_box[2],
+                   temp[0], temp[1], temp[2] ); */
+            }
+        }
+
+        /* then atom j itself */
+        rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp );                    /*1st, dBO*/
+        rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] );  /*2nd, dBO*/
+        rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp );             /*1st, dBO*/
+        rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j]);/*2nd, dBO*/
+
+        rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi );       /*1st,dBOpi*/
+        rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp );             /*2nd,dBOpi*/
+        rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j]);/*3rd,dBOpi*/
+
+        rvec_ScaledAdd( temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );    /*1st,dBOpi2*/
+        rvec_ScaledAdd( temp, -coef.C2dbopi2, bo_ij->dBOp );           /*2nd,dBOpi2*/
+        rvec_ScaledAdd( temp,coef.C4dbopi2,workspace->dDeltap_self[j]);/*3rd,dBOpi2*/
+
+        /* force */
+        rvec_Add( workspace->f[j], temp );
+        /* pressure: per-thread accumulator, like the terms above */
+        rvec_iMultiply( ext_press, nbr_j->rel_box, temp );
+        rvec_Add( data_ext_press, ext_press );
+    }
+}
+
+
+/* Device helper for the non-virial path of k_total_forces: adds the
+ * bond-order-derivative force terms of bond entry pj of atom i. Terms on
+ * atom i go straight into workspace->f[i]; the per-bond terms are added to
+ * tf_f of each of i's bond entries, and Cuda_dbond_to_Forces_postprocess
+ * later adds the tf_f of the symmetric entries to f[]. */
+CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj,
+        storage *workspace, reax_list *bonds )
+{
+    bond_data *nbr_j, *nbr_k;
+    bond_order_data *bo_ij, *bo_ji;
+    dbond_coefficients coef;
+    int pk, j;
+    rvec tf_f;
+
+    rvec_MakeZero( tf_f );
+
+    /* Initializations; bo_ij is this entry when i < j, else its sym_index twin */
+    nbr_j = &(bonds->select.bond_list[pj]);
+    j = nbr_j->nbr;
+
+    if ( i < j )
+    {
+        bo_ij = &(nbr_j->bo_data);
+        bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+    }
+    else
+    {
+        bo_ji = &(nbr_j->bo_data);
+        bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+    }
+
+    coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+
+    coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+
+    coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+
+    coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+
+    if ( i < j )
+    {
+        for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk )
+        {
+            nbr_k = &(bonds->select.bond_list[pk]);
+            rvec_MakeZero( tf_f );
+
+            /*2nd,dBO*/
+            rvec_ScaledAdd( tf_f, -coef.C2dbo, nbr_k->bo_data.dBOp );
+            /*dDelta*/
+            rvec_ScaledAdd( tf_f, -coef.C2dDelta, nbr_k->bo_data.dBOp );
+            /*3rd, dBOpi*/
+            rvec_ScaledAdd( tf_f, -coef.C3dbopi, nbr_k->bo_data.dBOp );
+            /*3rd, dBOpi2*/
+            rvec_ScaledAdd( tf_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
+            //Temp storage on the bond entry, accumulated across calls
+            rvec_Add( nbr_k->tf_f, tf_f );
+        }
+        /*1st, dBO*/
+        rvec_ScaledAdd( workspace->f[i], coef.C1dbo, bo_ij->dBOp );
+        /*2nd, dBO*/
+        rvec_ScaledAdd( workspace->f[i], coef.C2dbo, workspace->dDeltap_self[i] );
+
+        /*1st, dBO*/
+        rvec_ScaledAdd( workspace->f[i], coef.C1dDelta, bo_ij->dBOp );
+        /*2nd, dBO*/
+        rvec_ScaledAdd( workspace->f[i], coef.C2dDelta, workspace->dDeltap_self[i] );
+
+        /*1st, dBOpi*/
+        rvec_ScaledAdd( workspace->f[i], coef.C1dbopi, bo_ij->dln_BOp_pi );
+        /*2nd, dBOpi*/
+        rvec_ScaledAdd( workspace->f[i], coef.C2dbopi, bo_ij->dBOp );
+        /*3rd, dBOpi*/
+        rvec_ScaledAdd( workspace->f[i], coef.C3dbopi, workspace->dDeltap_self[i] );
+
+        /*1st, dBO_pi2*/
+        rvec_ScaledAdd( workspace->f[i], coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+        /*2nd, dBO_pi2*/
+        rvec_ScaledAdd( workspace->f[i], coef.C2dbopi2, bo_ij->dBOp );
+        /*3rd, dBO_pi2*/
+        rvec_ScaledAdd( workspace->f[i], coef.C3dbopi2, workspace->dDeltap_self[i] );
+    }
+    else
+    {
+        for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk )
+        {
+            nbr_k = &(bonds->select.bond_list[pk]);
+            rvec_MakeZero( tf_f );
+
+            /*3rd, dBO*/
+            rvec_ScaledAdd( tf_f, -coef.C3dbo, nbr_k->bo_data.dBOp );
+            /*dDelta*/
+            rvec_ScaledAdd( tf_f, -coef.C3dDelta, nbr_k->bo_data.dBOp );
+            /*4th, dBOpi*/
+            rvec_ScaledAdd( tf_f, -coef.C4dbopi, nbr_k->bo_data.dBOp );
+            /*4th, dBOpi2*/
+            rvec_ScaledAdd( tf_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
+            //Temp storage on the bond entry, accumulated across calls
+            rvec_Add( nbr_k->tf_f, tf_f );
+        }
+
+        /*1st,dBO*/
+        rvec_ScaledAdd( workspace->f[i], -coef.C1dbo, bo_ij->dBOp );
+        /*2nd,dBO*/
+        rvec_ScaledAdd( workspace->f[i], coef.C3dbo, workspace->dDeltap_self[i] );
+
+        /*1st, dBO*/
+        rvec_ScaledAdd( workspace->f[i], -coef.C1dDelta, bo_ij->dBOp );
+        /*2nd, dBO*/
+        rvec_ScaledAdd( workspace->f[i], coef.C3dDelta, workspace->dDeltap_self[i] );
+
+        /*1st, dBOpi*/
+        rvec_ScaledAdd( workspace->f[i], -coef.C1dbopi, bo_ij->dln_BOp_pi );
+        /*2nd, dBOpi*/
+        rvec_ScaledAdd( workspace->f[i], -coef.C2dbopi, bo_ij->dBOp );
+        /*3rd, dBOpi*/
+        rvec_ScaledAdd( workspace->f[i], coef.C4dbopi, workspace->dDeltap_self[i] );
+
+        /*1st, dBOpi2*/
+        rvec_ScaledAdd( workspace->f[i], -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+        /*2nd, dBOpi2*/
+        rvec_ScaledAdd( workspace->f[i], -coef.C2dbopi2, bo_ij->dBOp );
+        /*3rd, dBOpi2*/
+        rvec_ScaledAdd( workspace->f[i], coef.C4dbopi2, workspace->dDeltap_self[i] );
+    }
+}
+
+
+/* For every bond entry of atom i, add the tf_f vector stored on the
+ * symmetric entry (sym_index) to workspace->f[i]; atoms is not used. */
+CUDA_DEVICE void Cuda_dbond_to_Forces_postprocess( int i, reax_atom *atoms,
+        reax_list *bonds, storage *workspace )
+{
+    int slot, mirror;
+
+    slot = Dev_Start_Index( i, bonds );
+    while ( slot < Dev_End_Index( i, bonds ) )
+    {
+        mirror = bonds->select.bond_list[slot].sym_index;
+        rvec_Add( workspace->f[i], bonds->select.bond_list[mirror].tf_f );
+        ++slot;
+    }
+}
+
+
+/* Kernel wrapper, one thread per atom: calls
+ * Cuda_dbond_to_Forces_postprocess for every atom index below N. That
+ * routine adds only to workspace f[] at the index it is given, so each
+ * thread updates its own entry. */
+CUDA_GLOBAL void k_total_forces_postprocess( reax_atom *my_atoms,
+        reax_list p_bonds, storage p_workspace, int N )
+{
+    int atom;
+
+    atom = blockIdx.x * blockDim.x + threadIdx.x;
+
+    /* threads whose index is not below N have no atom to process */
+    if ( atom < N )
+    {
+        /* the by-value kernel arguments are handed on by address */
+        Cuda_dbond_to_Forces_postprocess( atom, my_atoms, &p_bonds,
+                &p_workspace );
+    }
+}
+
+
+/* Kernel, one thread per atom i < N: walks atom i's bond list and adds the
+ * bond-order-derivative force terms of each bond, using the plain routine
+ * when control->virial is 0 and the NPT routine (which is also handed this
+ * thread's data_ext_press[i] slot) otherwise. */
+CUDA_GLOBAL void k_total_forces( storage p_workspace, reax_list p_bonds,
+        control_params *control, simulation_data *data, rvec *data_ext_press,
+        int N )
+{
+    int atom, pb;
+
+    atom = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( atom >= N )
+    {
+        return;
+    }
+
+    for ( pb = Dev_Start_Index(atom, &p_bonds);
+            pb < Dev_End_Index(atom, &p_bonds); ++pb )
+    {
+        /* every bond entry of the atom is processed, whichever end is larger */
+        if ( control->virial != 0 )
+        {
+            Cuda_Add_dBond_to_Forces_NPT( atom, pb, data, &p_workspace,
+                    &p_bonds, data_ext_press[atom] );
+        }
+        else
+        {
+            Cuda_Add_dBond_to_Forces( atom, pb, &p_workspace, &p_bonds );
+        }
+    }
+}
+
+
+/* Host driver: runs k_total_forces over all system->N atoms, reduces the
+ * per-atom pressure scratch into the device-side my_ext_press when
+ * control->virial != 0, then runs k_total_forces_postprocess. */
+void Cuda_Total_Forces( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace )
+{
+    int blocks;
+    rvec *spad_rvec = (rvec *) scratch;
+
+    cuda_memset( spad_rvec, 0, system->N * 2 * sizeof(rvec), "total_forces:ext_press" );
+
+    blocks = system->N / DEF_BLOCK_SIZE + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_total_forces <<< blocks, DEF_BLOCK_SIZE >>>
+        ( *dev_workspace, *(*dev_lists + BONDS),
+          (control_params *) control->d_control_params,
+          (simulation_data *) data->d_simulation_data, spad_rvec, system->N );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+
+    if ( control->virial != 0 )
+    {
+        /* ext press reduction, step 1: N per-atom slots -> scratch past them */
+        k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec) * DEF_BLOCK_SIZE >>>
+            ( spad_rvec, spad_rvec + system->N, system->N );
+        cudaDeviceSynchronize( );
+        cudaCheckError( );
+        /* step 2: presumably one partial result per block -> my_ext_press */
+        k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N>>>
+            ( spad_rvec + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, blocks );
+        cudaDeviceSynchronize( );
+        cudaCheckError( );
+    }
+
+    //do the post processing for the atomic forces here
+    k_total_forces_postprocess  <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, *(*dev_lists + BONDS), *dev_workspace, system->N );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+}
+
+
+/* Kernel, one thread per atom: copies the force vector held in the
+ * workspace (p_workspace.f[i]) into my_atoms[i].f for every index i
+ * below n. */
+CUDA_GLOBAL void k_total_forces_pure( reax_atom *my_atoms, int n,
+        storage p_workspace )
+{
+    int atom;
+
+    atom = blockIdx.x * blockDim.x + threadIdx.x;
+
+    /* threads whose index is not below n do nothing */
+    if ( atom < n )
+    {
+        /* destination first, source second */
+        rvec_Copy( my_atoms[atom].f, p_workspace.f[atom] );
+    }
+}
+
+
+/* Host driver: launches k_total_forces_pure over the system->n atoms to copy
+ * the workspace forces into d_my_atoms; the workspace argument is unused. */
+void Cuda_Total_Forces_PURE( reax_system *system, storage *workspace )
+{
+    int blocks;
+
+    blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_total_forces_pure <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->n, *dev_workspace);
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+}
diff --git a/PG-PuReMD/src/cuda_bond_orders.h b/PG-PuReMD/src/cuda/cuda_bond_orders.h
similarity index 62%
rename from PG-PuReMD/src/cuda_bond_orders.h
rename to PG-PuReMD/src/cuda/cuda_bond_orders.h
index d087aa26dbb967e6ac9168e86326f2f5fc6a74eb..a957b11bbd2a083dd03e4601341ab10af2f60af1 100644
--- a/PG-PuReMD/src/cuda_bond_orders.h
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.h
@@ -2,38 +2,33 @@
 #ifndef __CUDA_BOND_ORDERS_H__
 #define __CUDA_BOND_ORDERS_H__
 
-#include "reax_types.h"
-#include "reax_types.h"
+#include "../reax_types.h"
 
-#include "vector.h"
+#include "../vector.h"
 
 extern "C" {
 
-    void Cuda_Total_Forces (reax_system *, control_params *, simulation_data *, storage *);
-    void Cuda_Total_Forces_PURE (reax_system *, storage *);
+void Cuda_Total_Forces( reax_system *, control_params *, simulation_data *, storage * );
+void Cuda_Total_Forces_PURE( reax_system *, storage * );
 
 }
 
-CUDA_GLOBAL void Cuda_Calculate_BO_init (reax_atom *,
-        single_body_parameters *,
-        storage ,
-        int );
+CUDA_GLOBAL void Cuda_Calculate_BO_init( reax_atom *,
+        single_body_parameters *, storage , int );
 
-CUDA_GLOBAL void Cuda_Calculate_BO (reax_atom *, global_parameters ,
-                                    single_body_parameters *, two_body_parameters *,
-                                    storage , reax_list ,
-                                    int , int );
+CUDA_GLOBAL void Cuda_Calculate_BO( reax_atom *, global_parameters,
+        single_body_parameters *, two_body_parameters *,
+        storage , reax_list , int , int );
 
-CUDA_GLOBAL void Cuda_Update_Uncorrected_BO (storage , reax_list , int );
+CUDA_GLOBAL void Cuda_Update_Uncorrected_BO( storage , reax_list , int );
 
-CUDA_GLOBAL void Cuda_Update_Workspace_After_BO ( reax_atom *, global_parameters ,
-        single_body_parameters *, storage ,
-        int );
+CUDA_GLOBAL void Cuda_Update_Workspace_After_BO( reax_atom *, global_parameters ,
+        single_body_parameters *, storage , int );
 
-CUDA_DEVICE inline int Dev_BOp (    reax_list bonds, real bo_cut,
-                                    int i, int btop_i, far_neighbor_data *nbr_pj,
-                                    single_body_parameters *sbp_i, single_body_parameters *sbp_j,
-                                    two_body_parameters *twbp, rvec *dDeltap_self, real *total_bond_order )
+CUDA_DEVICE static inline int Dev_BOp( reax_list bonds, real bo_cut,
+        int i, int btop_i, far_neighbor_data *nbr_pj,
+        single_body_parameters *sbp_i, single_body_parameters *sbp_j,
+        two_body_parameters *twbp, rvec *dDeltap_self, real *total_bond_order )
 {
 
     int j, btop_j;
@@ -42,35 +37,43 @@ CUDA_DEVICE inline int Dev_BOp (    reax_list bonds, real bo_cut,
     real BO, BO_s, BO_pi, BO_pi2;
     bond_data *ibond, *jbond;
     bond_order_data *bo_ij, *bo_ji;
-
     rvec bo_ij_dln_BOp_s;
     rvec bo_ij_dln_BOp_pi;
     rvec bo_ij_dln_BOp_pi2;
     rvec bo_ij_dBOp;
 
     j = nbr_pj->nbr;
-    r2 = SQR(nbr_pj->d);
+    r2 = SQR( nbr_pj->d );
 
     if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
     {
         C12 = twbp->p_bo1 * POW( nbr_pj->d / twbp->r_s, twbp->p_bo2 );
         BO_s = (1.0 + bo_cut) * EXP( C12 );
     }
-    else BO_s = C12 = 0.0;
+    else
+    {
+        BO_s = C12 = 0.0;
+    }
 
     if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
     {
         C34 = twbp->p_bo3 * POW( nbr_pj->d / twbp->r_p, twbp->p_bo4 );
         BO_pi = EXP( C34 );
     }
-    else BO_pi = C34 = 0.0;
+    else
+    {
+        BO_pi = C34 = 0.0;
+    }
 
     if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
     {
         C56 = twbp->p_bo5 * POW( nbr_pj->d / twbp->r_pp, twbp->p_bo6 );
         BO_pi2 = EXP( C56 );
     }
-    else BO_pi2 = C56 = 0.0;
+    else
+    {
+        BO_pi2 = C56 = 0.0;
+    }
 
     /* Initially BO values are the uncorrected ones, page 1 */
     BO = BO_s + BO_pi + BO_pi2;
@@ -84,7 +87,7 @@ CUDA_DEVICE inline int Dev_BOp (    reax_list bonds, real bo_cut,
         Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
         Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
 
-        if (i < j)
+        if ( i < j )
         {
             ibond = &( bonds.select.bond_list[btop_i] );
             ibond->nbr = j;
@@ -92,8 +95,8 @@ CUDA_DEVICE inline int Dev_BOp (    reax_list bonds, real bo_cut,
             rvec_Copy( ibond->dvec, nbr_pj->dvec );
             ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
 
-            //   //ibond->dbond_index = btop_i;
-            //   //ibond->sym_index = btop_j;
+            //ibond->dbond_index = btop_i;
+            //ibond->sym_index = btop_j;
 
             bo_ij = &( ibond->bo_data );
             bo_ij->BO = BO;
@@ -102,25 +105,27 @@ CUDA_DEVICE inline int Dev_BOp (    reax_list bonds, real bo_cut,
             bo_ij->BO_pi2 = BO_pi2;
 
             /* Only dln_BOp_xx wrt. dr_i is stored here, note that
-            dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-            rvec_Scale(bo_ij->dln_BOp_s, -bo_ij->BO_s * Cln_BOp_s, ibond->dvec);
-            rvec_Scale(bo_ij->dln_BOp_pi, -bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec);
+             * dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
+            rvec_Scale(bo_ij->dln_BOp_s,
+                    -bo_ij->BO_s * Cln_BOp_s, ibond->dvec);
+            rvec_Scale(bo_ij->dln_BOp_pi,
+                    -bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec);
             rvec_Scale(bo_ij->dln_BOp_pi2,
-                       -bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec);
+                    -bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec);
 
             /* Only dBOp wrt. dr_i is stored here, note that
-            dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-            rvec_Scale( bo_ij->dBOp,
-                        -(bo_ij->BO_s * Cln_BOp_s +
-                          bo_ij->BO_pi * Cln_BOp_pi +
-                          bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+             * dBOp/dr_i = -dBOp/dr_j and all others are 0 */
+            rvec_Scale( bo_ij->dBOp, -(bo_ij->BO_s * Cln_BOp_s +
+                        bo_ij->BO_pi * Cln_BOp_pi + bo_ij->BO_pi2 *
+                        Cln_BOp_pi2), ibond->dvec );
 
             rvec_Add( dDeltap_self[i], bo_ij->dBOp );
 
             bo_ij->BO_s -= bo_cut;
             bo_ij->BO -= bo_cut;
 
-            total_bond_order[i] += bo_ij->BO; //currently total_BOp
+            //currently total_BOp
+            total_bond_order[i] += bo_ij->BO; 
 
             bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
 
@@ -133,13 +138,10 @@ CUDA_DEVICE inline int Dev_BOp (    reax_list bonds, real bo_cut,
             rvec_MakeZero (ibond->ta_f);
             rvec_MakeZero (ibond->hb_f);
             rvec_MakeZero (ibond->tf_f);
-
-
         }
         else
         {
-
-            //   //btop_j = End_Index( j, bonds );
+            //btop_j = End_Index( j, bonds );
             btop_j = btop_i;
 
             jbond = &(bonds.select.bond_list[btop_j]);
@@ -152,7 +154,7 @@ CUDA_DEVICE inline int Dev_BOp (    reax_list bonds, real bo_cut,
             //jbond->dbond_index = btop_i;
             //jbond->sym_index = btop_i;
 
-            //Set_End_Index( j, btop_j+1, bonds );
+            //Set_End_Index( j, btop_j + 1, bonds );
 
             bo_ji = &( jbond->bo_data );
             bo_ji->BO = BO;
@@ -166,17 +168,16 @@ CUDA_DEVICE inline int Dev_BOp (    reax_list bonds, real bo_cut,
             rvec_Scale(bo_ij_dln_BOp_s, -BO_s * Cln_BOp_s, nbr_pj->dvec);
             rvec_Scale(bo_ij_dln_BOp_pi, -BO_pi * Cln_BOp_pi, nbr_pj->dvec);
             rvec_Scale(bo_ij_dln_BOp_pi2,
-                       -BO_pi2 * Cln_BOp_pi2, nbr_pj->dvec);
+                    -BO_pi2 * Cln_BOp_pi2, nbr_pj->dvec);
             rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij_dln_BOp_s);
             rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij_dln_BOp_pi );
             rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij_dln_BOp_pi2 );
 
             /* Only dBOp wrt. dr_i is stored here, note that
             dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-            rvec_Scale( bo_ij_dBOp,
-                        -(BO_s * Cln_BOp_s +
-                          BO_pi * Cln_BOp_pi +
-                          BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec );
+            rvec_Scale( bo_ij_dBOp, -(BO_s * Cln_BOp_s +
+                        BO_pi * Cln_BOp_pi +
+                        BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec );
             rvec_Scale( bo_ji->dBOp, -1., bo_ij_dBOp );
 
             rvec_Add( dDeltap_self[i], bo_ji->dBOp );
@@ -193,15 +194,15 @@ CUDA_DEVICE inline int Dev_BOp (    reax_list bonds, real bo_cut,
             rvec_MakeZero (jbond->va_f);
             jbond->ta_CdDelta = 0;
             jbond->ta_Cdbo = 0;
-            rvec_MakeZero (jbond->ta_f);
-            rvec_MakeZero (jbond->hb_f);
-            rvec_MakeZero (jbond->tf_f);
+            rvec_MakeZero( jbond->ta_f );
+            rvec_MakeZero( jbond->hb_f );
+            rvec_MakeZero( jbond->tf_f );
         }
 
-        return 1;
+        return TRUE;
     }
 
-    return 0;
+    return FALSE;
 }
 
 #endif
diff --git a/PG-PuReMD/src/cuda_bonds.cu b/PG-PuReMD/src/cuda/cuda_bonds.cu
similarity index 86%
rename from PG-PuReMD/src/cuda_bonds.cu
rename to PG-PuReMD/src/cuda/cuda_bonds.cu
index 90f1480ba468fe4b8e79960dc23f9183a196606e..a9c3ce30eeb740c66c2fd5e5fb39cc378258a0d1 100644
--- a/PG-PuReMD/src/cuda_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_bonds.cu
@@ -19,23 +19,19 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "reax_types.h"
-#include "index_utils.h"
-#include "reax_types.h"
-#include "dev_list.h"
-
-
-CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, 
-        global_parameters gp, 
-        single_body_parameters *sbp, 
-        two_body_parameters *tbp, 
-        storage p_workspace, 
-        reax_list p_bonds, 
-        int n, int num_atom_types, 
-        real *e_bond
-        )
+#include "cuda_bonds.h"
+
+#include "cuda_list.h"
+
+#include "../index_utils.h"
+
+
+CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, 
+        single_body_parameters *sbp, two_body_parameters *tbp, 
+        storage p_workspace, reax_list p_bonds, int n, int num_atom_types, 
+        real *e_bond )
 {
-    int i, j, pj, natoms;
+    int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
     real ebond, pow_BOs_be2, exp_be12, CEbo;
@@ -49,7 +45,11 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms,
     storage *workspace;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= n) return;
+
+    if ( i >= n )
+    {
+        return;
+    }
 
     bonds = &( p_bonds);
     workspace = &( p_workspace );
@@ -59,14 +59,15 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms,
     gp10 = gp.l[10];
     gp37 = (int) gp.l[37];
 
-    //for( i = 0; i < natoms; ++i ) {
     start_i = Dev_Start_Index(i, bonds);
     end_i = Dev_End_Index(i, bonds);
 
-    for( pj = start_i; pj < end_i; ++pj ) {
+    for ( pj = start_i; pj < end_i; ++pj )
+    {
         j = bonds->select.bond_list[pj].nbr;
 
-        if( my_atoms[i].orig_id <= my_atoms[j].orig_id ) {
+        if ( my_atoms[i].orig_id <= my_atoms[j].orig_id )
+        {
             /* set the pointers */
             type_i = my_atoms[i].type;
             type_j = my_atoms[j].type;
@@ -83,10 +84,10 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms,
                 ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
 
             /* calculate the Bond Energy */
-            e_bond[ i ] += ebond = 
-                -twbp->De_s * bo_ij->BO_s * exp_be12 
+            ebond = -twbp->De_s * bo_ij->BO_s * exp_be12 
                 -twbp->De_p * bo_ij->BO_pi 
                 -twbp->De_pp * bo_ij->BO_pi2;
+            e_bond[ i ] += ebond;
 
             /* calculate derivatives of Bond Orders */
             bo_ij->Cdbo += CEbo;
@@ -100,17 +101,21 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms,
                     system->my_atoms[j].orig_id, 
                     bo_ij->BO, ebond, data->my_en.e_bond );
 #endif
+
 #ifdef TEST_FORCES
             Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
             Add_dBOpinpi2( system, lists, i, pj, 
                     -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
                     workspace->f_be, workspace->f_be );
 #endif
+
             /* Stabilisation terminal triple bond */
-            if( bo_ij->BO >= 1.00 ) {
-                if( gp37 == 2 ||
+            if ( bo_ij->BO >= 1.00 )
+            {
+                if ( gp37 == 2 ||
                         (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || 
-                        (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) {
+                        (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) )
+                {
                     exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
                     exphua1 = EXP(-gp3 * (workspace->total_bond_order[i]-bo_ij->BO));
                     exphub1 = EXP(-gp3 * (workspace->total_bond_order[j]-bo_ij->BO));
@@ -118,7 +123,7 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms,
                     hulpov = 1.0 / (1.0 + 25.0 * exphuov);
 
                     estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
-                    e_bond [i] += estriph;
+                    e_bond[i] += estriph;
 
                     decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) *
                         ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
@@ -130,12 +135,14 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms,
                     bo_ij->Cdbo += decobdbo;
                     workspace->CdDelta[i] += decobdboua;
                     workspace->CdDelta[j] += decobdboub;
+
 #ifdef TEST_ENERGY
                     //fprintf( out_control->ebond, 
                     //  "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
                     //  system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
                     //  estriph, decobdbo, decobdboua, decobdboub );
 #endif
+
 #ifdef TEST_FORCES
                     Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
                     Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
@@ -145,5 +152,4 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms,
             }
         }
     }
-    //  }
 }
diff --git a/PG-PuReMD/src/cuda/cuda_bonds.h b/PG-PuReMD/src/cuda/cuda_bonds.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd9126bee432f48b13de67f7680468907ff484ea
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_bonds.h
@@ -0,0 +1,33 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_BONDS_H_
+#define __CUDA_BONDS_H_
+
+#include "../reax_types.h"
+
+
+CUDA_GLOBAL void Cuda_Bonds( reax_atom *, global_parameters,
+        single_body_parameters *, two_body_parameters *, storage,
+        reax_list, int, int, real * );
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_box.cu b/PG-PuReMD/src/cuda/cuda_box.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2d5f47566f2543a21f6cf4598f30b527913a1bf6
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_box.cu
@@ -0,0 +1,118 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "cuda_box.h"
+
+#include "cuda_integrate.h"
+#include "cuda_system_props.h"
+#include "cuda_utils.h"
+
+#include "../box.h"
+#include "../comm_tools.h"
+
+/* Rescale the box and atoms for the pressure-coupled ensembles (iNPT: isotropic,
+ * sNPT: per axis), then refresh the device copies of big_box, my_box and my_ext_box. */
+void Cuda_Scale_Box( reax_system *system, control_params *control,
+        simulation_data *data, mpi_datatypes *mpi_data )
+{
+    int d;
+    real dt, lambda;
+    rvec mu = {0.0, 0.0, 0.0}; /* NOTE(review): stays 0 outside iNPT / sNPT */
+
+    dt = control->dt;
+
+    /* pressure scaler, clamped to [MIN_dV, MAX_dV] */
+    if ( control->ensemble == iNPT )
+    {
+        mu[0] = POW( 1.0 + (dt / control->Tau_P[0]) * (data->iso_bar.P - control->P[0]),
+                1.0 / 3.0 );
+
+        if ( mu[0] < MIN_dV )
+        {
+            mu[0] = MIN_dV;
+        }
+        else if ( mu[0] > MAX_dV )
+        {
+            mu[0] = MAX_dV;
+        }
+
+        mu[1] = mu[0];
+        mu[2] = mu[1];
+    }
+    else if ( control->ensemble == sNPT )
+    {
+        for ( d = 0; d < 3; ++d )
+        {
+            mu[d] = POW(1.0 + (dt / control->Tau_P[d]) * (data->tot_press[d] - control->P[d]),
+                        1. / 3 );
+
+            if ( mu[d] < MIN_dV )
+            {
+                mu[d] = MIN_dV;
+            }
+            else if ( mu[d] > MAX_dV )
+            {
+                mu[d] = MAX_dV;
+            }
+        }
+    }
+
+    /* temperature scaler, clamped to [MIN_dT, MAX_dT] before the square root */
+    lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
+    if ( lambda < MIN_dT )
+    {
+        lambda = MIN_dT;
+    }
+    else if ( lambda > MAX_dT )
+    {
+        lambda = MAX_dT;
+    }
+    lambda = SQRT( lambda );
+
+    /* Scale velocities and positions at t+dt */
+    bNVP_scale_velocities( system, lambda, mu );
+    Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+
+#if defined(DEBUG)
+    fprintf( stderr, "damping - " );
+#endif
+
+    /* update box & grid */
+    system->big_box.box[0][0] *= mu[0];
+    system->big_box.box[1][1] *= mu[1];
+    system->big_box.box[2][2] *= mu[2];
+
+    Make_Consistent( &(system->big_box) );
+    Setup_My_Box( system, control );
+    Setup_My_Ext_Box( system, control );
+    Update_Comm( system );
+
+    /* d_*_box hold device addresses: pass the pointers themselves, not their host addresses */
+    copy_host_device( &system->big_box, system->d_big_box,
+            sizeof(simulation_box), cudaMemcpyHostToDevice, "Cuda_Scale_Box::simulation_data->big_box" );
+    copy_host_device( &system->my_box, system->d_my_box,
+            sizeof(simulation_box), cudaMemcpyHostToDevice, "Cuda_Scale_Box::simulation_data->my_box" );
+    copy_host_device( &system->my_ext_box, system->d_my_ext_box,
+            sizeof(simulation_box), cudaMemcpyHostToDevice, "Cuda_Scale_Box::simulation_data->my_ext_box" );
+#if defined(DEBUG)
+    fprintf( stderr, "box & grid updated - " );
+#endif
+}
diff --git a/PG-PuReMD/src/cuda_integrate.h b/PG-PuReMD/src/cuda/cuda_box.h
similarity index 81%
rename from PG-PuReMD/src/cuda_integrate.h
rename to PG-PuReMD/src/cuda/cuda_box.h
index 3d1d0685e5de732ca0d1cf9b3f0e5d19de5f6ca4..6db597385884cbaf4efe81bb83674c5ab2e48786 100644
--- a/PG-PuReMD/src/cuda_integrate.h
+++ b/PG-PuReMD/src/cuda/cuda_box.h
@@ -19,21 +19,22 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __INTEGRATE_H_
-#define __INTEGRATE_H_
+#ifndef __CUDA_BOX_H__
+#define __CUDA_BOX_H__
+
+#include "../reax_types.h"
 
-#include "reax_types.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void bNVT_update_velocity_part1 (reax_system *, real );
-void bNVT_update_velocity_part2 (reax_system *, real );
-void bNVT_scale_velocities (reax_system *, real );
+void Cuda_Scale_Box( reax_system *, control_params *,
+        simulation_data *, mpi_datatypes *);
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8452548d07beffee1a2958980f21fc1a6516fcf4
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -0,0 +1,296 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "cuda_charges.h"
+
+#include "cuda_lin_alg.h"
+#include "cuda_reduction.h"
+#include "cuda_utils.h"
+#include "cuda_validation.h"
+
+#include "../basic_comm.h"
+
+
+CUDA_GLOBAL void k_init_matvec( reax_atom *my_atoms, single_body_parameters
+        *sbp, storage p_workspace, int n  )
+{
+    /* Per-atom initialisation for the charge solver, one thread per atom:
+     * fills the pre-conditioner entry, the right-hand sides and the initial
+     * guesses stored in the workspace arrays. Threads whose index is at or
+     * beyond n leave everything untouched. */
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( tid < n )
+    {
+        storage * const ws = &p_workspace;
+        reax_atom * const a = &my_atoms[tid];
+        single_body_parameters * const par = &sbp[ a->type ];
+
+        /* pre-conditioner entry for H: reciprocal of the type's eta */
+        ws->Hdia_inv[tid] = 1. / par->eta;
+
+        /* right-hand sides, both as separate vectors and as the paired b */
+        ws->b_s[tid] = -par->chi;
+        ws->b_t[tid] = -1.0;
+        ws->b[tid][0] = -par->chi;
+        ws->b[tid][1] = -1.0;
+
+        /* initial guess for x[.][1], extrapolated from the t history */
+        ws->x[tid][1] = a->t[2] + 3 * ( a->t[0] - a->t[1] );
+
+        /* initial guess for x[.][0], extrapolated from the s history */
+        ws->x[tid][0] = 4*(a->s[0]+a->s[2])-(6*a->s[1]+a->s[3]);
+    }
+}
+
+/* Launch k_init_matvec, one thread per local atom, on the global
+ * dev_workspace; the workspace argument itself is not read. */
+void Cuda_Init_MatVec( reax_system *system, storage *workspace )
+{
+    int blocks;
+
+    blocks = system->n / DEF_BLOCK_SIZE + (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+
+    k_init_matvec <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp,
+          *dev_workspace, system->n );
+    cudaDeviceSynchronize( ); /* replaces the deprecated cudaThreadSynchronize */
+    cudaCheckError( );
+}
+
+/* Reduce dev_workspace->x over the system->n local atoms in two kernel passes
+ * (n entries, then the per-block results) and copy the rvec2 sum into my_sum. */
+void cuda_charges_x( reax_system *system, rvec2 my_sum )
+{
+    int blocks;
+    rvec2 *output = (rvec2 *) scratch; /* partial sums live in the global scratch buffer */
+
+    cuda_memset( output, 0, sizeof(rvec2) * 2 * system->n, "cuda_charges_x:q" );
+
+    blocks = system->n / DEF_BLOCK_SIZE + (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+
+    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
+        ( dev_workspace->x, output, system->n );
+    cudaDeviceSynchronize( ); /* replaces the deprecated cudaThreadSynchronize */
+    cudaCheckError( );
+
+    k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
+        ( output, output + system->n, blocks ); /* final value lands at output[n] */
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( my_sum, output + system->n,
+            sizeof(rvec2), cudaMemcpyDeviceToHost, "charges:x" );
+}
+
+
+CUDA_GLOBAL void k_calculate_st( reax_atom *my_atoms, storage p_workspace,
+        real u, real *q, int n )
+{
+    /* One thread per local atom: combine the two solution components held in
+     * the workspace's x array into the atom's charge, export that charge
+     * through q, and record the components as the newest entries of the
+     * atom's s / t histories.
+     * Threads whose index is at or beyond n do nothing. */
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    int k;
+
+    if ( tid < n )
+    {
+        reax_atom * const a = &my_atoms[tid];
+        /* x[tid][0] and x[tid][1] take the roles of s and t */
+        const real s_new = p_workspace.x[tid][0];
+        const real t_new = p_workspace.x[tid][1];
+
+        /* charge: q = s - u * t, stored in the atom and in the output array */
+        a->q = s_new - u * t_new;
+        q[tid] = a->q;
+
+        /* shift both histories by one slot; the entries at index 3 are dropped */
+        for ( k = 3; k > 0; --k )
+        {
+            a->s[k] = a->s[k - 1];
+            a->t[k] = a->t[k - 1];
+        }
+
+        /* newest values go to the front */
+        a->s[0] = s_new;
+        a->t[0] = t_new;
+    }
+}
+//TODO: using the function argument (output) results in an "Address not
+//TODO: mapped" / "Invalid permissions" error followed by a segmentation
+//TODO: fault, so a local variable (which refers to a global buffer anyway)
+//TODO: is used instead.
+//TODO: This issue NEEDS FURTHER INVESTIGATION.
+//TODO
+//TODO
+
+/* Run k_calculate_st over the system->n local atoms and copy the charges it
+ * writes to the device scratch buffer into the host array output. */
+extern "C" void cuda_charges_st( reax_system *system, storage *workspace,
+        real *output, real u )
+{
+    int blocks;
+    real *tmp = (real *) scratch;
+    real *tmp_output = (real *) host_scratch;
+
+    cuda_memset( tmp, 0, sizeof (real) * system->n, "charges:q" );
+    memset( tmp_output, 0, sizeof (real) * system->n ); /* host_scratch is only cleared here */
+
+    blocks = system->n / DEF_BLOCK_SIZE + (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+
+    k_calculate_st <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, *dev_workspace, u, tmp, system->n );
+    cudaDeviceSynchronize( ); /* replaces the deprecated cudaThreadSynchronize */
+    cudaCheckError( );
+
+    copy_host_device( output, tmp, sizeof (real) * system->n,
+            cudaMemcpyDeviceToHost, "charges:q" );
+}
+//TODO
+//TODO
+//TODO
+//TODO
+//TODO
+//TODO
+//TODO
+
+
+CUDA_GLOBAL void k_update_q( reax_atom *my_atoms, real *q, int n, int N )
+{
+    /* thread t copies q[n + t] into atom n + t; atoms 0 .. n-1 are not touched */
+    const int t = blockIdx.x * blockDim.x + threadIdx.x;
+    const int remaining = N - n;
+
+    if ( t < remaining )
+    {
+        const int dst = t + n;
+
+        my_atoms[dst].q = q[dst];
+    }
+}
+
+/* Upload the N host charges in q and store entries n .. N-1 into the device atoms. */
+void cuda_charges_updateq( reax_system *system, real *q )
+{
+    int blocks;
+    real *dev_q = (real *) scratch;
+
+    copy_host_device( q, dev_q, system->N * sizeof (real),
+            cudaMemcpyHostToDevice, "charges:q" );
+    blocks = (system->N - system->n) / DEF_BLOCK_SIZE +
+        (( (system->N - system->n) % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+    /* NOTE(review): blocks is 0 when N == n; confirm callers never reach that case */
+    k_update_q <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, dev_q, system->n, system->N );
+    cudaDeviceSynchronize( ); /* replaces the deprecated cudaThreadSynchronize */
+    cudaCheckError( );
+}
+
+/* Compute u from the globally summed solver vectors, then the atom charges. */
+void Cuda_Calculate_Charges( reax_system *system, storage *workspace,
+        mpi_datatypes *mpi_data )
+{
+    int scale;
+    real u;
+    rvec2 my_sum, all_sum;
+    real *q; /* host staging buffer for the charges (global host_scratch) */
+
+    my_sum[0] = 0.0;
+    my_sum[1] = 0.0;
+    scale = (int) sizeof(real); /* was sizeof(real) / sizeof(void): GNU-only, same value */
+    q = (real *) host_scratch;
+    memset( q, 0, system->N * sizeof(real) );
+
+    cuda_charges_x( system, my_sum );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "Device: my_sum[0]: %f, my_sum[1]: %f\n",
+            my_sum[0], my_sum[1] );
+#endif
+
+    MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );
+
+    u = all_sum[0] / all_sum[1]; /* ratio of the two globally summed components */
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "Device: u: %f \n", u );
+#endif
+
+    cuda_charges_st( system, workspace, q, u );
+
+    Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer );
+
+    cuda_charges_updateq( system, q );
+}
+
+/* Charge driver: Cuda_Init_MatVec, one Cuda_dual_CG solve, then Cuda_Calculate_Charges. */
+void Cuda_QEq( reax_system *system, control_params *control, simulation_data
+        *data, storage *workspace, output_controls *out_control, mpi_datatypes
+        *mpi_data )
+{
+    int s_matvecs, t_matvecs;
+
+    Cuda_Init_MatVec( system, workspace );
+
+    //if (data->step > 0) {
+    //    compare_rvec2 (workspace->b, dev_workspace->b, system->n, "b");
+    //    compare_rvec2 (workspace->x, dev_workspace->x, system->n, "x");
+    // compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
+    // compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
+    //}
+
+//#ifdef __CUDA_DEBUG__
+//  Init_MatVec( system, data, control, workspace, mpi_data );
+//#endif
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
+    //Print_Linear_System( system, control, workspace, data->step );
+#endif
+
+    /* Cuda_dual_CG gets the paired vectors b and x; its return value is booked as s_matvecs */
+    s_matvecs = Cuda_dual_CG( system, control, workspace, &dev_workspace->H,
+            dev_workspace->b, control->cm_solver_q_err, dev_workspace->x, mpi_data,
+            out_control->log, data );
+    t_matvecs = 0;
+    //fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs);
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: first CG completed\n", system->my_rank );
+#endif
+
+    Cuda_Calculate_Charges( system, workspace, mpi_data );
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: computed charges\n", system->my_rank );
+    //Print_Charges( system );
+#endif
+
+#if defined(LOG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        data->timing.s_matvecs += s_matvecs;
+        data->timing.t_matvecs += t_matvecs;
+    }
+#endif
+}
diff --git a/PG-PuReMD/src/cuda_bonds.h b/PG-PuReMD/src/cuda/cuda_charges.h
similarity index 65%
rename from PG-PuReMD/src/cuda_bonds.h
rename to PG-PuReMD/src/cuda/cuda_charges.h
index d8a7d273c80559a89f5fc5ba63637806add8ea72..d1922a48b83a6df1b27a14efb38e63d153f89111 100644
--- a/PG-PuReMD/src/cuda_bonds.h
+++ b/PG-PuReMD/src/cuda/cuda_charges.h
@@ -19,19 +19,30 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __CUDA_BONDS_H_
-#define __CUDA_BONDS_H_
-
-#include "reax_types.h"
-
-CUDA_GLOBAL void Cuda_Bonds(    reax_atom *,
-                                global_parameters ,
-                                single_body_parameters *,
-                                two_body_parameters *,
-                                storage ,
-                                reax_list ,
-                                int , int ,
-                                real *
-                           );
+#ifndef __CUDA_CHARGES_H_
+#define __CUDA_CHARGES_H_
+
+#include "../reax_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+void Cuda_Init_MatVec( reax_system *, storage * );
+
+void cuda_charges_x( reax_system *, rvec2 );
+
+void cuda_charges_st( reax_system *, storage *, real *, real );
+
+void cuda_charges_updateq( reax_system *, real * );
+
+void Cuda_QEq( reax_system*, control_params*, simulation_data*,
+        storage*, output_controls*, mpi_datatypes* );
+
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_copy.cu b/PG-PuReMD/src/cuda/cuda_copy.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4912c42a410af43e364ccec4e746f6ce375ef27b
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_copy.cu
@@ -0,0 +1,181 @@
+
+#include "cuda_copy.h"
+
+#include "cuda_utils.h"
+
+#include "../list.h"
+#include "../vector.h"
+
+
+/* Mirror the host grid into the grid descriptor used for the device */
+void Sync_Grid( grid *host, grid *device )
+{
+    const int cells = host->ncells[0] * host->ncells[1] * host->ncells[2];
+    const int nbrs_per_cell = host->max_nbrs;
+
+    /* scalar members */
+    device->ghost_cut = host->ghost_cut;
+    device->max_nbrs = nbrs_per_cell;
+
+    /* fixed-size vector members, assigned from host code */
+    ivec_Copy( device->ncells, host->ncells );
+    rvec_Copy( device->cell_len, host->cell_len );
+    rvec_Copy( device->inv_len, host->inv_len );
+    ivec_Copy( device->bond_span, host->bond_span );
+    ivec_Copy( device->nonb_span, host->nonb_span );
+    ivec_Copy( device->vlist_span, host->vlist_span );
+    ivec_Copy( device->native_cells, host->native_cells );
+    ivec_Copy( device->native_str, host->native_str );
+    ivec_Copy( device->native_end, host->native_end );
+    ivec_Copy( device->ghost_span, host->ghost_span );
+    ivec_Copy( device->ghost_nonb_span, host->ghost_nonb_span );
+    ivec_Copy( device->ghost_hbond_span, host->ghost_hbond_span );
+    ivec_Copy( device->ghost_bond_span, host->ghost_bond_span );
+
+    /* arrays with one entry per grid cell */
+    copy_host_device( host->str, device->str, sizeof(int) * cells,
+            cudaMemcpyHostToDevice, "grid:str" );
+    copy_host_device( host->end, device->end, sizeof(int) * cells,
+            cudaMemcpyHostToDevice, "grid:end" );
+    copy_host_device( host->cutoff, device->cutoff, sizeof(real) * cells,
+            cudaMemcpyHostToDevice, "grid:cutoff" );
+    copy_host_device( host->rel_box, device->rel_box, sizeof(ivec) * cells,
+            cudaMemcpyHostToDevice, "grid:rel_box" );
+
+    /* arrays with max_nbrs entries per grid cell */
+    copy_host_device( host->nbrs_x, device->nbrs_x, sizeof(ivec) * cells *
+            nbrs_per_cell, cudaMemcpyHostToDevice, "grid:nbrs_x" );
+    copy_host_device( host->nbrs_cp, device->nbrs_cp, sizeof(rvec) * cells *
+            nbrs_per_cell, cudaMemcpyHostToDevice, "grid:nbrs_cp" );
+}
+
+
+/* Copy the first sys->N atoms (not the full sys->total_cap capacity) from host to device */
+void Sync_Atoms( reax_system *sys )
+{
+    //TODO: the total_cap-sized copy disabled below core dumped on one developer's machine
+//    copy_host_device( sys->my_atoms, sys->d_my_atoms, sizeof(reax_atom) * sys->total_cap,
+//            cudaMemcpyHostToDevice, "Sync_Atoms::system->my_atoms" );
+
+#if defined(__CUDA_DEBUG_LOG__)
+    fprintf( stderr, "p:%d - Synching atoms: n: %d N: %d, total_cap: %d \n", 
+            sys->my_rank, sys->n, sys->N, sys->total_cap );
+#endif
+
+    copy_host_device( sys->my_atoms, sys->d_my_atoms, sizeof(reax_atom) * sys->N,
+            cudaMemcpyHostToDevice, "Sync_Atoms::system->my_atoms" );
+    //TODO: the N-sized copy above is the workaround; the root cause is still open
+}
+
+
+/* Copy atomic system info (atoms, boxes, force field tables) from host to device */
+void Sync_System( reax_system *sys )
+{
+    /* entry counts of the 2-, 3- and 4-index tables, kept in integer arithmetic
+     * so the byte counts below never pass through POW()'s floating-point result */
+    const size_t nt = sys->reax_param.num_atom_types;
+    const size_t nt2 = nt * nt, nt3 = nt2 * nt, nt4 = nt3 * nt;
+
+    Sync_Atoms( sys );
+
+    copy_host_device( &sys->my_box, sys->d_my_box, sizeof(simulation_box),
+            cudaMemcpyHostToDevice, "Sync_System::system->my_box" );
+    copy_host_device( &sys->my_ext_box, sys->d_my_ext_box, sizeof(simulation_box),
+            cudaMemcpyHostToDevice, "Sync_System::system->my_ext_box" );
+
+    /* force field parameter tables, indexed by 1 to 4 atom types */
+    copy_host_device( sys->reax_param.sbp, sys->reax_param.d_sbp,
+            sizeof(single_body_parameters) * nt, cudaMemcpyHostToDevice, "Sync_System::system->sbp" );
+    copy_host_device( sys->reax_param.tbp, sys->reax_param.d_tbp,
+            sizeof(two_body_parameters) * nt2, cudaMemcpyHostToDevice, "Sync_System::system->tbp" );
+    copy_host_device( sys->reax_param.thbp, sys->reax_param.d_thbp,
+            sizeof(three_body_header) * nt3, cudaMemcpyHostToDevice, "Sync_System::system->thbh" );
+    copy_host_device( sys->reax_param.hbp, sys->reax_param.d_hbp,
+            sizeof(hbond_parameters) * nt3, cudaMemcpyHostToDevice, "Sync_System::system->hbond" );
+    copy_host_device( sys->reax_param.fbp, sys->reax_param.d_fbp,
+            sizeof(four_body_header) * nt4, cudaMemcpyHostToDevice, "Sync_System::system->four_header" );
+
+    /* global parameters: the l array is copied, the scalar fields are assigned host-side */
+    copy_host_device( sys->reax_param.gp.l, sys->reax_param.d_gp.l,
+            sizeof(real) * sys->reax_param.gp.n_global, cudaMemcpyHostToDevice,
+            "Sync_System::system->global_parameters" );
+
+    sys->reax_param.d_gp.n_global = sys->reax_param.gp.n_global;
+    sys->reax_param.d_gp.vdw_type = sys->reax_param.gp.vdw_type;
+}
+
+/* Copy atom info from device to host.
+ * NOTE(review): copies sys->total_cap atoms while Sync_Atoms uploads sys->N; confirm intended. */
+void Output_Sync_Atoms( reax_system *sys )
+{
+    copy_host_device( sys->my_atoms, sys->d_my_atoms, sizeof(reax_atom) *
+            sys->total_cap, cudaMemcpyDeviceToHost, "system:my_atoms" );
+}
+
+/* Copy simulation data from device to host: only the energy terms (my_en), kin_press,
+ * int_press and ext_press are transferred by this function. */
+void Output_Sync_Simulation_Data( simulation_data *host, simulation_data *dev )
+{
+    copy_host_device( &host->my_en, &dev->my_en, sizeof(energy_data), 
+            cudaMemcpyDeviceToHost, "simulation_data:energy_data" );
+    copy_host_device( &host->kin_press, &dev->kin_press, sizeof(real), 
+            cudaMemcpyDeviceToHost, "simulation_data:kin_press" );
+    copy_host_device( host->int_press, dev->int_press, sizeof(rvec), 
+            cudaMemcpyDeviceToHost, "simulation_data:int_press" );
+    copy_host_device( host->ext_press, dev->ext_press, sizeof(rvec), 
+            cudaMemcpyDeviceToHost, "simulation_data:ext_press" );
+}
+
+/* Copy interaction lists from device to host: the host list is re-created with the device
+ * list's n and num_intrs, then the index arrays and the type-specific entries are copied. */
+void Output_Sync_Lists( reax_list *host, reax_list *device, int type )
+{
+#if defined(DEBUG)
+    fprintf( stderr, " Trying to copy *%d* list from device to host \n", type );
+#endif
+
+    if ( host->allocated == TRUE )
+    {
+        Delete_List( host );
+    }
+    Make_List( device->n, device->num_intrs, type, host );
+
+    copy_host_device( host->index, device->index, sizeof(int) * device->n,
+            cudaMemcpyDeviceToHost, "Output_Sync_Lists::list->index" );
+    copy_host_device( host->end_index, device->end_index, sizeof(int) *
+            device->n, cudaMemcpyDeviceToHost, "Output_Sync_Lists::list->end_index" );
+
+    switch ( type )
+    {   
+        case TYP_FAR_NEIGHBOR:
+            copy_host_device( host->select.far_nbr_list, device->select.far_nbr_list,
+                    sizeof(far_neighbor_data) * device->num_intrs,
+                    cudaMemcpyDeviceToHost, "Output_Sync_Lists::far_neighbor_list" );
+            break;
+
+        case TYP_BOND:
+            copy_host_device( host->select.bond_list, device->select.bond_list,
+                    sizeof(bond_data) * device->num_intrs,
+                    cudaMemcpyDeviceToHost, "Output_Sync_Lists::bond_list" );
+            break;
+
+        case TYP_HBOND:
+            copy_host_device( host->select.hbond_list, device->select.hbond_list,
+                    sizeof(hbond_data) * device->num_intrs,
+                    cudaMemcpyDeviceToHost, "Output_Sync_Lists::hbond_list" );
+            break;
+
+        case TYP_THREE_BODY:
+            copy_host_device( host->select.three_body_list,
+                    device->select.three_body_list,
+                    sizeof(three_body_interaction_data )* device->num_intrs,
+                    cudaMemcpyDeviceToHost, "Output_Sync_Lists::three_body_list" );
+            break;
+
+        default:
+            fprintf( stderr, "Unknown list synching from device to host ---- > %d \n",
+                    type );
+            exit( INVALID_INPUT );
+            break; /* not reached: exit() does not return */
+    }  
+}
diff --git a/PG-PuReMD/src/cuda/cuda_copy.h b/PG-PuReMD/src/cuda/cuda_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..72bf992c581950a95ea3d2fa1c7684a6f4fe1c06
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_copy.h
@@ -0,0 +1,30 @@
+#ifndef __CUDA_COPY_H_
+#define __CUDA_COPY_H_
+
+#include "../reax_types.h"
+
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
+void Sync_Atoms( reax_system * );
+
+void Sync_Grid( grid *, grid * );
+
+void Sync_System( reax_system * );
+
+void Prep_Device_For_Output( reax_system *, simulation_data * );
+
+void Output_Sync_Lists( reax_list *host, reax_list *device, int type );
+
+void Output_Sync_Atoms( reax_system * );
+
+void Output_Sync_Simulation_Data( simulation_data *, simulation_data * );
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda_environment.cu b/PG-PuReMD/src/cuda/cuda_environment.cu
similarity index 54%
rename from PG-PuReMD/src/cuda_environment.cu
rename to PG-PuReMD/src/cuda/cuda_environment.cu
index be6101a21af4af5d6f1688ad023a5aaa330263a9..c35bad7c5d73c6547f991222b7c8f8b6c1c15747 100644
--- a/PG-PuReMD/src/cuda_environment.cu
+++ b/PG-PuReMD/src/cuda/cuda_environment.cu
@@ -9,21 +9,27 @@ extern "C" void Setup_Cuda_Environment(int rank, int nprocs, int gpus_per_node)
     int deviceCount;
     cudaError_t flag;
     
-    flag = cudaGetDeviceCount(&deviceCount);
+    flag = cudaGetDeviceCount( &deviceCount );
 
-    if ( flag != cudaSuccess )
+    if ( flag != cudaSuccess || deviceCount < 1 )
     {
-        fprintf( stderr, "ERROR: no CUDA capable device(s) found. Terminating...\n" );
+        fprintf( stderr, "[ERROR] no CUDA capable device(s) found. Terminating...\n" );
         exit( CANNOT_INITIALIZE );
     }
+    else if ( deviceCount < gpus_per_node || gpus_per_node < 1 )
+    {
+        fprintf( stderr, "[ERROR] invalid number of CUDA capable devices requested (gpus_per_node = %d). Terminating...\n",
+                gpus_per_node );
+        exit( INVALID_INPUT );
+    }
 
     //Calculate the # of GPUs per processor
     //and assign the GPU for each process
     //TODO: handle condition where # CPU procs > # GPUs
-    cudaSetDevice( (rank % (deviceCount)) );
+    cudaSetDevice( rank % gpus_per_node );
 
 #if defined(__CUDA_DEBUG__)
-    fprintf( stderr, "p:%d is using GPU: %d \n", rank, (rank % deviceCount));
+    fprintf( stderr, "p:%d is using GPU: %d \n", rank, rank % gpus_per_node );
 #endif
 
     //CHANGE ORIGINAL
diff --git a/PG-PuReMD/src/cuda_environment.h b/PG-PuReMD/src/cuda/cuda_environment.h
similarity index 56%
rename from PG-PuReMD/src/cuda_environment.h
rename to PG-PuReMD/src/cuda/cuda_environment.h
index f8ae3cd0024b6585e32cf34c46d889d84693b806..1cbcc92c5d29bfa28649bfd9e9815ba332d4cbe8 100644
--- a/PG-PuReMD/src/cuda_environment.h
+++ b/PG-PuReMD/src/cuda/cuda_environment.h
@@ -2,15 +2,19 @@
 #ifndef __CUDA_ENVIRONMENT_H__
 #define __CUDA_ENVIRONMENT_H__
 
+#include "../reax_types.h"
+
+
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-void Setup_Cuda_Environment (int, int, int);
-void Cleanup_Cuda_Environment ();
+void Setup_Cuda_Environment( int, int, int );
+void Cleanup_Cuda_Environment( );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
new file mode 100644
index 0000000000000000000000000000000000000000..28982bffa1d0ab72cbc80032d790798fd95ba1fb
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -0,0 +1,1911 @@
+
+#include "cuda_forces.h"
+
+#include "cuda_bonds.h"
+#include "cuda_bond_orders.h"
+#include "cuda_charges.h"
+#include "cuda_helpers.h"
+#include "cuda_hydrogen_bonds.h"
+#include "cuda_lin_alg.h"
+#include "cuda_list.h"
+#include "cuda_multi_body.h"
+#include "cuda_neighbors.h"
+#include "cuda_nonbonded.h"
+#include "cuda_reduction.h"
+#include "cuda_torsion_angles.h"
+#include "cuda_utils.h"
+#include "cuda_valence_angles.h"
+#include "cuda_validation.h"
+
+#include "../basic_comm.h"
+#include "../forces.h"
+#include "../index_utils.h"
+#include "../tool_box.h"
+#include "../vector.h"
+
+
+CUDA_GLOBAL void k_disable_hydrogen_bonding( control_params *control )
+{
+    control->hbond_cut = 0.0; /* kernels in this file test hbond_cut > 0.0 before doing H-bond work */
+}
+
+
+/* Derive the end index of every list entry: end = start + interaction count.
+ * Thread i handles entry i; threads with i >= N do nothing. */
+CUDA_GLOBAL void k_init_end_index( int * intr_cnt, int *indices, int *end_indices, int N )
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( tid < N )
+    {
+        const int first = indices[tid];
+
+        end_indices[tid] = first + intr_cnt[tid];
+    }
+}
+
+
+/* Give every atom the identity hydrogen bond slot: Hindex = atom index.
+ * Thread i handles atom i; threads with i >= N do nothing. */
+CUDA_GLOBAL void k_setup_hindex( reax_atom *my_atoms, int N )
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( tid < N )
+    {
+        reax_atom * const atom = &my_atoms[tid];
+
+        atom->Hindex = tid;
+    }
+}
+
+
+/* Set the start/end indices of each atom's slot in the hydrogen bond list.
+ *
+ * atoms: atom info; num_hbonds of every atom is overwritten
+ * sbp: single body parameters, indexed by atom type
+ * hbonds: num. of hydrogen bonds per atom
+ * max_hbonds: start offset per atom (the caller in this file passes the
+ *  exclusive scan of the per-atom maxima)
+ * indices, end_indices: list indices to fill, addressed by Hindex
+ * N: num. of atoms, one thread per atom */
+CUDA_GLOBAL void k_init_hbond_indices( reax_atom * atoms, single_body_parameters *sbp,
+        int *hbonds, int *max_hbonds, int *indices, int *end_indices, int N )
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( tid < N )
+    {
+        const int slot = atoms[tid].Hindex;
+        const int kind = sbp[ atoms[tid].type ].p_hbond;
+        /* only H atoms and H bonding atoms get a non-empty slot */
+        const int takes_part = ( kind == H_ATOM || kind == H_BONDING_ATOM );
+        const int count = takes_part ? hbonds[tid] : 0;
+        const int first = takes_part ? max_hbonds[tid] : 0;
+
+        indices[slot] = first;
+        end_indices[slot] = first + count;
+
+        atoms[tid].num_hbonds = count;
+    }
+}
+
+
+/* Rebuild start/end indices of the far neighbors list post reallocation
+ *
+ * system: atomic system info. */
+void Cuda_Init_Neighbor_Indices( reax_system *system )
+{
+    reax_list * const nbr_list = *dev_lists + FAR_NBRS;
+    /* one thread per atom, block count rounded up */
+    const int num_blocks = system->N / DEF_BLOCK_SIZE
+        + ((system->N % DEF_BLOCK_SIZE != 0) ? 1 : 0);
+
+    /* start indices: exclusive scan of d_max_far_nbrs */
+    Cuda_Scan_Excl_Sum( system->d_max_far_nbrs, nbr_list->index, system->total_cap );
+
+    /* end indices: start index plus d_far_nbrs */
+    k_init_end_index <<< num_blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_far_nbrs, nbr_list->index, nbr_list->end_index, system->N );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+/* Rebuild start/end indices of the hydrogen bonds list post reallocation;
+ * every atom's Hindex is reset to its own atom index on the way.
+ *
+ * system: atomic system info. */
+void Cuda_Init_HBond_Indices( reax_system *system )
+{
+    reax_list * const hbond_list = *dev_lists + HBONDS;
+    /* the global scratch buffer receives the scanned start offsets */
+    int * const offsets = (int *) scratch;
+    /* one thread per atom, block count rounded up */
+    const int num_blocks = system->N / DEF_BLOCK_SIZE
+        + ((system->N % DEF_BLOCK_SIZE != 0) ? 1 : 0);
+
+    /* Hindex of every atom := its atom index */
+    k_setup_hindex <<< num_blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->N );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    /* start offsets: exclusive scan of d_max_hbonds */
+    Cuda_Scan_Excl_Sum( system->d_max_hbonds, offsets, system->total_cap );
+
+    /* write start/end indices (both zero for atoms that form no H-bonds) */
+    k_init_hbond_indices <<< num_blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp, system->d_hbonds, offsets,
+          hbond_list->index, hbond_list->end_index, system->N );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+/* Rebuild start/end indices of the bonds list post reallocation
+ *
+ * system: atomic system info. */
+void Cuda_Init_Bond_Indices( reax_system *system )
+{
+    reax_list * const bond_list = *dev_lists + BONDS;
+    /* one thread per atom, block count rounded up */
+    const int num_blocks = system->N / DEF_BLOCK_SIZE
+        + ((system->N % DEF_BLOCK_SIZE != 0) ? 1 : 0);
+
+    /* start indices: exclusive scan of d_max_bonds */
+    Cuda_Scan_Excl_Sum( system->d_max_bonds, bond_list->index, system->total_cap );
+
+    /* end indices: start index plus d_bonds */
+    k_init_end_index <<< num_blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_bonds, bond_list->index, bond_list->end_index, system->N );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+/* Rebuild row start/end indices of the charge matrix post reallocation
+ *
+ * system: atomic system info.
+ * H: charge matrix */
+void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H )
+{
+    /* one thread per row, block count rounded up */
+    const int num_blocks = system->N / DEF_BLOCK_SIZE
+        + ((system->N % DEF_BLOCK_SIZE != 0) ? 1 : 0);
+
+    /* row starts: exclusive scan of d_max_cm_entries */
+    Cuda_Scan_Excl_Sum( system->d_max_cm_entries, H->start, system->N );
+
+    /* row ends: row start plus d_cm_entries */
+    k_init_end_index <<< num_blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_cm_entries, H->start, H->end, system->N );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+/* Rebuild the start indices of the three body list post reallocation
+ *
+ * indices: input array for the exclusive scan
+ * entries: num. of elements to scan */
+void Cuda_Init_Three_Body_Indices( int *indices, int entries )
+{
+    /* only the start indices are written here, end indices stay untouched */
+    Cuda_Scan_Excl_Sum( indices,
+            (*dev_lists + THREE_BODIES)->index, entries );
+}
+
+
+CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, 
+        single_body_parameters *sbp, two_body_parameters *tbp,
+        control_params *control, reax_list far_nbrs, 
+        int num_atom_types, int n, int N, int total_cap, /* i < n: treated as local; i < N: has neighbors; i < total_cap: gets output */
+        int *cm_entries, int *max_cm_entries, /* out: counted / padded (SAFE_ZONE, MIN_CM_ENTRIES) */
+        int *bonds, int *max_bonds, /* out: counted / padded (x2, MIN_BONDS) */
+        int *hbonds, int *max_hbonds ) /* out: counted / padded (SAFE_ZONE, MIN_HBONDS) */
+{
+    int i, j, pj; 
+    int start_i, end_i;
+    int type_i, type_j;
+    int ihb, jhb;
+    int local;
+    int my_bonds, my_hbonds, my_cm_entries;
+    real cutoff;
+    real r_ij; 
+    real C12, C34, C56;
+    real BO, BO_s, BO_pi, BO_pi2;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    reax_atom *atom_i, *atom_j;
+    /* one thread per output slot: count charge matrix entries, hydrogen bonds and bonds of atom i */
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= total_cap )
+    {
+        return;
+    }
+
+    my_bonds = 0;
+    my_hbonds = 0;
+    my_cm_entries = 0;
+
+    if ( i < N ) /* slots in [N, total_cap) keep zero counts and only receive the minimum capacities */
+    {
+        atom_i = &(my_atoms[i]);
+        type_i = atom_i->type;
+        start_i = Dev_Start_Index( i, &far_nbrs );
+        end_i = Dev_End_Index( i, &far_nbrs );
+        sbp_i = &(sbp[type_i]);
+
+        if ( i < n )
+        { 
+            local = TRUE;
+            cutoff = control->nonb_cut;
+            ++my_cm_entries; /* diagonal entry of the charge matrix row */
+//            ihb = sbp_i->p_hbond;
+        }   
+        else
+        {
+            local = FALSE;
+            cutoff = control->bond_cut;
+//            ihb = NON_H_BONDING_ATOM; 
+        } 
+
+        ihb = NON_H_BONDING_ATOM; /* replaced by sbp_i->p_hbond once a neighbor within nonb_cut is seen */
+
+        for ( pj = start_i; pj < end_i; ++pj )
+        { 
+            nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+            j = nbr_pj->nbr;
+            atom_j = &(my_atoms[j]);
+
+            if ( nbr_pj->d <= control->nonb_cut )
+            {
+                type_j = my_atoms[j].type;
+                sbp_j = &(sbp[type_j]);
+                ihb = sbp_i->p_hbond;
+                jhb = sbp_j->p_hbond;
+
+                if ( local == TRUE ) /* same pair selection as the charge matrix fill in k_init_forces */
+                {
+                    if ( i < j && (j < n || atom_i->orig_id < atom_j->orig_id) )
+                    {
+                        ++my_cm_entries;
+                    }
+                    else if ( i > j && (j < n || atom_j->orig_id > atom_i->orig_id) )
+                    {
+                        ++my_cm_entries;
+                    }
+                }
+                else
+                {
+                    if ( i > j && j < n && atom_j->orig_id < atom_i->orig_id )
+                    {
+                        ++my_cm_entries;
+                    }
+                }
+
+                /* atom i: H bonding, ghost
+                 * atom j: H atom, native */
+                if ( control->hbond_cut > 0.0 && nbr_pj->d <= control->hbond_cut 
+                        && ihb == H_BONDING_ATOM && jhb == H_ATOM && i >= n && j < n )
+                {
+                    ++my_hbonds;
+                }
+
+//                if ( i >= n )
+//                {
+//                    ihb = NON_H_BONDING_ATOM;
+//                }
+            }
+
+            if ( nbr_pj->d <= cutoff ) /* cutoff: nonb_cut for local atoms, bond_cut otherwise */
+            {
+                type_j = my_atoms[j].type;
+                r_ij = nbr_pj->d;
+                sbp_j = &(sbp[type_j]);
+                twbp = &(tbp[ index_tbp(type_i ,type_j, num_atom_types) ]);
+
+                if ( local == TRUE )
+                {
+                    /* atom i: H atom OR H bonding atom, native */
+                    if ( control->hbond_cut > 0.0 && (ihb == H_ATOM || ihb == H_BONDING_ATOM) &&
+                            nbr_pj->d <= control->hbond_cut )
+                    {
+                        jhb = sbp_j->p_hbond;
+
+                        /* atom i: H atom, native
+                         * atom j: H bonding atom */
+                        if( ihb == H_ATOM && jhb == H_BONDING_ATOM )
+                        {
+                            ++my_hbonds;
+                        }
+                        /* atom i: H bonding atom, native
+                         * atom j: H atom, native */
+                        else if( ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n )
+                        {
+                            ++my_hbonds;
+                        }
+                    }
+                }
+
+                /* uncorrected bond orders */
+                if ( nbr_pj->d <= control->bond_cut )
+                {
+                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
+                    {
+                        C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                    }
+                    else
+                    {
+                        BO_s = C12 = 0.0;
+                    }
+
+                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
+                    {
+                        C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                        BO_pi = EXP( C34 );
+                    }
+                    else
+                    {
+                        BO_pi = C34 = 0.0;
+                    }
+
+                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
+                    {
+                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
+                        BO_pi2= EXP( C56 );
+                    }
+                    else
+                    {
+                        BO_pi2 = C56 = 0.0;
+                    }
+
+                    /* initially BO values are the uncorrected ones, page 1 */
+                    BO = BO_s + BO_pi + BO_pi2;
+
+                    if ( BO >= control->bo_cut ) /* only pairs reaching the bond order cutoff count as bonds */
+                    {
+                        ++my_bonds;
+                    }
+                }
+            }
+        }
+    }
+
+    bonds[i] = my_bonds;
+    max_bonds[i] = MAX( (int)(my_bonds * 2), MIN_BONDS );
+
+    hbonds[i] = my_hbonds;
+    max_hbonds[i] = MAX( (int)(my_hbonds * SAFE_ZONE), MIN_HBONDS );
+
+    cm_entries[i] = my_cm_entries;
+    max_cm_entries[i] = MAX( (int)(my_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES );
+}
+
+
+/* Count bonds, hydrogen bonds and charge matrix entries per atom on the
+ * device and copy the padded totals the caller asked for back to the host.
+ * system: atomic system info; per-atom counts and total_* are updated
+ * control: control parameters; hbond_cut is zeroed on host and device at
+ *  step 0 when there are no H atoms or the cutoff is not positive
+ * lists: unused, the global dev_lists is read instead
+ * realloc_bonds, realloc_hbonds, realloc_cm: TRUE to refresh that total
+ * step: simulation step */
+void Cuda_Estimate_Storages( reax_system *system, control_params *control, 
+        reax_list **lists, int realloc_bonds, int realloc_hbonds, int realloc_cm,
+        int step )
+{
+    const int blocks = system->total_cap / ST_BLOCK_SIZE + 
+        (((system->total_cap % ST_BLOCK_SIZE == 0)) ? 0 : 1);
+
+    k_estimate_storages <<< blocks, ST_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, 
+          (control_params *)control->d_control_params,
+          *(*dev_lists + FAR_NBRS), system->reax_param.num_atom_types,
+          system->n, system->N, system->total_cap,
+          system->d_cm_entries, system->d_max_cm_entries,
+          system->d_bonds, system->d_max_bonds, system->d_hbonds, system->d_max_hbonds );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    if ( realloc_bonds == TRUE )
+    {
+        Cuda_Reduction_Sum( system->d_max_bonds, system->d_total_bonds, system->total_cap );
+        copy_host_device( &(system->total_bonds), system->d_total_bonds, sizeof(int), 
+                cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_bonds" );
+    }
+
+    if ( system->numH > 0 && control->hbond_cut > 0.0 )
+    {
+        if ( realloc_hbonds == TRUE )
+        {
+            Cuda_Reduction_Sum( system->d_max_hbonds, system->d_total_hbonds, system->total_cap );
+            copy_host_device( &(system->total_hbonds), system->d_total_hbonds, sizeof(int), 
+                    cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_hbonds" );
+        }
+    }
+    else if ( step == 0 )
+    {
+#if defined(DEBUG)
+        if ( system->numH == 0 )
+        {
+            fprintf( stderr, "[INFO] DISABLING HYDROGEN BOND COMPUTATION: NO HYDROGEN ATOMS FOUND\n" );
+        }
+        if ( control->hbond_cut <= 0.0 )
+        {
+            fprintf( stderr, "[INFO] DISABLING HYDROGEN BOND COMPUTATION: BOND CUTOFF LENGTH IS ZERO\n" );
+        }
+#endif
+
+        control->hbond_cut = 0.0;
+        k_disable_hydrogen_bonding <<< 1, 1 >>> ( (control_params *)control->d_control_params );
+        /* report a failed launch here, as for every other kernel in this file */
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+    }
+
+    if ( realloc_cm == TRUE )
+    {
+        Cuda_Reduction_Sum( system->d_max_cm_entries, system->d_total_cm_entries, system->total_cap );
+        copy_host_device( &(system->total_cm_entries), system->d_total_cm_entries, sizeof(int),
+                cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_cm_entries" );
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "p:%d -->\n", system->my_rank );
+    fprintf( stderr, " TOTAL DEVICE BOND COUNT: %d \n", system->total_bonds );
+    fprintf( stderr, " TOTAL DEVICE HBOND COUNT: %d \n", system->total_hbonds );
+    fprintf( stderr, " TOTAL DEVICE SPARSE COUNT: %d \n", system->total_cm_entries );
+#endif
+}
+
+
+/* Count three body interactions per bond and check the list capacity.
+ *
+ * thbody: one counter per bond; zeroed here, then handed to the kernel
+ * returns: FAILURE (and flags a thbody realloc) if the list is too small */
+int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params *control, 
+        int step, reax_list **lists, int *thbody )
+{
+    reax_list * const thb_list = *dev_lists + THREE_BODIES;
+
+    cuda_memset( thbody, 0, system->total_bonds * sizeof(int), "scratch::thbody" );
+    Estimate_Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>>
+        ( system->d_my_atoms, (control_params *)control->d_control_params, 
+          *(*dev_lists + BONDS), system->n, system->N, thbody );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    Cuda_Reduction_Sum( thbody, system->d_total_thbodies, system->total_bonds );
+    copy_host_device( &(system->total_thbodies), system->d_total_thbodies, sizeof(int),
+            cudaMemcpyDeviceToHost, "Cuda_Estimate_Storage_Three_Body::d_total_thbodies" );
+
+    if ( step == 0 )
+    {
+        /* first step: create the list with a padded capacity */
+        system->total_thbodies = MAX( (int)(system->total_thbodies * SAFE_ZONE), MIN_3BODIES );
+        system->total_thbodies_indices = system->total_bonds;
+        Dev_Make_List( system->total_thbodies_indices, system->total_thbodies,
+                TYP_THREE_BODY, thb_list );
+    }
+
+    const int too_few_intrs = system->total_thbodies > thb_list->num_intrs;
+    const int too_few_indices = system->total_bonds > thb_list->n;
+
+    if ( too_few_intrs )
+    {
+        system->total_thbodies = MAX( (int)(thb_list->num_intrs * SAFE_ZONE),
+                system->total_thbodies );
+    }
+    if ( too_few_indices )
+    {
+        system->total_thbodies_indices = MAX( (int)(thb_list->n * SAFE_ZONE),
+                system->total_bonds );
+    }
+    if ( too_few_intrs || too_few_indices )
+    {
+        dev_workspace->realloc.thbody = TRUE;
+        return FAILURE;
+    }
+
+    return SUCCESS;
+}
+
+
+/* Charge matrix entry for a pair at distance r: the degree-7 polynomial in
+ * r with coefficients ctap[0..7], times EV_to_KCALpMOL, divided by the
+ * cube root of (r^3 + gamma). */
+CUDA_DEVICE real Compute_H( real r, real gamma, real *ctap )
+{
+    real poly, denom;
+
+    /* Horner evaluation, highest coefficient first */
+    poly = ctap[7];
+    for ( int k = 6; k >= 0; --k )
+    {
+        poly = poly * r + ctap[k];
+    }
+    denom = POW( r * r * r + gamma, 1.0 / 3.0 );
+
+    return poly * EV_to_KCALpMOL / denom;
+}
+
+
+/* Tabulated counterpart of Compute_H.
+ * Evaluates the cubic stored for the bin of r_ij in the lookup table of
+ * the type pair (smaller type index first) and scales the result by
+ * EV_to_KCALpMOL / C_ele. */
+CUDA_DEVICE real Compute_tabH( LR_lookup_table *t_LR, real r_ij, int ti, int tj, int num_atom_types )
+{
+    LR_lookup_table * const tab =
+        &( t_LR[ index_lr( MIN( ti, tj ), MAX( ti, tj ), num_atom_types ) ] );
+    int bin;
+    real base, dif, val;
+
+    /* cubic spline interpolation; bin 0 is remapped to bin 1 */
+    bin = (int)(r_ij * tab->inv_dx);
+    if ( bin == 0 )
+    {
+        bin = 1;
+    }
+    base = (real)(bin + 1) * tab->dx;
+    dif = r_ij - base;
+    val = ((tab->ele[bin].d * dif + tab->ele[bin].c) * dif + tab->ele[bin].b) * dif
+        + tab->ele[bin].a;
+
+    return val * (EV_to_KCALpMOL / C_ele);
+}
+
+
+/* Debug kernel: print the hydrogen bond type and the start of the hydrogen
+ * bond list slot of every atom (one thread per atom, N atoms).
+ * Atoms that own no slot, or all atoms if hbond_cut <= 0, report -1. */
+CUDA_GLOBAL void k_print_hbond_info( reax_atom *my_atoms, single_body_parameters *sbp, 
+        control_params *control, reax_list hbonds, int N )
+{
+    int i, ihb, ihb_top;
+    reax_atom *atom_i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= N )
+    {
+        return;
+    }
+
+    atom_i = &(my_atoms[i]);
+
+    /* defaults keep the printf below from reading uninitialized values
+     * when hydrogen bonding is disabled (same defaults as k_init_forces) */
+    ihb = NON_H_BONDING_ATOM;
+    ihb_top = -1;
+
+    if ( control->hbond_cut > 0.0 )
+    {
+        ihb = sbp[atom_i->type].p_hbond;
+
+        if ( ihb == H_ATOM || ihb == H_BONDING_ATOM )
+        {
+            ihb_top = Dev_Start_Index( atom_i->Hindex, &hbonds );
+        }
+    }
+
+    printf( "atom %6d: ihb = %2d, ihb_top = %2d\n", i, ihb, ihb_top );
+}
+
+
+/* Fill the charge matrix rows, the hydrogen bond list and the bond list
+ * from the far neighbors list; one thread per atom.
+ *
+ * my_atoms: atom info; num_bonds and num_hbonds are updated
+ * sbp, tbp: single body and two body parameters
+ * workspace: uses H, Tap, bond_mark, dDeltap_self and total_bond_order
+ * control: control parameters (cutoffs and tabulate flag are read)
+ * far_nbrs_list: far neighbors; if renbr is zero the stored distance
+ *  vectors and distances are recomputed from the atom positions
+ * bonds_list, hbonds_list: lists to fill, start indices must be set
+ * t_LR: lookup tables, only read if control->tabulate is non-zero
+ * n, N: atoms i < n are treated as local; N atoms are processed
+ * max_cm_entries, max_bonds, max_hbonds: per-atom capacities
+ * realloc_cm_entries, realloc_bonds, realloc_hbonds: set to TRUE if some
+ *  atom needed more than its capacity
+ * cm_entries, bonds, hbonds: not referenced by this kernel */
+CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp, 
+        two_body_parameters *tbp, storage workspace, control_params *control, 
+        reax_list far_nbrs_list, reax_list bonds_list, reax_list hbonds_list, 
+        LR_lookup_table *t_LR, int n, int N, int num_atom_types, int renbr,
+        int *cm_entries, int *max_cm_entries, int *realloc_cm_entries,
+        int *bonds, int *max_bonds, int *realloc_bonds,
+        int *hbonds, int *max_hbonds, int *realloc_hbonds )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int Htop, btop_i, ihb, jhb, ihb_top;
+    int my_bonds, my_hbonds, my_cm_entries;
+    int local, flag, flag2, flag3;
+    real r_ij, cutoff;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    reax_atom *atom_i, *atom_j;
+    sparse_matrix *H;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= N )
+    {
+        return;
+    }
+
+    H = &(workspace.H);
+    Htop = H->start[i];
+
+    atom_i = &(my_atoms[i]);
+    type_i = atom_i->type;
+    start_i = Dev_Start_Index( i, &far_nbrs_list );
+    end_i = Dev_End_Index( i, &far_nbrs_list );
+    btop_i = Dev_Start_Index( i, &bonds_list );
+    sbp_i = &(sbp[type_i]);
+
+    if ( i < n )
+    {
+        local = TRUE;
+        cutoff = control->nonb_cut;
+
+        //update bond mark here
+        workspace.bond_mark[i] = 0;
+    }
+    else
+    {
+        local = FALSE;
+        cutoff = control->bond_cut;
+
+        //update bond mark here
+        workspace.bond_mark[i] = 1000;
+    }
+
+    ihb = NON_H_BONDING_ATOM;
+    ihb_top = -1;
+
+    /* diagonal entry of the charge matrix row (local atoms only) */
+    if ( local == TRUE )
+    {
+        H->entries[Htop].j = i;
+        H->entries[Htop].val = sbp_i->eta;
+        ++Htop;
+    }
+
+    /* ihb_top is the write position inside this atom's hydrogen bond slot;
+     * it stays -1 for atoms that own no slot or if H-bonds are disabled */
+    if ( control->hbond_cut > 0.0 )
+    {
+        ihb = sbp_i->p_hbond;
+
+        if ( ihb == H_ATOM || ihb == H_BONDING_ATOM )
+        {
+            ihb_top = Dev_Start_Index( atom_i->Hindex, &hbonds_list );
+        }
+        else
+        {
+            ihb_top = -1;
+        }
+    }
+
+    /* update i-j distance - check if j is within cutoff */
+    for ( pj = start_i; pj < end_i; ++pj )
+    {
+        nbr_pj = &( far_nbrs_list.select.far_nbr_list[pj] );
+        j = nbr_pj->nbr;
+        atom_j = &(my_atoms[j]);
+
+        if ( renbr )
+        {
+            /* the stored distance is used as is, the pair is only classified */
+            if ( nbr_pj->d <= cutoff )
+            {
+                flag = TRUE;
+            }
+            else
+            {
+                flag = FALSE;
+            }
+
+            if ( nbr_pj->d <= control->nonb_cut )
+            {
+                flag2 = TRUE;
+            }
+            else
+            {
+                flag2 = FALSE;
+            }
+
+        }
+        else
+        {
+            /* refresh the distance vector from the current positions;
+             * d holds the squared distance until the SQRT further down */
+            if ( i < j )
+            {
+                nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
+                nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
+                nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
+                nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
+            }
+            else
+            {
+                nbr_pj->dvec[0] = atom_i->x[0] - atom_j->x[0];
+                nbr_pj->dvec[1] = atom_i->x[1] - atom_j->x[1];
+                nbr_pj->dvec[2] = atom_i->x[2] - atom_j->x[2];
+                nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
+            }
+
+            if ( nbr_pj->d <= SQR( control->nonb_cut ) )
+            {
+                flag2 = TRUE;
+            }
+            else
+            {
+                flag2 = FALSE;
+            }
+
+            /* nonb_cut (not cutoff) is tested here, so d is converted to a
+             * distance whenever flag2 is set; the bond_cut test further
+             * down still limits the bonds of non-local atoms */
+            if ( nbr_pj->d <= SQR( control->nonb_cut ) )
+            {
+                nbr_pj->d = SQRT( nbr_pj->d );
+                flag = TRUE;
+            }
+            else
+            {
+                flag = FALSE;
+            }
+        }
+        if ( flag2 == TRUE )
+        {
+            /* pair within the nonbonded cutoff */
+            type_j = atom_j->type;
+            sbp_j = &(sbp[type_j]);
+            ihb = sbp_i->p_hbond;
+            jhb = sbp_j->p_hbond;
+
+            /* atom i: H bonding, ghost
+             * atom j: H atom, native */
+            if ( control->hbond_cut > 0.0 && nbr_pj->d <= control->hbond_cut
+                    && ihb == H_BONDING_ATOM && jhb == H_ATOM && i >= n && j < n ) 
+            {
+                hbonds_list.select.hbond_list[ihb_top].nbr = j;
+                hbonds_list.select.hbond_list[ihb_top].scl = -1;
+                hbonds_list.select.hbond_list[ihb_top].ptr = nbr_pj;
+
+                //CUDA SPECIFIC
+                hbonds_list.select.hbond_list[ihb_top].sym_index = -1;
+                rvec_MakeZero( hbonds_list.select.hbond_list[ihb_top].hb_f );
+
+                ++ihb_top;
+            }
+
+            /* flag3: does this thread store the charge matrix entry of the
+             * pair (i, j)? */
+            flag3 = FALSE;
+            if ( i < j && i < n && (j < n || atom_i->orig_id < atom_j->orig_id) )
+            {
+                flag3 = TRUE;
+            }
+            else if ( i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id )
+            {
+                flag3 = TRUE;
+            }
+            else if ( i > j && i < n && (j < n || atom_j->orig_id < atom_i->orig_id ) )
+            {
+                flag3 = TRUE;
+            }
+
+            if ( flag3 == TRUE )
+            {
+                twbp = &(tbp[ index_tbp(type_i,type_j,num_atom_types) ]);
+                r_ij = nbr_pj->d;
+
+                H->entries[Htop].j = j;
+                if ( control->tabulate == 0 )
+                {
+                    H->entries[Htop].val = Compute_H( r_ij,twbp->gamma,workspace.Tap );
+                }
+                else
+                {
+                    H->entries[Htop].val = Compute_tabH( t_LR, r_ij, type_i, type_j,num_atom_types );
+                }
+                ++Htop;
+            }
+        }
+
+        if ( flag == TRUE )
+        {
+            type_j = atom_j->type;
+            r_ij = nbr_pj->d;
+            sbp_j = &(sbp[type_j]);
+            twbp = &(tbp[ index_tbp(type_i, type_j, num_atom_types) ]);
+
+            if ( local == TRUE )
+            {
+                /* hydrogen bond lists */
+                if ( control->hbond_cut > 0.0 && (ihb == H_ATOM || ihb == H_BONDING_ATOM) &&
+                        nbr_pj->d <= control->hbond_cut )
+                {
+                    jhb = sbp_j->p_hbond;
+
+                    /* atom i: H atom, native
+                     * atom j: H bonding atom */
+                    if ( ihb == H_ATOM && jhb == H_BONDING_ATOM )
+                    {
+                        hbonds_list.select.hbond_list[ihb_top].nbr = j;
+
+                        if ( i < j )
+                        {
+                            hbonds_list.select.hbond_list[ihb_top].scl = 1;
+                        }
+                        else
+                        {
+                            hbonds_list.select.hbond_list[ihb_top].scl = -1;
+                        }
+                        hbonds_list.select.hbond_list[ihb_top].ptr = nbr_pj;
+
+                        //CUDA SPECIFIC
+                        hbonds_list.select.hbond_list[ihb_top].sym_index = -1;
+                        rvec_MakeZero( hbonds_list.select.hbond_list[ihb_top].hb_f );
+
+                        ++ihb_top;
+                    }
+                    /* atom i: H bonding atom, native
+                     * atom j: H atom, native */
+                    else if ( ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n )
+                    {
+                        hbonds_list.select.hbond_list[ihb_top].nbr = j;
+                        hbonds_list.select.hbond_list[ihb_top].scl = -1;
+                        hbonds_list.select.hbond_list[ihb_top].ptr = nbr_pj;
+
+                        //CUDA SPECIFIC
+                        hbonds_list.select.hbond_list[ihb_top].sym_index = -1;
+                        rvec_MakeZero( hbonds_list.select.hbond_list[ihb_top].hb_f );
+
+                        ++ihb_top;
+                    }
+                }
+            }
+
+            /* uncorrected bond orders */
+            if ( nbr_pj->d <= control->bond_cut &&
+                    Dev_BOp( bonds_list, control->bo_cut, i, btop_i, nbr_pj,
+                        sbp_i, sbp_j, twbp, workspace.dDeltap_self,
+                        workspace.total_bond_order ) == TRUE )
+            {
+                ++btop_i;
+
+                /* TODO: Need to do later... since i and j are parallel:
+                 * lower bond_mark[j] to bond_mark[i] + 1 (or vice versa)
+                 * whenever one exceeds the other by more than one */
+            }
+        }
+    }
+
+    Dev_Set_End_Index( i, btop_i, &bonds_list );
+    H->end[i] = Htop;
+    /* ihb_top >= 0 (not > 0): the slot that starts at offset 0 must also
+     * get its end index written, otherwise an empty slot would keep the
+     * end index left over from the storage estimate */
+    if ( control->hbond_cut > 0.0 && ihb_top >= 0 && (ihb == H_ATOM || ihb == H_BONDING_ATOM) )
+    {
+        Dev_Set_End_Index( atom_i->Hindex, ihb_top, &hbonds_list );
+    }
+
+    my_bonds = btop_i - Dev_Start_Index( i, &bonds_list );
+    /* atoms without a hydrogen bond slot (ihb_top < 0) own no entries;
+     * the hydrogen bond indices are not read for them */
+    my_hbonds = (ihb_top >= 0) ?
+        ihb_top - Dev_Start_Index( atom_i->Hindex, &hbonds_list ) : 0;
+    my_cm_entries = Htop - H->start[i];
+
+    /* copy (h)bond info to atom structure
+     * (needed for atom ownership transfer via MPI) */
+    my_atoms[i].num_bonds = my_bonds;
+    my_atoms[i].num_hbonds = my_hbonds;
+
+    /* reallocation checks */
+    if ( my_bonds > max_bonds[i] )
+    {
+        *realloc_bonds = TRUE;
+    }
+
+    if ( my_hbonds > max_hbonds[i] )
+    {
+        *realloc_hbonds = TRUE;
+    }
+
+    if ( my_cm_entries > max_cm_entries[i] )
+    {
+        *realloc_cm_entries = TRUE;
+    }
+}
+
+
+/* Set bond_mark[offset] .. bond_mark[offset + n - 1] to 1000; one thread per element. */
+CUDA_GLOBAL void k_init_bond_mark( int offset, int n, int *bond_mark )
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+    /* global thread id, not threadIdx.x: each block must cover its own part of the range */
+    bond_mark[offset + i] = 1000;
+}
+
+
+/* Link the two entries of every bond: for each bond i -> nbr with i > nbr,
+ * find the entry nbr -> i and store
+ *   dbond_index = position of i's entry (in both entries),
+ *   sym_index   = position of the partner entry.
+ *
+ * pbonds: bond list
+ * N: num. of atoms, one thread per atom */
+CUDA_GLOBAL void New_fix_sym_dbond_indices( reax_list pbonds, int N )
+{
+    int pj, pk, nbr;
+    bond_data *bond_ij, *bond_ji;
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= N )
+    {
+        return;
+    }
+
+    for ( pj = Dev_Start_Index(i, &pbonds); pj < Dev_End_Index(i, &pbonds); ++pj )
+    {
+        bond_ij = &( pbonds.select.bond_list[pj] );
+        nbr = bond_ij->nbr;
+
+        for ( pk = Dev_Start_Index(nbr, &pbonds); pk < Dev_End_Index(nbr, &pbonds); ++pk )
+        {
+            bond_ji = &( pbonds.select.bond_list[pk] );
+
+            /* a pair is only written from the side of the larger atom
+             * index; the scan continues after a match */
+            if ( bond_ji->nbr == i && i > nbr )
+            {
+                bond_ij->dbond_index = pj;
+                bond_ji->dbond_index = pj;
+
+                bond_ij->sym_index = pk;
+                bond_ji->sym_index = pj;
+            }
+        }
+    }
+}
+
+
+/* Pair up the two entries of every hydrogen bond: each entry gets the
+ * position of the entry pointing back at it as sym_index.
+ * HB_KER_SYM_THREADS_PER_ATOM threads (assumed to be a power of two by
+ * the lane mask below) share the entries of one of the N atoms. */
+CUDA_GLOBAL void New_fix_sym_hbond_indices( reax_atom *my_atoms, reax_list hbonds, int N )
+{
+    int i, j, k;
+    int nbr, nbrstart, nbrend;
+    int start, end;
+    hbond_data *ihbond, *jhbond;
+    int threads_per_atom, thread_id, lane_id;
+
+    threads_per_atom = HB_KER_SYM_THREADS_PER_ATOM;
+    thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    i = thread_id / threads_per_atom;
+
+    /* valid atoms are 0 .. N-1, so index N itself is already out of range */
+    if ( i >= N )
+    {
+        return;
+    }
+
+    lane_id = thread_id & (threads_per_atom - 1);
+    start = Dev_Start_Index( my_atoms[i].Hindex, &hbonds );
+    end = Dev_End_Index( my_atoms[i].Hindex, &hbonds );
+    j = start + lane_id;
+
+    while ( j < end )
+    {
+        ihbond = &( hbonds.select.hbond_list[j] );
+        nbr = ihbond->nbr;
+        nbrstart = Dev_Start_Index( my_atoms[nbr].Hindex, &hbonds );
+        nbrend = Dev_End_Index( my_atoms[nbr].Hindex, &hbonds );
+
+        for ( k = nbrstart; k < nbrend; k++ )
+        {
+            jhbond = &( hbonds.select.hbond_list[k] );
+
+            if ( jhbond->nbr == i )
+            {
+                ihbond->sym_index = k;
+                jhbond->sym_index = j;
+                break;
+            }
+        }
+
+        j += threads_per_atom;
+    }
+}
+
+
+/* Refresh my_atoms[i].num_bonds of the first n atoms from the number of
+ * entries currently held in the atom's bond list slot; one thread per atom. */
+CUDA_GLOBAL void k_update_bonds( reax_atom *my_atoms, reax_list bonds, int n )
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( tid < n )
+    {
+        reax_atom * const atom = &my_atoms[tid];
+
+        atom->num_bonds = Dev_Num_Entries( tid, &bonds );
+    }
+}
+
+
+/* One thread per atom: record in my_atoms[i].num_hbonds the number of
+ * entries in the hbond list row selected by the atom's Hindex.
+ *
+ * my_atoms: per-atom data, updated in place
+ * hbonds: hydrogen bond list
+ * n: number of atoms to update */
+CUDA_GLOBAL void k_update_hbonds( reax_atom *my_atoms, reax_list hbonds, int n )
+{
+    const int atom = blockIdx.x * blockDim.x + threadIdx.x;
+
+    /* threads past the last atom have nothing to do */
+    if ( atom < n )
+    {
+        /* the hbond list is indexed by Hindex rather than by the atom index */
+        const int row = my_atoms[atom].Hindex;
+
+        my_atoms[atom].num_hbonds = Dev_Num_Entries( row, &hbonds );
+    }
+}
+
+
+#if defined(DEBUG)
+/* Debug kernel, one thread per atom: print the atom's orig_id followed by
+ * the three components of its force vector.
+ *
+ * my_atoms: per-atom data (only orig_id is read)
+ * f: per-atom force vectors
+ * n: number of atoms to print */
+CUDA_GLOBAL void k_print_forces( reax_atom *my_atoms, rvec *f, int n )
+{
+    const int atom = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( atom < n )
+    {
+        printf( "%8d: %24.15f, %24.15f, %24.15f\n",
+                my_atoms[atom].orig_id,
+                f[atom][0], f[atom][1], f[atom][2] );
+    }
+}
+
+
+/* Debug helper: launch k_print_forces over the first system->n atoms and
+ * wait for the kernel to finish. */
+static void Print_Forces( reax_system *system )
+{
+    int blocks;
+
+    /* number of DEF_BLOCK_SIZE-thread blocks needed to cover system->n atoms */
+    blocks = system->n / DEF_BLOCK_SIZE;
+    if ( (system->n % DEF_BLOCK_SIZE) != 0 )
+    {
+        ++blocks;
+    }
+
+    k_print_forces <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, dev_workspace->f, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+/* Debug kernel, one thread per atom: for every entry in the atom's row of
+ * the hbond list, print the rank, the step, the atom's Hindex, the
+ * neighbor index and the three components of the entry's hb_f vector.
+ *
+ * my_atoms: per-atom data (only Hindex is read)
+ * p_hbonds: hydrogen bond list
+ * n: number of atoms to print
+ * rank, step: values echoed in front of every printed line */
+CUDA_GLOBAL void k_print_hbonds( reax_atom *my_atoms, reax_list p_hbonds, int n, int rank, int step )
+{
+    int atom, pos, row_start, row_end;
+    reax_list *hbonds;
+    hbond_data *entry;
+
+    atom = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( atom < n )
+    {
+        hbonds = &( p_hbonds );
+        row_start = Dev_Start_Index( my_atoms[atom].Hindex, hbonds );
+        row_end = Dev_End_Index( my_atoms[atom].Hindex, hbonds );
+
+        for ( pos = row_start; pos < row_end; ++pos )
+        {
+            entry = &( hbonds->select.hbond_list[pos] );
+
+            printf( "p%03d, step %05d: %8d: %8d, %24.15f, %24.15f, %24.15f\n",
+                    rank, step, my_atoms[atom].Hindex, entry->nbr,
+                    entry->hb_f[0], entry->hb_f[1], entry->hb_f[2] );
+        }
+    }
+}
+
+
+/* Debug helper: launch k_print_hbonds over the first system->n atoms for
+ * the given step and wait for the kernel to finish. */
+static void Print_HBonds( reax_system *system, int step )
+{
+    int blocks;
+
+    /* number of DEF_BLOCK_SIZE-thread blocks needed to cover system->n atoms */
+    blocks = system->n / DEF_BLOCK_SIZE;
+    if ( (system->n % DEF_BLOCK_SIZE) != 0 )
+    {
+        ++blocks;
+    }
+
+    k_print_hbonds <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, *(*dev_lists + HBONDS), system->n, system->my_rank, step );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+#endif
+
+
+/* One thread per atom: walk the atom's row of the far neighbor list.
+ *
+ * The loop body is entirely commented out, so as written the kernel
+ * modifies nothing; my_atoms, bonds and total_bond_order are unused.
+ * Its only visible launch site (in Cuda_Init_Forces) is commented out too.
+ *
+ * my_atoms: per-atom data (unused)
+ * far_nbrs: far neighbor list whose rows are traversed
+ * bonds: bond list (unused)
+ * total_bond_order: per-atom array (unused)
+ * N: number of atoms to process */
+CUDA_GLOBAL void k_init_bond_orders( reax_atom *my_atoms, reax_list far_nbrs, 
+        reax_list bonds, real *total_bond_order, int N )
+{
+    int i, pj; 
+//    int j; 
+    int start_i, end_i;
+//    far_neighbor_data *nbr_pj;
+//    reax_atom *atom_i, *atom_j;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    /* threads past the last atom have nothing to do */
+    if ( i >= N )
+    {
+        return;
+    }
+
+//    atom_i = &(my_atoms[i]);
+    start_i = Dev_Start_Index(i, &far_nbrs);
+    end_i = Dev_End_Index(i, &far_nbrs);
+
+    /* empty traversal: the per-neighbor work below is disabled */
+    for( pj = start_i; pj < end_i; ++pj )
+    { 
+//        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+//        j = nbr_pj->nbr;
+//        atom_j = &(my_atoms[j]);
+//
+//        total_bond_order[i]++;
+//        atom_i->Hindex++;
+    }
+}
+
+
+/* Relax workspace->bond_mark along the bond list: for every bond (i, j)
+ * with i < j, whichever endpoint has a mark more than one above the
+ * other's is lowered to the other's mark plus one.
+ *
+ * The per-thread indexing is commented out, so every thread that runs this
+ * kernel performs the whole serial sweep over atoms 0 to N - 1; the
+ * result depends on the sweep order. The launch sites visible in
+ * Cuda_Init_Forces are commented out.
+ *
+ * p_bonds: bond list
+ * p_workspace: workspace whose bond_mark array is updated
+ * N: number of atoms to sweep */
+CUDA_GLOBAL void k_bond_mark( reax_list p_bonds, storage p_workspace, int N )
+{
+    int i, j, k;
+    reax_list *bonds = &( p_bonds );
+    storage *workspace = &( p_workspace );
+
+//    int i = blockIdx.x * blockDim.x + threadIdx.x;
+//    if ( i >= N )
+//    {
+//        return;
+//    }
+
+    for ( i = 0; i < N; i++ )
+    {
+        for (k = Dev_Start_Index(i, bonds); k < Dev_End_Index(i, bonds); k++)
+        {
+            bond_data *bdata = &( bonds->select.bond_list[k] );
+            j = bdata->nbr;
+
+            /* handle each bond once, from its lower-indexed endpoint */
+            if (i < j )
+            {
+                if ( workspace->bond_mark[j] > (workspace->bond_mark[i] + 1) )
+                {
+                    workspace->bond_mark[j] = workspace->bond_mark[i] + 1;    
+                }
+                else if ( workspace->bond_mark[i] > (workspace->bond_mark[j] + 1) )
+                {
+                    workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
+                }
+            }
+        }
+    }
+}
+
+
+/* Run the force initialization kernel and, when the preallocated lists were
+ * large enough, make the bond and hydrogen bond lists symmetric.
+ *
+ * Steps:
+ *  1. clear the three reallocation flags on the device;
+ *  2. launch k_init_forces over system->N atoms;
+ *  3. copy the flags back: if none is set, fix the sym_index / dbond_index
+ *     fields of the bond list and (when hbond_cut > 0 and numH > 0) the
+ *     sym_index fields of the hbond list; otherwise call
+ *     Cuda_Estimate_Storages and record the flags in dev_workspace->realloc.
+ *
+ * system, control, data: simulation state read by the kernels
+ * workspace, lists, out_control: not used by this function (the dev_workspace
+ *   and dev_lists globals are used instead)
+ *
+ * returns: SUCCESS if no reallocation flag was raised, FAILURE otherwise */
+int Cuda_Init_Forces( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control ) 
+{
+    int ret, ret_bonds, ret_hbonds, ret_cm;
+    int blocks, hblocks;
+
+    /* init the workspace (bond_mark) */
+//    cuda_memset( dev_workspace->bond_mark, 0, sizeof(int) * system->n, "bond_mark" );
+//
+//    blocks = (system->N - system->n) / DEF_BLOCK_SIZE + 
+//       (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+//    k_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
+//       ( system->n, (system->N - system->n), dev_workspace->bond_mark );
+//    cudaThreadSynchronize( );
+//    cudaCheckError( );
+
+    /* main kernel: one thread per atom, DEF_BLOCK_SIZE threads per block */
+    blocks = (system->N) / DEF_BLOCK_SIZE + 
+        (((system->N % DEF_BLOCK_SIZE) == 0) ? 0 : 1);
+
+//    k_init_bond_orders <<< blocks, DEF_BLOCK_SIZE >>>
+//        ( system->d_my_atoms, *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS),
+//          dev_workspace->total_bond_order, system->N );
+//    cudaThreadSynchronize( );
+//    cudaCheckError( );
+//
+//    k_print_hbond_info <<< blocks, DEF_BLOCK_SIZE >>>
+//        ( system->d_my_atoms, system->reax_param.d_sbp,
+//          (control_params *)control->d_control_params,
+//          *(*dev_lists + HBONDS), system->N );
+//    cudaThreadSynchronize( );
+//    cudaCheckError( );
+
+    /* reset reallocation flags on device */
+    cuda_memset( system->d_realloc_bonds, FALSE, sizeof(int), 
+            "Cuda_Init_Forces::d_realloc_bonds" );
+    cuda_memset( system->d_realloc_hbonds, FALSE, sizeof(int), 
+            "Cuda_Init_Forces::d_realloc_hbonds" );
+    cuda_memset( system->d_realloc_cm_entries, FALSE, sizeof(int), 
+            "Cuda_Init_Forces::d_realloc_cm_entries" );
+
+    k_init_forces <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp,
+          system->reax_param.d_tbp, *dev_workspace,
+          (control_params *)control->d_control_params,
+          *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS),
+          *(*dev_lists + HBONDS), d_LR, system->n,
+          system->N, system->reax_param.num_atom_types,
+          (((data->step-data->prev_steps) % control->reneighbor) == 0),
+          system->d_cm_entries, system->d_max_cm_entries, system->d_realloc_cm_entries,
+          system->d_bonds, system->d_max_bonds, system->d_realloc_bonds,
+          system->d_hbonds, system->d_max_hbonds, system->d_realloc_hbonds );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    /* check reallocation flags on device */
+    copy_host_device( &ret_bonds, system->d_realloc_bonds, sizeof(int), 
+            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_bonds" );
+    copy_host_device( &ret_hbonds, system->d_realloc_hbonds, sizeof(int), 
+            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_hbonds" );
+    copy_host_device( &ret_cm, system->d_realloc_cm_entries, sizeof(int), 
+            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_cm_entries" );
+
+    ret = (ret_bonds == FALSE && ret_hbonds == FALSE && ret_cm == FALSE) ? SUCCESS : FAILURE;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "[INFO] p%d, step %d: ret = %d, ret_bonds = %d, ret_hbonds = %d, ret_cm = %d\n",
+            system->my_rank, data->step, ret, ret_bonds, ret_hbonds, ret_cm );
+#endif
+
+    if ( ret == SUCCESS )
+    {
+        /* fix sym_index and dbond_index; blocks was sized for DEF_BLOCK_SIZE
+         * threads per block, so the launch must use the same block size */
+        New_fix_sym_dbond_indices <<< blocks, DEF_BLOCK_SIZE >>> 
+            ( *(*dev_lists + BONDS), system->N );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+        if ( control->hbond_cut > 0 && system->numH > 0 )
+        {
+            /* make hbond_list symmetric: HB_KER_SYM_THREADS_PER_ATOM threads
+             * per atom, HB_SYM_BLOCK_SIZE threads per block */
+            hblocks = (system->N * HB_KER_SYM_THREADS_PER_ATOM / HB_SYM_BLOCK_SIZE) + 
+                ((((system->N * HB_KER_SYM_THREADS_PER_ATOM) % HB_SYM_BLOCK_SIZE) == 0) ? 0 : 1);
+
+            /* launch with the block size hblocks was computed for */
+            New_fix_sym_hbond_indices <<< hblocks, HB_SYM_BLOCK_SIZE >>>
+                ( system->d_my_atoms, *(*dev_lists + HBONDS), system->N );
+            cudaThreadSynchronize( );
+            cudaCheckError( );
+        }
+
+        /* update bond_mark */
+//        k_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
+//        k_bond_mark <<< 1, 1 >>>
+//            ( *(*dev_lists + BONDS), *dev_workspace, system->N );
+//        cudaThreadSynchronize( );
+//        cudaCheckError( );
+    }
+    else
+    {
+        /* at least one list overflowed: re-estimate storage and remember
+         * which lists were flagged */
+        Cuda_Estimate_Storages( system, control, dev_lists,
+               ret_bonds, ret_hbonds, ret_cm, data->step );
+
+        dev_workspace->realloc.bonds = ret_bonds;
+        dev_workspace->realloc.hbonds = ret_hbonds;
+        dev_workspace->realloc.cm = ret_cm;
+    }
+
+    return ret;
+}
+
+
+/* Placeholder for force initialization on steps without a charge update.
+ *
+ * Not implemented: every argument is ignored and FAILURE is always
+ * returned, so a caller that selects this path cannot obtain SUCCESS. */
+int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control ) 
+{
+    //TODO: implement later when figure out bond_mark usage
+    return FAILURE;
+}
+
+
+/* Launch the bonded interaction kernels for the current step.
+ *
+ * The work is split in two parts so the function can be re-entered after
+ * a storage failure:
+ *   part 1 (stages 1-3): bond orders, bond energy, atom energy; runs only
+ *     while the static flag compute_bonded_part1 is FALSE, then sets it;
+ *   part 2 (stages 4-6): three-body storage estimate, valence angles,
+ *     torsion angles and, when hbond_cut > 0 and numH > 0, hydrogen bonds.
+ * When Cuda_Estimate_Storage_Three_Body does not return SUCCESS, part 2 is
+ * skipped and the flag stays TRUE, so the next call resumes at stage 4
+ * without repeating part 1. The flag is cleared once part 2 completes.
+ *
+ * Per-atom energy contributions are written to the scratch buffer and
+ * reduced into the device-side simulation_data only when update_energy is
+ * TRUE (energy_update_freq > 0 and step divisible by it).
+ *
+ * workspace and lists are only read by DEBUG output; the kernels use the
+ * dev_workspace and dev_lists globals.
+ *
+ * returns: SUCCESS, or the value returned by
+ *   Cuda_Estimate_Storage_Three_Body when that is not SUCCESS */
+int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, 
+        simulation_data *data, storage *workspace, 
+        reax_list **lists, output_controls *out_control )
+{
+    int update_energy, ret;
+//    int hbs, hnbrs_blocks;
+    int *thbody;
+    /* TRUE while stages 1-3 are done but stages 4-6 are still pending */
+    static int compute_bonded_part1 = FALSE;
+    real *spad = (real *) scratch;
+    rvec *rvec_spad;
+#if defined(DEBUG)
+    real t_start, t_elapsed;
+#endif
+
+    update_energy = (out_control->energy_update_freq > 0
+            && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
+    ret = SUCCESS;
+
+    if ( compute_bonded_part1 == FALSE )
+    {
+        /* 1. Bond Order Interactions */
+#if defined(DEBUG)
+        t_start = Get_Time( );
+
+        fprintf( stderr, " Begin Bonded Forces ... %d x %d\n",
+                BLOCKS_N, BLOCK_SIZE );
+#endif
+
+        Cuda_Calculate_BO_init <<< BLOCKS_N, BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->reax_param.d_sbp, 
+              *dev_workspace, system->N );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+        Cuda_Calculate_BO <<< BLOCKS_N, BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
+              system->reax_param.d_tbp, *dev_workspace, 
+              *(*dev_lists + BONDS),
+              system->reax_param.num_atom_types, system->N );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+        Cuda_Update_Uncorrected_BO <<< BLOCKS_N, BLOCK_SIZE >>>
+            ( *dev_workspace, *(*dev_lists + BONDS), system->N );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+        Cuda_Update_Workspace_After_BO <<< BLOCKS_N, BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
+             *dev_workspace, system->N );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+#if defined(DEBUG)
+        t_elapsed = Get_Timing_Info( t_start );
+
+        fprintf( stderr, "Bond Orders... return value --> %d --- Timing %lf \n",
+                cudaGetLastError( ), t_elapsed );
+        fprintf( stderr, "Cuda_Calculate_Bond_Orders Done... \n" );
+#endif
+
+        /* 2. Bond Energy Interactions */
+#if defined(DEBUG)
+        t_start = Get_Time( );
+#endif
+
+        cuda_memset( spad, 0, system->N * (2 * sizeof(real)) , "scratch" );
+
+        Cuda_Bonds <<< BLOCKS, BLOCK_SIZE, sizeof(real)* BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp,
+              *dev_workspace, *(*dev_lists + BONDS), 
+              system->n, system->reax_param.num_atom_types, spad );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+        /* reduction for E_BE */
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_bond,
+                    system->n );
+        }
+
+#if defined(DEBUG)
+        t_elapsed = Get_Timing_Info( t_start );
+
+        fprintf( stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n",
+                cudaGetLastError( ), t_elapsed );
+        fprintf( stderr, "Cuda_Bond_Energy Done... \n" );
+#endif
+
+        /* 3. Atom Energy Interactions */
+#if defined(DEBUG)
+        t_start = Get_Time( );
+#endif
+
+        cuda_memset( spad, 0, ( 6 * sizeof(real) * system->n ), "scratch" );
+
+        /* scratch layout: E_Lp at spad, E_Ov at spad + 2n, E_Un at spad + 4n */
+        Cuda_Atom_Energy <<< BLOCKS, BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->reax_param.d_gp,
+              system->reax_param.d_sbp, system->reax_param.d_tbp, *dev_workspace,
+              *(*dev_lists + BONDS), system->n, system->reax_param.num_atom_types,
+              spad, spad + 2 * system->n, spad + 4 * system->n);
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+//        Cuda_Atom_Energy_PostProcess <<< BLOCKS, BLOCK_SIZE >>>
+//            ( *(*dev_lists + BONDS), *dev_workspace, system->n );
+        Cuda_Atom_Energy_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>>
+            ( *(*dev_lists + BONDS), *dev_workspace, system->N );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+        /* reduction for E_Lp */
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_lp,
+                    system->n );
+        }
+
+        /* reduction for E_Ov */
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad + 2 * system->n,
+                    &((simulation_data *)data->d_simulation_data)->my_en.e_ov,
+                    system->n );
+        }
+
+        /* reduction for E_Un: must target e_un; it previously wrote into
+         * e_ov, overwriting the E_Ov sum and leaving e_un unset */
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad + 4 * system->n,
+                    &((simulation_data *)data->d_simulation_data)->my_en.e_un,
+                    system->n );
+        }
+
+#if defined(DEBUG)
+        t_elapsed = Get_Timing_Info( t_start );
+
+        fprintf( stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n",
+                cudaGetLastError( ), t_elapsed );
+        fprintf( stderr, "test_LonePair_postprocess Done... \n");
+#endif
+
+        compute_bonded_part1 = TRUE;
+    }
+
+    /* 4. Valence Angles Interactions */
+#if defined(DEBUG)
+    t_start = Get_Time( );
+#endif
+
+    /* thbody aliases the same scratch buffer as spad */
+    thbody = (int *) scratch;
+    ret = Cuda_Estimate_Storage_Three_Body( system, control, data->step,
+            dev_lists, thbody );
+
+#if defined(DEBUG)
+    fprintf( stderr, "system->total_thbodies = %d, lists:THREE_BODIES->num_intrs = %d,\n",
+            system->total_thbodies, (*lists + THREE_BODIES)->num_intrs );
+    fprintf( stderr, "lists:THREE_BODIES->n = %d, lists:BONDS->num_intrs = %d,\n",
+            (*lists + THREE_BODIES)->n, (*lists + BONDS)->num_intrs );
+    fprintf( stderr, "system->total_thbodies = %d\n", system->total_thbodies );
+#endif
+
+    if ( ret == SUCCESS )
+    {
+        Cuda_Init_Three_Body_Indices( thbody, system->total_thbodies_indices );
+
+        cuda_memset( spad, 0, 6 * sizeof(real) * system->N + sizeof(rvec) * system->N * 2, "scratch" );
+
+        /* scratch layout: E_Ang at spad, E_Pen at spad + 2N, E_Coa at
+         * spad + 4N, ext_press contributions from spad + 6N */
+        Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->reax_param.d_gp, 
+              system->reax_param.d_sbp, system->reax_param.d_thbp, 
+              (control_params *)control->d_control_params,
+              *dev_workspace, *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES),
+              system->n, system->N, system->reax_param.num_atom_types, 
+              spad, spad + 2 * system->N, spad + 4 * system->N, (rvec *)(spad + 6 * system->N) );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+        /* reduction for E_Ang */
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_ang,
+                    system->N );
+        }
+
+        /* reduction for E_Pen */
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad + 2 * system->N,
+                    &((simulation_data *)data->d_simulation_data)->my_en.e_pen,
+                    system->N );
+        }
+
+        /* reduction for E_Coa */
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad + 4 * system->N,
+                    &((simulation_data *)data->d_simulation_data)->my_en.e_coa,
+                    system->N );
+        }
+
+        /* reduction for ext_pres: per-block partial sums first, then a
+         * single-block pass over the BLOCKS_N partial results */
+        rvec_spad = (rvec *) (spad + 6 * system->N);
+        k_reduction_rvec <<< BLOCKS_N, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>>
+            ( rvec_spad, rvec_spad + system->N,  system->N );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+        k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof(rvec) * BLOCKS_POW_2_N >>>
+            ( rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS_N );
+        cudaThreadSynchronize ();
+        cudaCheckError( );
+//        Cuda_Reduction_Sum( rvec_spad,
+//                &((simulation_data *)data->d_simulation_data)->my_ext_press,
+//                system->N );
+
+        Cuda_Valence_Angles_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>>
+            ( system->d_my_atoms, (control_params *)control->d_control_params,
+              *dev_workspace, *(*dev_lists + BONDS), system->N );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+#if defined(DEBUG)
+        t_elapsed = Get_Timing_Info( t_start );
+
+        fprintf( stderr, "Three_Body_Interactions ...  Timing %lf \n",
+                t_elapsed );
+        fprintf( stderr, "Three_Body_Interactions Done... \n" );
+#endif
+
+        /* 5. Torsion Angles Interactions */
+#if defined(DEBUG)
+        t_start = Get_Time( );
+#endif
+
+        cuda_memset( spad, 0, 4 * sizeof(real) * system->n + sizeof(rvec) * system->n * 2,
+                "scratch" );
+
+        /* scratch layout: E_Tor at spad, E_Con at spad + 2n, ext_press
+         * contributions from spad + 4n */
+        Cuda_Torsion_Angles <<< BLOCKS, BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_fbp,
+              (control_params *) control->d_control_params, *(*dev_lists + BONDS),
+              *(*dev_lists + THREE_BODIES), *dev_workspace, system->n,
+              system->reax_param.num_atom_types, 
+              spad, spad + 2 * system->n, (rvec *) (spad + 4 * system->n) );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+        /* reduction for E_Tor */
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_tor,
+                    system->n );
+        }
+
+        /* reduction for E_Con */
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad + 2 * system->n,
+                    &((simulation_data *)data->d_simulation_data)->my_en.e_con,
+                    system->n );
+        }
+
+        /* reduction for ext_pres */
+        rvec_spad = (rvec *) (spad + 4 * system->n);
+        k_reduction_rvec <<< BLOCKS, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>>
+            ( rvec_spad, rvec_spad + system->n,  system->n );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+        k_reduction_rvec <<< 1, BLOCKS_POW_2, sizeof(rvec) * BLOCKS_POW_2 >>>
+                ( rvec_spad + system->n,
+                  &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+//        Cuda_Reduction_Sum( rvec_spad,
+//                &((simulation_data *)data->d_simulation_data)->my_ext_press,
+//                system->n );
+
+        Cuda_Torsion_Angles_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>>
+                ( system->d_my_atoms, *dev_workspace, *(*dev_lists + BONDS),
+                  system->N );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+#if defined(DEBUG)
+        t_elapsed = Get_Timing_Info( t_start );
+
+        fprintf( stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n",
+                cudaGetLastError( ), t_elapsed );
+        fprintf( stderr, " Four_Body_ Done... \n");
+#endif
+
+        /* 6. Hydrogen Bonds Interactions */
+        if ( control->hbond_cut > 0.0 && system->numH > 0 )
+        {
+#if defined(DEBUG)
+            t_start = Get_Time( );
+#endif
+
+            cuda_memset( spad, 0,
+                    2 * sizeof(real) * system->n + sizeof(rvec) * system->n * 2, "scratch" );
+
+//            hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) + 
+//                (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1);
+
+            /* scratch layout: E_HB at spad, ext_press contributions from
+             * spad + 2n */
+            Cuda_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE >>>
+//            Cuda_Hydrogen_Bonds_MT <<< hbs, HB_BLOCK_SIZE, 
+//                    HB_BLOCK_SIZE * (2 * sizeof(real) + 2 * sizeof(rvec)) >>>
+                    ( system->d_my_atoms, system->reax_param.d_sbp,
+                      system->reax_param.d_hbp, system->reax_param.d_gp,
+                      (control_params *) control->d_control_params,
+                      *dev_workspace, *(*dev_lists + BONDS), *(*dev_lists + HBONDS),
+                      system->n, system->reax_param.num_atom_types,
+                      spad, (rvec *) (spad + 2 * system->n), system->my_rank, data->step );
+            cudaThreadSynchronize( );
+            cudaCheckError( );
+
+//            if ( data->step == 10 )
+//            {
+//                Print_HBonds( system, data->step );
+//            }
+
+            /* reduction for E_HB */
+            if ( update_energy == TRUE )
+            {
+                Cuda_Reduction_Sum( spad,
+                        &((simulation_data *)data->d_simulation_data)->my_en.e_hb,
+                        system->n );
+            }
+
+            /* reduction for ext_pres */
+            rvec_spad = (rvec *) (spad + 2 * system->n);
+            k_reduction_rvec <<< BLOCKS, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>>
+                (rvec_spad, rvec_spad + system->n,  system->n);
+            cudaThreadSynchronize( );
+            cudaCheckError( );
+
+            k_reduction_rvec <<< 1, BLOCKS_POW_2, sizeof(rvec) * BLOCKS_POW_2 >>>
+                (rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS);
+            cudaThreadSynchronize( );
+            cudaCheckError( );
+//            Cuda_Reduction_Sum( rvec_spad,
+//                    &((simulation_data *)data->d_simulation_data)->my_ext_press,
+//                    system->n );
+
+            /* post process step1 */
+            Cuda_Hydrogen_Bonds_PostProcess <<< BLOCKS_N, BLOCK_SIZE, BLOCK_SIZE * sizeof(rvec) >>>
+                ( system->d_my_atoms, *dev_workspace,
+                  *(*dev_lists + BONDS), system->N );
+            cudaThreadSynchronize( );
+            cudaCheckError( );
+
+            /* post process step2 */
+//            hnbrs_blocks = (system->N * HB_POST_PROC_KER_THREADS_PER_ATOM / HB_POST_PROC_BLOCK_SIZE) +
+//                (((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1);
+
+            /* one 32-thread block per atom */
+            Cuda_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * sizeof(rvec) >>>
+                ( system->d_my_atoms, *dev_workspace, *(*dev_lists + HBONDS) );
+//            Cuda_Hydrogen_Bonds_HNbrs_BL <<< hnbrs_blocks, HB_POST_PROC_BLOCK_SIZE, 
+//                    HB_POST_PROC_BLOCK_SIZE * sizeof(rvec) >>>
+//                ( system->d_my_atoms, *dev_workspace, *(*dev_lists + HBONDS), system->N );
+            cudaThreadSynchronize( );
+            cudaCheckError( );
+
+#if defined(DEBUG)
+            t_elapsed = Get_Timing_Info( t_start );
+
+            fprintf( stderr,
+                    "Hydrogen bonds return value --> %d --- HydrogenBonds Timing %lf \n",
+                    cudaGetLastError( ), t_elapsed );
+            fprintf( stderr, "Hydrogen_Bond Done... \n" );
+#endif
+        }
+
+        /* all stages finished: the next call starts again at stage 1 */
+        compute_bonded_part1 = FALSE;
+    }
+
+    return ret;
+}
+
+
+/* Compute the nonbonded (van der Waals and Coulomb) interactions by
+ * forwarding to Cuda_NonBonded_Energy; the last argument tells it whether
+ * control->tabulate is non-zero.
+ *
+ * mpi_data is not used here. */
+void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, 
+        simulation_data *data, storage *workspace, 
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
+{
+    const bool tabulate_enabled = (control->tabulate != 0);
+
+    Cuda_NonBonded_Energy( system, control, workspace, data,
+            lists, out_control, tabulate_enabled );
+}
+
+
+/* Compute the total per-atom forces for this step.
+ *
+ * Cuda_Total_Forces is run first. When PURE_REAX is defined the device
+ * force array is then copied into the host scratch buffer, passed through
+ * Coll with the rvec MPI datatype, copied back to the device, and
+ * Cuda_Total_Forces_PURE is run.
+ *
+ * lists is not used here. */
+void Cuda_Compute_Total_Force( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, mpi_datatypes *mpi_data )
+{
+    rvec *f;
+
+    /* host-side staging buffer for the force array */
+    f = (rvec *) host_scratch;
+    memset( f, 0, sizeof(rvec) * system->N );
+
+    Cuda_Total_Forces( system, control, data, workspace );
+
+#if defined(PURE_REAX)
+    /* now all forces are computed to their partially-final values
+     * based on the neighbors information each processor has had.
+     * final values of force on each atom needs to be computed by adding up
+     * all partially-final pieces */
+
+    //MVAPICH2
+    copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N ,
+            cudaMemcpyDeviceToHost, "total_force:f:get" );
+
+    /* element size in bytes; this used to be sizeof(rvec) / sizeof(void),
+     * which has the same value only under the GNU extension that defines
+     * sizeof(void) as 1 and is ill-formed in standard C++ */
+    Coll( system, mpi_data, f, mpi_data->mpi_rvec,
+          sizeof(rvec), rvec_unpacker );
+
+    copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N,
+            cudaMemcpyHostToDevice, "total_force:f:put" );
+
+    Cuda_Total_Forces_PURE( system, dev_workspace );
+#endif
+
+}
+
+
+/* Top-level force computation for one step: initialization, bonded
+ * interactions, charges (PURE_REAX only), nonbonded interactions and the
+ * total force.
+ *
+ * The function is written to be called again after a failure: the static
+ * flag init_forces_done is set once initialization succeeds and is only
+ * cleared after the whole pipeline has completed, so a repeated call
+ * following a bonded-stage failure skips initialization.
+ *
+ * NOTE(review): Cuda_Init_Forces_No_Charges currently always returns
+ * FAILURE, so calls on which charge_flag is FALSE cannot get past the
+ * initialization stage - confirm this is intended.
+ *
+ * returns: SUCCESS when every stage completed; otherwise the non-SUCCESS
+ *   value returned by the initialization or the bonded stage */
+int Cuda_Compute_Forces( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
+{
+    int charge_flag, retVal;
+    /* TRUE when initialization succeeded but a later stage still has to run */
+    static int init_forces_done = FALSE;
+
+#if defined(LOG_PERFORMANCE)
+    real t_start = 0;
+
+    //MPI_Barrier( MPI_COMM_WORLD );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        t_start = Get_Time( );
+    }
+#endif
+
+    retVal = SUCCESS;
+
+    /********* init forces ************/
+    /* charges are updated every charge_freq steps, counted from prev_steps;
+     * a charge_freq of 0 disables the update */
+    if ( control->charge_freq && (data->step - data->prev_steps) % control->charge_freq == 0 )
+    {
+        charge_flag = TRUE;
+    }
+    else
+    {
+        charge_flag = FALSE;
+    }
+
+    if ( init_forces_done == FALSE )
+    {
+        if ( charge_flag == TRUE )
+        {
+            retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );
+        }
+        else
+        {
+            retVal = Cuda_Init_Forces_No_Charges( system, control, data, workspace, lists, out_control );
+        }
+
+        if ( retVal == SUCCESS )
+        {
+            init_forces_done = TRUE;
+        }
+    }
+
+    if ( retVal == SUCCESS )
+    {
+        //validate_sparse_matrix( system, workspace );
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.init_forces) );
+        }
+#endif
+
+        /********* bonded interactions ************/
+        retVal = Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.bonded) );
+        }
+#endif
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: completed bonded\n",
+                 system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+    }
+
+    /* retVal now reflects the bonded stage; the remaining stages run only
+     * if it succeeded */
+    if ( retVal == SUCCESS )
+    {
+    /**************** charges ************************/
+#if defined(PURE_REAX)
+        if ( charge_flag == TRUE )
+        {
+            Cuda_QEq( system, control, data, workspace, out_control, mpi_data );
+        }
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.cm) );
+        }
+#endif
+
+#if defined(DEBUG_FOCUS)
+        fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+#endif //PURE_REAX
+
+        /********* nonbonded interactions ************/
+        Cuda_Compute_NonBonded_Forces( system, control, data, workspace,
+                lists, out_control, mpi_data );
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.nonb) );
+        }
+#endif
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n",
+                system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        /*********** total force ***************/
+        Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data );
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            /* NOTE(review): the total-force stage is passed timing.bonded,
+             * the same field as the bonded stage - confirm intended */
+            Update_Timing_Info( &t_start, &(data->timing.bonded) );
+        }
+#endif
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: total forces computed\n",
+                system->my_rank, data->step );
+//        Print_Total_Force( system, data, workspace );
+        MPI_Barrier( MPI_COMM_WORLD );
+
+#endif
+
+        /* the whole pipeline finished: initialize again on the next call */
+        init_forces_done = FALSE;
+    }
+
+    return retVal;
+}
diff --git a/PG-PuReMD/src/cuda/cuda_forces.h b/PG-PuReMD/src/cuda/cuda_forces.h
new file mode 100644
index 0000000000000000000000000000000000000000..94d3b73f8ba30d35c8589b0e111952da41a3c116
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_forces.h
@@ -0,0 +1,49 @@
+
+#ifndef __CUDA_FORCES_H__
+#define __CUDA_FORCES_H__
+
+#include "../reax_types.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Index initialization helpers for the hbond list, the bond list, a
+ * sparse matrix and the three-body list. */
+void Cuda_Init_HBond_Indices( reax_system * );
+
+void Cuda_Init_Bond_Indices( reax_system * );
+
+void Cuda_Init_Sparse_Matrix_Indices( reax_system *, sparse_matrix * );
+
+void Cuda_Init_Three_Body_Indices( int *, int );
+
+/* Called by Cuda_Init_Forces when a reallocation flag was raised; the four
+ * int arguments are passed there as the bonds, hbonds and cm flags followed
+ * by the step number. */
+void Cuda_Estimate_Storages( reax_system *, control_params *, reax_list **,
+        int, int, int, int );
+
+/* NOTE(review): this prototype takes six parameters, while the call in
+ * Cuda_Compute_Bonded_Forces passes five (a single int * after the list
+ * pointer) - confirm it matches the definition. */
+int Cuda_Estimate_Storage_Three_Body( reax_system *, control_params *,
+        int, reax_list **, int *, int * );
+
+/* Returns SUCCESS, or FAILURE when a list has to be reallocated. */
+int Cuda_Init_Forces( reax_system *, control_params *, simulation_data *,
+        storage *, reax_list **, output_controls * );
+
+/* Not implemented yet: always returns FAILURE. */
+int Cuda_Init_Forces_No_Charges( reax_system *, control_params *, simulation_data *,
+        storage *, reax_list **, output_controls * );
+
+/* Returns SUCCESS, or the non-SUCCESS result of the three-body storage
+ * estimate. */
+int Cuda_Compute_Bonded_Forces( reax_system *, control_params *, simulation_data *,
+        storage *, reax_list **, output_controls * );
+
+/* van der Waals and Coulomb interactions. */
+void Cuda_Compute_NonBonded_Forces( reax_system *, control_params *,
+        simulation_data *, storage *, reax_list **, output_controls *,
+        mpi_datatypes * );
+
+/* Full force pipeline for one step; returns SUCCESS or the non-SUCCESS value
+ * of the stage that failed. */
+int Cuda_Compute_Forces( reax_system*, control_params*, simulation_data*,
+        storage*, reax_list**, output_controls*, mpi_datatypes* );
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_helpers.h b/PG-PuReMD/src/cuda/cuda_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..b14f45b331b5ff6bfcee4d3ac2a9c8df626b775b
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_helpers.h
@@ -0,0 +1,68 @@
+#ifndef __CUDA_HELPERS__
+#define __CUDA_HELPERS__
+
+#include "../reax_types.h"
+
+
+/* Compares a against b over at most len characters.
+ * Returns TRUE as soon as a mismatching character is found, and FALSE when
+ * b's terminating '\0' is reached or all len characters match. Note that
+ * the terminator test looks at b only and runs before the comparison, so
+ * a longer a whose leading characters equal b also yields FALSE. */
+CUDA_DEVICE static inline int cuda_strcmp( char * a,
+        char * b, int len )
+{
+    int pos;
+
+    for ( pos = 0; pos < len; ++pos )
+    {
+        /* b is exhausted: nothing further to compare */
+        if ( b[pos] == '\0' )
+        {
+            break;
+        }
+
+        if ( a[pos] != b[pos] )
+        {
+            return TRUE;
+        }
+    }
+
+    /* no mismatch was seen */
+    return FALSE;
+}
+
+
+/* Atomic add built from an atomicCAS retry loop on the 64-bit image of *address;
+ * returns the prior value. NOTE(review): assumes real is double -- confirm. */
+CUDA_DEVICE static inline real myatomicAdd( real* address, real val )
+{
+    unsigned long long int *word = (unsigned long long int *) address;
+    unsigned long long int expected, seen = *word;
+
+    do
+    {
+        expected = seen;
+        seen = atomicCAS( word, expected,
+                __double_as_longlong( val + __longlong_as_double(expected) ) );
+    } while ( expected != seen ); /* retry if *word changed since it was read */
+    return __longlong_as_double( seen );
+}
+
+
+/* ret[d] += v[d] for d = 0..2; each component is a separate myatomicAdd call. */
+CUDA_DEVICE static inline void atomic_rvecAdd( rvec ret, rvec v )
+{
+    int d;
+    for ( d = 0; d < 3; ++d ) { myatomicAdd( &ret[d], v[d] ); }
+}
+
+
+/* ret[d] += c * v[d] for d = 0..2; each component is a separate myatomicAdd call. */
+CUDA_DEVICE static inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
+{
+    int d;
+    for ( d = 0; d < 3; ++d ) { myatomicAdd( &ret[d], c * v[d] ); }
+}
+
+#endif
diff --git a/PG-PuReMD/src/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
similarity index 50%
rename from PG-PuReMD/src/cuda_hydrogen_bonds.cu
rename to PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
index 358c5073ec2232d1bfddae0a4b46dd89d235cb72..9043e424db5215211f71e1d912c57d073673986d 100644
--- a/PG-PuReMD/src/cuda_hydrogen_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
@@ -19,43 +19,32 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "reax_types.h"
-#include "index_utils.h"
-
 #include "cuda_hydrogen_bonds.h"
+
 #include "cuda_valence_angles.h"
 #include "cuda_helpers.h"
-#include "dev_list.h"
-#include "vector.h"
-
+#include "cuda_list.h"
 #include "cuda_shuffle.h"
 
+#include "../index_utils.h"
+#include "../vector.h"
 
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, 
-        single_body_parameters *sbp, 
-        hbond_parameters *d_hbp,
-        global_parameters gp, 
-        control_params *control, 
-        storage p_workspace, 
-        reax_list p_bonds, 
-        reax_list p_hbonds, 
-        int n, 
-        int num_atom_types, 
-        real *data_e_hb, 
-        rvec *data_ext_press)
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameters *sbp, 
+        hbond_parameters *d_hbp, global_parameters gp, control_params *control, 
+        storage p_workspace, reax_list p_bonds, reax_list p_hbonds, int n, 
+        int num_atom_types, real *data_e_hb, rvec *data_ext_press, int rank, int step )
 {
-    int  i, j, k, pi, pk;
-    int  type_i, type_j, type_k;
-    int  start_j, end_j, hb_start_j, hb_end_j;
-    int  hblist[MAX_BONDS];
-    int  itr, top;
-    int  num_hb_intrs = 0;
+    int i, j, k, pi, pk;
+    int type_i, type_j, type_k;
+    int start_j, end_j, hb_start_j, hb_end_j;
+    int hblist[MAX_BONDS];
+    int itr, top;
     ivec rel_jk;
-    real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+    real r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
     real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
     rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
     rvec dvec_jk, force, ext_press;
-    // rtensor temp_rtensor, total_rtensor;
     hbond_parameters *hbp;
     bond_order_data *bo_ij;
     bond_data *pbond_ij;
@@ -63,47 +52,57 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms,
     reax_list *bonds, *hbonds;
     bond_data *bond_list;
     hbond_data *hbond_list, *hbond_jk;
-    storage *workspace = &( p_workspace );
+    storage *workspace;
+
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( j >= n )
+    {
+        return;
+    }
 
     bonds = &( p_bonds );
     bond_list = bonds->select.bond_list;
-    hbonds = & ( p_hbonds );
+    hbonds = &( p_hbonds );
     hbond_list = hbonds->select.hbond_list;
-
-    j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j >= n) return;
+    workspace = &( p_workspace );
 
     /* loops below discover the Hydrogen bonds between i-j-k triplets.
-       here j is H atom and there has to be some bond between i and j.
-       Hydrogen bond is between j and k.
-       so in this function i->X, j->H, k->Z when we map 
-       variables onto the ones in the handout.*/
+     * here j is H atom and there has to be some bond between i and j.
+     * Hydrogen bond is between j and k.
+     * so in this function i->X, j->H, k->Z when we map 
+     * variables onto the ones in the handout. */
     //for( j = 0; j < system->n; ++j )
-    /* j has to be of type H */
-    if( sbp[ my_atoms[j].type ].p_hbond == 1 ) {
-        /*set j's variables */
-        type_j     = my_atoms[j].type;
-        start_j    = Dev_Start_Index(j, bonds);
-        end_j      = Dev_End_Index(j, bonds);
+    if ( sbp[ my_atoms[j].type ].p_hbond == H_ATOM )
+    {
+        type_j = my_atoms[j].type;
+        start_j = Dev_Start_Index( j, bonds );
+        end_j = Dev_End_Index( j, bonds );
         hb_start_j = Dev_Start_Index( my_atoms[j].Hindex, hbonds );
-        hb_end_j   = Dev_End_Index( my_atoms[j].Hindex, hbonds );
+        hb_end_j = Dev_End_Index( my_atoms[j].Hindex, hbonds );
 
         top = 0;
-        for( pi = start_j; pi < end_j; ++pi )  {
+        /* search bonded atoms to atom j (i.e., hydrogen atom) for potential hydrogen bonding */
+        for ( pi = start_j; pi < end_j; ++pi )
+        {
             pbond_ij = &( bond_list[pi] );
             i = pbond_ij->nbr;
             bo_ij = &(pbond_ij->bo_data);
             type_i = my_atoms[i].type;
 
-            if( sbp[type_i].p_hbond == 2 && 
+            if ( sbp[type_i].p_hbond == H_BONDING_ATOM && 
                     bo_ij->BO >= HB_THRESHOLD )
+            {
                 hblist[top++] = pi;
+            }
         }
 
-        // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
-        //          j, top, hb_start_j, hb_end_j );
+//        fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n",
+//                j, top, hb_start_j, hb_end_j );
 
-        for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
+        /* for each hbond of atom j */
+        for ( pk = hb_start_j; pk < hb_end_j; ++pk )
+        {
             /* set k's varibles */
             k = hbond_list[pk].nbr;
             type_k = my_atoms[k].type;
@@ -111,20 +110,21 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms,
             r_jk = nbr_jk->d;
             rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
 
-            hbond_jk = &( hbond_list [pk] );
-            rvec_MakeZero (hbond_jk->hb_f);
+            hbond_jk = &( hbond_list[pk] );
+            rvec_MakeZero( hbond_jk->hb_f );
 
-            for( itr = 0; itr < top; ++itr ) {
+            /* find matching hbond to atom k */
+            for ( itr = 0; itr < top; ++itr )
+            {
                 pi = hblist[itr];
                 pbond_ij = &( bonds->select.bond_list[pi] );
                 i = pbond_ij->nbr;
 
-                if( my_atoms[i].orig_id != my_atoms[k].orig_id ) {
+                if ( my_atoms[i].orig_id != my_atoms[k].orig_id )
+                {
                     bo_ij = &(pbond_ij->bo_data);
                     type_i = my_atoms[i].type;
-                    r_ij = pbond_ij->d;         
-                    hbp = &(d_hbp[ index_hbp (type_i,type_j,type_k,num_atom_types) ]);
-                    ++num_hb_intrs;
+                    hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]);
 
                     Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
                             &theta, &cos_theta );
@@ -133,24 +133,42 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms,
                             &dcos_theta_di, &dcos_theta_dj, 
                             &dcos_theta_dk );
 
-                    /* hyrogen bond energy*/
-                    sin_theta2 = SIN( theta/2.0 );
-                    sin_xhz4 = SQR(sin_theta2);
+//                    if ( j == 0 && k == 36 && step == 10 && rank == 0 )
+//                    {
+//                        printf( "[0] p%05d, top = %d, itr = %d, MAX_BONDS = %d\n", rank, top, itr, MAX_BONDS );
+//                        printf( "[1] p%05d %05d, %05d: %12.5f %12.5f %12.5f %12.5f %12.5f\n", rank, j, k, theta, cos_theta, dcos_theta_di,
+//                                dcos_theta_dj, dcos_theta_dk );
+//                    }
+
+                    /* hydrogen bond energy */
+                    sin_theta2 = SIN( theta / 2.0 );
+                    sin_xhz4 = SQR( sin_theta2 );
                     sin_xhz4 *= sin_xhz4;
                     cos_xhz1 = ( 1.0 - cos_theta );
                     exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
                     exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
                                 r_jk / hbp->r0_hb - 2.0 ) );
 
-                    //data_e_hb [j] += 
+//                    if ( j == 0 && k == 36 && step == 10 && rank == 0 )
+//                    {
+//                        printf( "[2] p%05d %05d, %05d: %12.5f %12.5f %12.5f %12.5f %12.5f\n", rank, j, k,
+//                                sin_theta2, sin_xhz4, cos_xhz1, exp_hb2, exp_hb3 );
+//                        printf( "[3] p%05d %05d, %05d: %12.5f %12.5f %12.5f\n", rank, j, k,
+//                                hbp->p_hb3, hbp->r0_hb, r_jk );
+//                    }
+
                     e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-                    data_e_hb [j] += e_hb;
+                    data_e_hb[j] += e_hb;
 
                     CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
                     CEhb2 = -hbp->p_hb1/2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
                     CEhb3 = -hbp->p_hb3 * 
                         (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb;
 
+//                    if ( j == 0 && k == 36 && step == 10 && rank == 0 )
+//                        printf( "[4] p%05d %05d, %05d: %12.5f %12.5f %12.5f %12.5f %12.5f\n", rank, j, k,
+//                                e_hb, data_e_hb[j], CEhb1, CEhb2, CEhb3 );
+
                     /*fprintf( stdout, 
                       "%6d%6d%6d%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n",
                       system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
@@ -161,10 +179,11 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms,
                     /* hydrogen bond forces */
                     bo_ij->Cdbo += CEhb1; // dbo term
 
-                    if( control->virial == 0 ) {
-                        // dcos terms
+                    if ( control->virial == 0 )
+                    {
+                        /* dcos terms */
                         //rvec_ScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di ); 
-                        //atomic_rvecScaledAdd (workspace->f[i], +CEhb2, dcos_theta_di );
+                        //atomic_rvecScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di );
                         rvec_ScaledAdd( pbond_ij->hb_f, +CEhb2, dcos_theta_di ); 
 
                         rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj );
@@ -173,21 +192,22 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms,
                         //atomic_rvecScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk );
                         rvec_ScaledAdd( hbond_jk->hb_f, +CEhb2, dcos_theta_dk );
 
-                        // dr terms
+                        /* dr terms */
                         rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); 
 
                         //rvec_ScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk );
                         //atomic_rvecScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk );
                         rvec_ScaledAdd( hbond_jk->hb_f, +CEhb3/r_jk, dvec_jk );
                     }
-                    else {
+                    else
+                    {
                         /* for pressure coupling, terms that are not related to bond order
-                           derivatives are added directly into pressure vector/tensor */
+                         * derivatives are added directly into pressure vector/tensor */
                         rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
                         //rvec_Add( workspace->f[i], force );
                         rvec_Add( pbond_ij->hb_f, force );
                         rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                        rvec_ScaledAdd( data_ext_press [j], 1.0, ext_press );
+                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
 
                         rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj );
 
@@ -197,10 +217,10 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms,
                         rvec_Add( hbond_jk->hb_f, force );
                         rvec_iMultiply( ext_press, rel_jk, force );
                         rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
-                        // dr terms
+                        /* dr terms */
                         rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); 
 
-                        rvec_Scale( force, CEhb3/r_jk, dvec_jk );
+                        rvec_Scale( force, CEhb3 / r_jk, dvec_jk );
                         //rvec_Add( workspace->f[k], force );
                         rvec_Add( hbond_jk->hb_f, force );
                         rvec_iMultiply( ext_press, rel_jk, force );
@@ -222,6 +242,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms,
                             system->my_atoms[k].orig_id, 
                             r_jk, theta, bo_ij->BO, e_hb, data->my_en.e_hb );       
 #endif
+
 #ifdef TEST_FORCES
                     Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb ); //dbo term
                     // dcos terms
@@ -239,60 +260,40 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms,
 }
 
 
-
 //CUDA_GLOBAL void __launch_bounds__ (256, 4) Cuda_Hydrogen_Bonds_MT ( reax_atom *my_atoms, 
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT ( reax_atom *my_atoms, 
-        single_body_parameters *sbp, 
-        hbond_parameters *d_hbp,
-        global_parameters gp, 
-        control_params *control, 
-        storage p_workspace, 
-        reax_list p_bonds, 
-        reax_list p_hbonds, 
-        int n, 
-        int num_atom_types, 
-        real *data_e_hb, 
-        rvec *data_ext_press)
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *my_atoms, single_body_parameters *sbp, 
+        hbond_parameters *d_hbp, global_parameters gp, control_params *control, 
+        storage p_workspace, reax_list p_bonds, reax_list p_hbonds, int n, 
+        int num_atom_types, real *data_e_hb, rvec *data_ext_press )
 {
-
 #if defined( __SM_35__)
     real sh_hb;
     real sh_cdbo;
     rvec sh_atomf;
     rvec sh_hf;
 #else
-
     extern __shared__ real t_hb[];
     extern __shared__ rvec t__f[];
     extern __shared__ rvec t_cdbo[];
     extern __shared__ rvec t_hf [];
-
     real *sh_hb = t_hb;
     real *sh_cdbo = t_hb + blockDim.x;
     rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x);
-    rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x);
+    rvec *sh_hf = (rvec *)(sh_atomf + blockDim.x);
 #endif
-
-    int __THREADS_PER_ATOM__ = HB_KER_THREADS_PER_ATOM;
-
-    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    int warp_id = thread_id / __THREADS_PER_ATOM__;
-    int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); 
-
-    if (warp_id >= n ) return;
-
-    int  i, j, k, pi, pk;
-    int  type_i, type_j, type_k;
-    int  start_j, end_j, hb_start_j, hb_end_j;
-    int  hblist[MAX_BONDS];
-    int  itr, top;
-    int  num_hb_intrs = 0;
+    int __THREADS_PER_ATOM__, thread_id, group_id, lane_id; 
+    int i, j, k, pi, pk;
+    int type_i, type_j, type_k;
+    int start_j, end_j, hb_start_j, hb_end_j;
+    //TODO: re-write and remove
+    int hblist[MAX_BONDS];
+    int itr, top;
+    int loopcount, count;
     ivec rel_jk;
-    real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+    real r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
     real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
     rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
     rvec dvec_jk, force, ext_press;
-    // rtensor temp_rtensor, total_rtensor;
     hbond_parameters *hbp;
     bond_order_data *bo_ij;
     bond_data *pbond_ij;
@@ -300,18 +301,24 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT ( reax_atom *my_atoms,
     reax_list *bonds, *hbonds;
     bond_data *bond_list;
     hbond_data *hbond_list, *hbond_jk;
-    storage *workspace = &( p_workspace );
+    storage *workspace;
 
+    __THREADS_PER_ATOM__ = HB_KER_THREADS_PER_ATOM;
+    thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    group_id = thread_id / __THREADS_PER_ATOM__;
+    lane_id = thread_id & (__THREADS_PER_ATOM__ - 1); 
+
+    if ( group_id >= n )
+    {
+        return;
+    }
+
+    workspace = &( p_workspace );
     bonds = &( p_bonds );
     bond_list = bonds->select.bond_list;
-    hbonds = & ( p_hbonds );
+    hbonds = &( p_hbonds );
     hbond_list = hbonds->select.hbond_list;
-
-    /*
-       j = blockIdx.x * blockDim.x + threadIdx.x;
-       if (j >= n) return;
-     */
-    j = warp_id;
+    j = group_id;
 
     /* loops below discover the Hydrogen bonds between i-j-k triplets.
        here j is H atom and there has to be some bond between i and j.
@@ -321,63 +328,67 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT ( reax_atom *my_atoms,
     //for( j = 0; j < system->n; ++j )
 
 #if defined( __SM_35__)
-    sh_hb  = 0;
-    rvec_MakeZero ( sh_atomf );
+    sh_hb = 0;
+    rvec_MakeZero( sh_atomf );
 #else
-    sh_hb [threadIdx.x] = 0;
-    rvec_MakeZero ( sh_atomf[ threadIdx.x] );
+    sh_hb[threadIdx.x] = 0;
+    rvec_MakeZero( sh_atomf[threadIdx.x] );
 #endif
 
     /* j has to be of type H */
-    if( sbp[ my_atoms[j].type ].p_hbond == 1 ) {
-        /*set j's variables */
-        type_j     = my_atoms[j].type;
-        start_j    = Dev_Start_Index(j, bonds);
-        end_j      = Dev_End_Index(j, bonds);
+    if ( sbp[ my_atoms[j].type ].p_hbond == H_ATOM )
+    {
+        /* set j's variables */
+        type_j = my_atoms[j].type;
+        start_j = Dev_Start_Index(j, bonds);
+        end_j = Dev_End_Index(j, bonds);
         hb_start_j = Dev_Start_Index( my_atoms[j].Hindex, hbonds );
-        hb_end_j   = Dev_End_Index( my_atoms[j].Hindex, hbonds );
+        hb_end_j = Dev_End_Index( my_atoms[j].Hindex, hbonds );
 
         top = 0;
-        for( pi = start_j; pi < end_j; ++pi )  {
+        for ( pi = start_j; pi < end_j; ++pi ) 
+        {
             pbond_ij = &( bond_list[pi] );
             i = pbond_ij->nbr;
             bo_ij = &(pbond_ij->bo_data);
             type_i = my_atoms[i].type;
 
-            if( sbp[type_i].p_hbond == 2 && 
+            if ( sbp[type_i].p_hbond == H_BONDING_ATOM && 
                     bo_ij->BO >= HB_THRESHOLD )
+            {
                 hblist[top++] = pi;
+            }
         }
 
-        // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
-        //          j, top, hb_start_j, hb_end_j );
+//        fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n",
+//                j, top, hb_start_j, hb_end_j );
 
-        for( itr = 0; itr < top; ++itr ) {
+        for ( itr = 0; itr < top; ++itr )
+        {
             pi = hblist[itr];
             pbond_ij = &( bonds->select.bond_list[pi] );
             i = pbond_ij->nbr;
 
 #if defined( __SM_35__)
-            rvec_MakeZero (sh_hf );
-            sh_cdbo  = 0;
+            rvec_MakeZero( sh_hf );
+            sh_cdbo = 0;
 #else
-            rvec_MakeZero (sh_hf [threadIdx.x]);
-            sh_cdbo [threadIdx.x] = 0;
+            rvec_MakeZero( sh_hf[threadIdx.x] );
+            sh_cdbo[threadIdx.x] = 0;
 #endif
 
-
             //for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
-            int loopcount = (hb_end_j - hb_start_j) / HB_KER_THREADS_PER_ATOM + 
+            loopcount = (hb_end_j - hb_start_j) / HB_KER_THREADS_PER_ATOM + 
                 (((hb_end_j - hb_start_j) % HB_KER_THREADS_PER_ATOM == 0) ? 0 : 1);
 
-            int count = 0;
+            count = 0;
             pk = hb_start_j + lane_id;
-            while (count < loopcount)
+            while ( count < loopcount )
             {
-
-                if (pk < hb_end_j)
+                /* only allow threads with an actual hbond */
+                if ( pk < hb_end_j )
                 {
-                    hbond_jk = &( hbond_list [pk] );
+                    hbond_jk = &( hbond_list[pk] );
 
                     /* set k's varibles */
                     k = hbond_list[pk].nbr;
@@ -386,27 +397,25 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT ( reax_atom *my_atoms,
                     r_jk = nbr_jk->d;
                     rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
                 }
-                else k = -1;
-
-
-                if( (my_atoms[i].orig_id != my_atoms[k].orig_id)
-                        && (k != -1) ) {
+                else
+                {
+                    k = -1;
+                }
 
+                if ( my_atoms[i].orig_id != my_atoms[k].orig_id && k != -1 )
+                {
                     bo_ij = &(pbond_ij->bo_data);
                     type_i = my_atoms[i].type;
-                    r_ij = pbond_ij->d;         
-                    hbp = &(d_hbp[ index_hbp (type_i,type_j,type_k,num_atom_types) ]);
-                    ++num_hb_intrs;
+                    hbp = &(d_hbp[ index_hbp(type_i,type_j,type_k,num_atom_types) ]);
 
                     Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
                             &theta, &cos_theta );
                     /* the derivative of cos(theta) */
                     Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                            &dcos_theta_di, &dcos_theta_dj, 
-                            &dcos_theta_dk );
+                            &dcos_theta_di, &dcos_theta_dj, &dcos_theta_dk );
 
-                    /* hyrogen bond energy*/
-                    sin_theta2 = SIN( theta/2.0 );
+                    /* hydrogen bond energy */
+                    sin_theta2 = SIN( theta / 2.0 );
                     sin_xhz4 = SQR(sin_theta2);
                     sin_xhz4 *= sin_xhz4;
                     cos_xhz1 = ( 1.0 - cos_theta );
@@ -414,14 +423,11 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT ( reax_atom *my_atoms,
                     exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
                                 r_jk / hbp->r0_hb - 2.0 ) );
 
-                    //data_e_hb [j] += 
                     e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-                    //data_e_hb [j] += e_hb;
-
 #if defined( __SM_35__)
                     sh_hb += e_hb;
 #else
-                    sh_hb [threadIdx.x] += e_hb;
+                    sh_hb[threadIdx.x] += e_hb;
 #endif
 
                     CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
@@ -437,48 +443,40 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT ( reax_atom *my_atoms,
                       exp_hb3, sin_xhz4, e_hb ); */
 
                     /* hydrogen bond forces */
-                    //        bo_ij->Cdbo += CEhb1; // dbo term
 #if defined( __SM_35__)
-                    sh_cdbo += CEhb1;
+                    sh_cdbo += CEhb1; // dbo term
 #else
-                    sh_cdbo[threadIdx.x] += CEhb1;
+                    sh_cdbo[threadIdx.x] += CEhb1; // dbo term
 #endif
 
-                    if( control->virial == 0 ) {
+                    if ( control->virial == 0 )
+                    {
                         // dcos terms
-                        //rvec_ScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di ); 
-                        //atomic_rvecScaledAdd (workspace->f[i], +CEhb2, dcos_theta_di );
-                        //rvec_ScaledAdd( pbond_ij->hb_f, +CEhb2, dcos_theta_di ); 
 #if defined( __SM_35__)
-                        rvec_ScaledAdd( sh_hf , +CEhb2, dcos_theta_di ); 
+                        rvec_ScaledAdd( sh_hf, +CEhb2, dcos_theta_di ); 
 #else
-                        rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); 
+                        rvec_ScaledAdd( sh_hf[threadIdx.x], +CEhb2, dcos_theta_di ); 
 #endif
 
-                        //rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj );
 #if defined( __SM_35__)
-                        rvec_ScaledAdd( sh_atomf , +CEhb2, dcos_theta_dj );
+                        rvec_ScaledAdd( sh_atomf, +CEhb2, dcos_theta_dj );
 #else
-                        rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj );
+                        rvec_ScaledAdd( sh_atomf[threadIdx.x], +CEhb2, dcos_theta_dj );
 #endif
 
-                        //rvec_ScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk );
-                        //atomic_rvecScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk );
                         rvec_ScaledAdd( hbond_jk->hb_f, +CEhb2, dcos_theta_dk );
 
                         // dr terms
-                        //rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); 
 #if defined( __SM_35__)
-                        rvec_ScaledAdd( sh_atomf , -CEhb3/r_jk, dvec_jk ); 
+                        rvec_ScaledAdd( sh_atomf, -CEhb3/r_jk, dvec_jk ); 
 #else
-                        rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk ); 
+                        rvec_ScaledAdd( sh_atomf[threadIdx.x], -CEhb3/r_jk, dvec_jk ); 
 #endif
 
-                        //rvec_ScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk );
-                        //atomic_rvecScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk );
                         rvec_ScaledAdd( hbond_jk->hb_f, +CEhb3/r_jk, dvec_jk );
                     }
-                    else {
+                    else
+                    {
                         /* for pressure coupling, terms that are not related to bond order
                            derivatives are added directly into pressure vector/tensor */
                         rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
@@ -508,255 +506,333 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT ( reax_atom *my_atoms,
                 } //orid id end
 
                 pk += __THREADS_PER_ATOM__;
-                count ++;
+                count++;
 
             } //for itr loop end
 
             //Reduction here
 #if defined( __SM_35__)
-            for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){
+            for ( int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2 )
+            {
                 sh_cdbo += shfl( sh_cdbo, s);
                 sh_hf[0] += shfl( sh_hf[0], s);
                 sh_hf[1] += shfl( sh_hf[1], s);
                 sh_hf[2] += shfl( sh_hf[2], s);
             }
             //end of the shuffle
-            if (lane_id == 0) {
+            if ( lane_id == 0 )
+            {
                 bo_ij->Cdbo += sh_cdbo ;
-                rvec_Add (pbond_ij->hb_f, sh_hf );
+                rvec_Add( pbond_ij->hb_f, sh_hf );
             }
 #else
-            if (lane_id < 16) {
-                sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16];
-                rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]);
+            if ( lane_id < 16 )
+            {
+                sh_cdbo[threadIdx.x] += sh_cdbo[threadIdx.x + 16];
+                rvec_Add( sh_hf [threadIdx.x], sh_hf[threadIdx.x + 16] );
             }
-            if (lane_id < 8) {
-                sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8];
-                rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]);
+            if ( lane_id < 8 )
+            {
+                sh_cdbo[threadIdx.x] += sh_cdbo[threadIdx.x + 8];
+                rvec_Add( sh_hf [threadIdx.x], sh_hf[threadIdx.x + 8] );
             }
-            if (lane_id < 4) {
-                sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4];
-                rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]);
+            if ( lane_id < 4 )
+            {
+                sh_cdbo[threadIdx.x] += sh_cdbo[threadIdx.x + 4];
+                rvec_Add( sh_hf [threadIdx.x], sh_hf[threadIdx.x + 4] );
             }
-            if (lane_id < 2) {
-                sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2];
-                rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]);
+            if ( lane_id < 2 )
+            {
+                sh_cdbo[threadIdx.x] += sh_cdbo[threadIdx.x + 2];
+                rvec_Add( sh_hf [threadIdx.x], sh_hf[threadIdx.x + 2] );
             }
-            if (lane_id < 1) {
-                sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1];
-                rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]);
+            if ( lane_id < 1 )
+            {
+                sh_cdbo[threadIdx.x] += sh_cdbo[threadIdx.x + 1];
+                rvec_Add( sh_hf [threadIdx.x], sh_hf[threadIdx.x + 1] );
 
-                bo_ij->Cdbo += sh_cdbo [threadIdx.x];
-                rvec_Add (pbond_ij->hb_f, sh_hf [threadIdx.x]);
+                bo_ij->Cdbo += sh_cdbo[threadIdx.x];
+                rvec_Add( pbond_ij->hb_f, sh_hf[threadIdx.x] );
             }
 #endif
-
-
         } // for loop hbonds end
-        } //if Hbond check end
+    } //if Hbond check end
 
 #if defined( __SM_35__)
-        for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){
-            sh_hb += shfl( sh_hb, s);
-            sh_atomf[0] += shfl( sh_atomf[0], s);
-            sh_atomf[1] += shfl( sh_atomf[1], s);
-            sh_atomf[2] += shfl( sh_atomf[2], s);
-        }
-        if (lane_id == 0){
-            data_e_hb[j] += sh_hb;
-            rvec_Add (workspace->f[j], sh_atomf);
-        }
-
-
+    for ( int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2 )
+    {
+        sh_hb += shfl( sh_hb, s);
+        sh_atomf[0] += shfl( sh_atomf[0], s);
+        sh_atomf[1] += shfl( sh_atomf[1], s);
+        sh_atomf[2] += shfl( sh_atomf[2], s);
+    }
+    if ( lane_id == 0 )
+    {
+        data_e_hb[j] += sh_hb;
+        rvec_Add( workspace->f[j], sh_atomf );
+    }
 #else
-        if (lane_id < 16){
-            sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
-            rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] );
-        }
-        if (lane_id < 8){
-            sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
-            rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] );
-        }
-        if (lane_id < 4){
-            sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
-            rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] );
-        }
-        if (lane_id < 2){
-            sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
-            rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] );
-        }
-        if (lane_id < 1){
-            sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
-            rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
-
-            data_e_hb[j] += sh_hb [threadIdx.x];
-            rvec_Add (workspace->f[j], sh_atomf [threadIdx.x]);
-        }
-#endif
-
+    if ( lane_id < 16 )
+    {
+        sh_hb[threadIdx.x] += sh_hb[threadIdx.x + 16];
+        rvec_Add ( sh_atomf [threadIdx.x], sh_atomf[threadIdx.x + 16] );
+    }
+    if ( lane_id < 8 )
+    {
+        sh_hb[threadIdx.x] += sh_hb[threadIdx.x + 8];
+        rvec_Add ( sh_atomf [threadIdx.x], sh_atomf[threadIdx.x + 8] );
+    }
+    if ( lane_id < 4 )
+    {
+        sh_hb[threadIdx.x] += sh_hb[threadIdx.x + 4];
+        rvec_Add ( sh_atomf [threadIdx.x], sh_atomf[threadIdx.x + 4] );
     }
+    if ( lane_id < 2 )
+    {
+        sh_hb[threadIdx.x] += sh_hb[threadIdx.x + 2];
+        rvec_Add ( sh_atomf [threadIdx.x], sh_atomf[threadIdx.x + 2] );
+    }
+    if ( lane_id < 1 )
+    {
+        sh_hb[threadIdx.x] += sh_hb[threadIdx.x + 1];
+        rvec_Add ( sh_atomf [threadIdx.x], sh_atomf[threadIdx.x + 1] );
 
+        data_e_hb[j] += sh_hb[threadIdx.x];
+        rvec_Add( workspace->f[j], sh_atomf[threadIdx.x] );
+    }
+#endif
+}
 
 
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess( reax_atom *atoms,
+        storage p_workspace, reax_list p_bonds, int N )
+{ /* one thread per atom i < N: for every bond of i, adds the hb_f stored on that bond's sym_index entry to workspace->f[i]; atoms is not read by the active code */
+    int i, pj;
+    storage *workspace;
+    bond_data *pbond;
+    bond_data *sym_index_bond;
+    reax_list *bonds;
 
+    i = blockIdx.x * blockDim.x + threadIdx.x; /* global thread index, used as the atom index */
 
-    CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess (  reax_atom *atoms,
-            storage p_workspace,
-            reax_list p_bonds, int N)
+    if ( i >= N )
     {
-        int i, pj;
-
-        storage *workspace = &( p_workspace );
-        bond_data *pbond;
-        bond_data *sym_index_bond;
-        reax_list *bonds = &p_bonds;
-
-        i = blockIdx.x * blockDim.x + threadIdx.x;
-        if ( i >= N) return;
+        return; /* surplus threads in the last block have no atom to process */
+    }
 
-        for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){
+    workspace = &p_workspace; /* the structs are passed by value; pointers are taken for the accessors below */
+    bonds = &p_bonds;
 
-            pbond = &(bonds->select.bond_list[pj]);
-            sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] );
+    for ( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj )
+    {
+        pbond = &(bonds->select.bond_list[pj]);
+        sym_index_bond = &( bonds->select.bond_list[pbond->sym_index] ); /* list entry referenced by this bond's sym_index */
 
-            //rvec_Add (atoms[i].f, sym_index_bond->hb_f );
-            rvec_Add (workspace->f[i], sym_index_bond->hb_f );
-        }
+        //rvec_Add( atoms[i].f, sym_index_bond->hb_f );
+        rvec_Add( workspace->f[i], sym_index_bond->hb_f );
     }
+}
 
-    CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs (  reax_atom *atoms,
-            storage p_workspace,
-            reax_list p_hbonds )
-    {
 
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *atoms,
+        storage p_workspace, reax_list p_hbonds )
+{ /* one block per atom (i = blockIdx.x): threads walk the hbond-list range selected by atoms[i].Hindex, sum the hb_f of each entry's sym_index partner, reduce the partial sums, and thread 0 adds the total to workspace->f[i] */
 #if defined(__SM_35__)
-        rvec __f;
+    rvec __f; /* per-thread partial sum, combined through shfl below */
 #else
-        extern __shared__ rvec __f[];
+    extern __shared__ rvec __f[]; /* per-thread partial sums indexed by threadIdx.x; size is chosen at launch */
 #endif
-        int i, pj,j;
-        int start, end;
-
-        storage *workspace = &( p_workspace );
-        hbond_data *nbr_pj, *sym_index_nbr;
-        reax_list *hbonds = &p_hbonds;
-
-        i = blockIdx.x;
-
-        start = Dev_Start_Index (i, hbonds);
-        end = Dev_End_Index (i, hbonds);
-        pj = start + threadIdx.x;
+    int i, pj;
+    int start, end;
+    storage *workspace;
+    hbond_data *nbr_pj, *sym_index_nbr;
+    reax_list *hbonds;
+
+    i = blockIdx.x;
+    workspace = &p_workspace;
+    hbonds = &p_hbonds;
+
+    start = Dev_Start_Index( atoms[i].Hindex, hbonds ); /* the list is addressed by the atom's Hindex, not by i; NOTE(review): confirm Hindex is valid for every atom this kernel is launched on */
+    end = Dev_End_Index( atoms[i].Hindex, hbonds );
+    pj = start + threadIdx.x; /* each thread starts at its own offset and advances by blockDim.x */
 #if defined(__SM_35__)
-        rvec_MakeZero (__f);
+    rvec_MakeZero( __f );
 #else
-        rvec_MakeZero (__f[threadIdx.x]);
+    rvec_MakeZero( __f[threadIdx.x] );
 #endif
 
-        while (pj < end)
-        {
-            nbr_pj = &( hbonds->select.hbond_list[pj] );
-            j = nbr_pj->nbr;
+    while ( pj < end )
+    {
+        nbr_pj = &( hbonds->select.hbond_list[pj] );
 
-            sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]);
+        sym_index_nbr = &(hbonds->select.hbond_list[ nbr_pj->sym_index ]); /* entry referenced by this hbond's sym_index */
 
 #if defined(__SM_35__)
-            rvec_Add (__f, sym_index_nbr->hb_f );
+        rvec_Add( __f, sym_index_nbr->hb_f );
 #else
-            rvec_Add (__f[threadIdx.x], sym_index_nbr->hb_f );
+        rvec_Add( __f[threadIdx.x], sym_index_nbr->hb_f );
 #endif
 
-            pj += blockDim.x;
-        }
+        pj += blockDim.x;
+    }
+
+    __syncthreads( ); /* all partial sums are complete before the reduction starts */
 
 #if defined(__SM_35__)
-        for (int s = 16; s >= 1; s/=2){
-            __f[0] += shfl( __f[0], s);
-            __f[1] += shfl( __f[1], s);
-            __f[2] += shfl( __f[2], s);
-        }
+    for ( int s = 16; s >= 1; s /= 2 ) /* combines partial sums via shfl with offsets 16, 8, 4, 2, 1; NOTE(review): presumably relies on blockDim.x == 32 -- confirm at the launch site */
+    {
+        __f[0] += shfl( __f[0], s );
+        __f[1] += shfl( __f[1], s );
+        __f[2] += shfl( __f[2], s );
+    }
 
-        if (threadIdx.x == 0)
-            rvec_Add (workspace->f[i], __f);
+    if ( threadIdx.x == 0 )
+    {
+        rvec_Add( workspace->f[i], __f );
+    }
 #else
-        if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]);
-        if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]);
-        if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]);
-        if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]);
-        if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]);
-
-        if (threadIdx.x == 0)
-            //rvec_Add (atoms[i].f, __f[0]);
-            rvec_Add (workspace->f[i], __f[0]);
-#endif
+    if ( threadIdx.x < 16 )
+    {
+        rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 16] ); /* shared-memory tree reduction; reads up to index 31, so NOTE(review): assumes at least 32 rvec entries were allocated -- confirm */
     }
+    __syncthreads( ); /* each halving step is separated by a block-wide barrier */
 
-    CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL (  reax_atom *atoms,
-            storage p_workspace,
-            reax_list p_hbonds, int N )
+    if ( threadIdx.x < 8 )
     {
-#if defined(__SM_35__)
-        rvec __f;
-#else
-        extern __shared__ rvec __f[];
-#endif
-        int i, pj,j;
-        int start, end;
+        rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 8] );
+    }
+    __syncthreads( );
 
-        storage *workspace = &( p_workspace );
-        hbond_data *nbr_pj, *sym_index_nbr;
-        reax_list *hbonds = &p_hbonds;
+    if ( threadIdx.x < 4 )
+    {
+        rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 4] );
+    }
+    __syncthreads( );
 
-        int __THREADS_PER_ATOM__ = HB_POST_PROC_KER_THREADS_PER_ATOM;
+    if ( threadIdx.x < 2 )
+    {
+        rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 2] );
+    }
+    __syncthreads( );
 
-        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-        int warp_id = thread_id / __THREADS_PER_ATOM__;
-        int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); 
-        if (warp_id >= N ) return;
+    if ( threadIdx.x < 1 )
+    {
+        rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 1] );
+    }
+    __syncthreads( );
+
+    if ( threadIdx.x == 0 )
+    {
+        //rvec_Add( atoms[i].f, __f[0] );
+        rvec_Add( workspace->f[i], __f[0] ); /* __f[0] now holds the block's total */
+    }
+#endif
+}
 
-        i = warp_id;
 
-        start = Dev_Start_Index (i, hbonds);
-        end = Dev_End_Index (i, hbonds);
-        pj = start + lane_id;
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *atoms,
+        storage p_workspace, reax_list p_hbonds, int N )
+{ /* variant of the per-atom hbond force accumulation that assigns HB_POST_PROC_KER_THREADS_PER_ATOM consecutive threads to each atom (groups with id >= N exit); lane 0 of each group adds the reduced sum to workspace->f[i] */
+#if defined(__SM_35__)
+    rvec __f; /* per-thread partial sum, combined through shfl below */
+    int s;
+#else
+    extern __shared__ rvec __f[]; /* per-thread partial sums indexed by threadIdx.x; size is chosen at launch */
+#endif
+    int i, pj;
+    int start, end;
+    storage *workspace;
+    hbond_data *nbr_pj, *sym_index_nbr;
+    reax_list *hbonds;
+    int __THREADS_PER_ATOM__;
+    int thread_id;
+    int group_id;
+    int lane_id; 
+
+    __THREADS_PER_ATOM__ = HB_POST_PROC_KER_THREADS_PER_ATOM;
+    thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    group_id = thread_id / __THREADS_PER_ATOM__; /* index of the atom this thread helps with */
+    lane_id = thread_id & (__THREADS_PER_ATOM__ - 1); /* position inside the group; equals thread_id % __THREADS_PER_ATOM__ only when that value is a power of two */
+
+    if ( group_id >= N )
+    {
+        return; /* NOTE(review): threads leaving here skip the __syncthreads( ) calls below -- confirm this is safe for a partially used last block */
+    }
 
+    workspace = &( p_workspace );
+    hbonds = &p_hbonds;
+    i = group_id;
+    start = Dev_Start_Index( atoms[i].Hindex, hbonds ); /* the list is addressed by the atom's Hindex, not by i */
+    end = Dev_End_Index( atoms[i].Hindex, hbonds );
+    pj = start + lane_id; /* each lane starts at its own offset and advances by __THREADS_PER_ATOM__ */
 #if defined(__SM_35__)
-        rvec_MakeZero (__f);
+    rvec_MakeZero( __f );
 #else
-        rvec_MakeZero (__f[threadIdx.x]);
+    rvec_MakeZero( __f[threadIdx.x] );
 #endif
 
-        while (pj < end)
-        {
-            nbr_pj = &( hbonds->select.hbond_list[pj] );
-            j = nbr_pj->nbr;
+    while ( pj < end )
+    {
+        nbr_pj = &(hbonds->select.hbond_list[pj]);
 
-            sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]);
+        sym_index_nbr = &(hbonds->select.hbond_list[ nbr_pj->sym_index ]); /* entry referenced by this hbond's sym_index */
 #if defined(__SM_35__)
-            rvec_Add (__f, sym_index_nbr->hb_f );
+        rvec_Add( __f, sym_index_nbr->hb_f );
 #else
-            rvec_Add (__f[threadIdx.x], sym_index_nbr->hb_f );
+        rvec_Add( __f[threadIdx.x], sym_index_nbr->hb_f );
 #endif
 
-            pj += __THREADS_PER_ATOM__;
-        }
+        pj += __THREADS_PER_ATOM__;
+    }
+
+    __syncthreads( ); /* partial sums are complete before the reduction starts */
 
 #if defined(__SM_35__)
-        for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){
-            __f[0] += shfl( __f[0], s);
-            __f[1] += shfl( __f[1], s);
-            __f[2] += shfl( __f[2], s);
-        }
+    for ( s = __THREADS_PER_ATOM__ >> 1; s >= 1; s /= 2 ) /* combines the group's partial sums via shfl, halving the offset each step */
+    {
+        __f[0] += shfl( __f[0], s );
+        __f[1] += shfl( __f[1], s );
+        __f[2] += shfl( __f[2], s );
+    }
 
-        if (lane_id == 0)
-            rvec_Add (workspace->f[i], __f);
+    if ( lane_id == 0 )
+    {
+        rvec_Add( workspace->f[i], __f );
+    }
 #else
-        if (lane_id < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]);
-        if (lane_id < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]);
-        if (lane_id < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]);
-        if (lane_id < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]);
-        if (lane_id < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]);
-
-        if (lane_id == 0)
-            rvec_Add (workspace->f[i], __f[threadIdx.x]);
-#endif
+    if ( lane_id < 16 )
+    {
+        rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 16] ); /* shared-memory tree reduction with fixed offsets 16..1; NOTE(review): unlike the shfl path this does not scale with __THREADS_PER_ATOM__ and presumably assumes it is 32 -- confirm */
+    }
+    __syncthreads( ); /* each halving step is separated by a block-wide barrier */
+
+    if ( lane_id < 8 )
+    {
+        rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 8] );
+    }
+    __syncthreads( );
+
+    if ( lane_id < 4 )
+    {
+        rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 4] );
     }
+    __syncthreads( );
+
+    if ( lane_id < 2 )
+    {
+        rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 2] );
+    }
+    __syncthreads( );
+
+    if ( lane_id < 1 )
+    {
+        rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 1] );
+    }
+    __syncthreads( );
+
+    if ( lane_id == 0 )
+    {
+        rvec_Add( workspace->f[i], __f[threadIdx.x] ); /* lane 0's slot now holds the group's total */
+    }
+#endif
+}
diff --git a/PG-PuReMD/src/cuda_multi_body.h b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
similarity index 53%
rename from PG-PuReMD/src/cuda_multi_body.h
rename to PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
index 332e6f06a480b61f424ec6dbc3ad7fbeb9bba1b2..aa09d2f7f2903ce2bed367a719af4c1b4f0baa2d 100644
--- a/PG-PuReMD/src/cuda_multi_body.h
+++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
@@ -1,42 +1,48 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#ifndef __CUDA_MULTI_BODY_H_
-#define __CUDA_MULTI_BODY_H_
-
-#include "reax_types.h"
-
-CUDA_GLOBAL void Cuda_Atom_Energy(  reax_atom *,
-                                    global_parameters ,
-                                    single_body_parameters *,
-                                    two_body_parameters *,
-                                    storage ,
-                                    reax_list ,
-                                    int ,
-                                    int ,
-                                    real *,
-                                    real *,
-                                    real *
-                                 );
-
-CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess (reax_list, storage, int );
-
-#endif
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_HBONDS_H_
+#define __CUDA_HBONDS_H_
+
+#include "../reax_types.h"
+
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *,
+        single_body_parameters *, hbond_parameters *,
+        global_parameters, control_params *, storage ,
+        reax_list, reax_list, int, int, real *, rvec *, int, int );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *,
+        single_body_parameters *, hbond_parameters *,
+        global_parameters , control_params *, storage,
+        reax_list, reax_list, int, int, real *, rvec * );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess( reax_atom *,
+        storage, reax_list, int );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *,
+        storage, reax_list );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *,
+        storage, reax_list, int );
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b115f2870de362525282cbaea220c762b706baf1
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_init_md.cu
@@ -0,0 +1,380 @@
+
+#include "cuda_init_md.h"
+
+#include "cuda_allocate.h"
+#include "cuda_list.h"
+#include "cuda_copy.h"
+#include "cuda_forces.h"
+#include "cuda_integrate.h"
+#include "cuda_neighbors.h"
+#include "cuda_reset_tools.h"
+#include "cuda_system_props.h"
+#include "cuda_utils.h"
+#include "cuda_validation.h"
+
+#if defined(PURE_REAX)
+  #include "../box.h"
+  #include "../comm_tools.h"
+  #include "../grid.h"
+  #include "../init_md.h"
+  #include "../io_tools.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+  #include "../lookup.h"
+#ifdef __cplusplus
+}
+#endif
+  #include "../random.h"
+  #include "../reset_tools.h"
+  #include "../tool_box.h"
+  #include "../vector.h"
+#elif defined(LAMMPS_REAX)
+  #include "../reax_box.h"
+  #include "../reax_comm_tools.h"
+  #include "../reax_grid.h"
+  #include "../reax_init_md.h"
+  #include "../reax_io_tools.h"
+  #include "../reax_list.h"
+  #include "../reax_lookup.h"
+  #include "../reax_random.h"
+  #include "../reax_reset_tools.h"
+  #include "../reax_tool_box.h"
+  #include "../reax_vector.h"
+#endif
+
+
+void Cuda_Init_ScratchArea( )
+{ /* allocates the two global scratch buffers: scratch (size DEVICE_SCRATCH_SIZE, via cuda_malloc) and host_scratch (size HOST_SCRATCH_SIZE, via smalloc) */
+    cuda_malloc( (void **)&scratch, DEVICE_SCRATCH_SIZE, TRUE, "device:scratch" ); /* NOTE(review): TRUE presumably requests zero-initialisation -- confirm against cuda_malloc */
+
+    host_scratch = (void *) smalloc( HOST_SCRATCH_SIZE, "host:scratch" );
+}
+
+
+int Cuda_Init_System( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        mpi_datatypes *mpi_data, char *msg )
+{ /* sets up the grid, bins and reorders local atoms, exchanges boundary-atom estimates, allocates and syncs the device copy of the system, then computes total mass, center of mass, initial velocities and kinetic energy; msg is referenced only by the commented-out code below */
+    int i;
+    int nrecv[MAX_NBRS];
+
+    Setup_New_Grid( system, control, MPI_COMM_WORLD );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d GRID:\n", system->my_rank );
+    Print_Grid( &(system->my_grid), stderr );
+#endif
+
+    Bin_My_Atoms( system, &(workspace->realloc) );
+    Reorder_My_Atoms( system, workspace );
+
+    /* estimate N and total capacity */
+    for ( i = 0; i < MAX_NBRS; ++i )
+    {
+        nrecv[i] = 0;
+    }
+
+    MPI_Barrier( MPI_COMM_WORLD );
+    system->max_recved = 0;
+    system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv,
+            Estimate_Boundary_Atoms, Unpack_Estimate_Message, TRUE );
+    system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP ); /* N scaled by SAFE_ZONE, but never below MIN_CAP */
+    Bin_Boundary_Atoms( system );
+
+    /* Sync atoms here to continue the computation */
+    dev_alloc_system( system );
+    Sync_System( system );
+
+    /* estimate numH and Hcap */
+    Cuda_Reset_Atoms( system, control );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d: n=%d local_cap=%d\n",
+             system->my_rank, system->n, system->local_cap );
+    fprintf( stderr, "p%d: N=%d total_cap=%d\n",
+             system->my_rank, system->N, system->total_cap );
+    fprintf( stderr, "p%d: numH=%d H_cap=%d\n",
+             system->my_rank, system->numH, system->Hcap );
+#endif
+
+    Cuda_Compute_Total_Mass( system, data, mpi_data->comm_mesh3D );
+
+    Cuda_Compute_Center_of_Mass( system, data, mpi_data, mpi_data->comm_mesh3D );
+
+//    if( Reposition_Atoms( system, control, data, mpi_data, msg ) == FAILURE )
+//    {
+//        return FAILURE;
+//    }
+
+    /* initialize velocities so that desired init T can be attained */
+    if ( !control->restart || (control->restart && control->random_vel) ) /* fresh runs always; restarts only when random_vel is set */
+    {
+        Cuda_Generate_Initial_Velocities( system, control->T_init );
+    }
+
+    Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+
+    return SUCCESS; /* the only FAILURE path is commented out above, so this function currently always succeeds */
+}
+
+
+void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
+        simulation_data *data, char *msg )
+{ /* allocates the device simulation data, resets data, configures the integrator for control->ensemble, and records the start time on the master rank; msg is not referenced in this body */
+    dev_alloc_simulation_data( data );
+
+    Reset_Simulation_Data( data );
+
+    if ( !control->restart )
+    {
+        data->step = data->prev_steps = 0;
+    }
+
+    switch ( control->ensemble ) /* every recognised ensemble sets data->N_f, the Cuda_Evolve function pointer and control->virial (0 for NVE/NVT, 1 for the NPT variants) */
+    {
+    case NVE:
+        data->N_f = 3 * system->bigN;
+        Cuda_Evolve = Cuda_Velocity_Verlet_NVE;
+        control->virial = 0;
+        break;
+
+    case bNVT:
+        data->N_f = 3 * system->bigN + 1;
+        Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NVT;
+        control->virial = 0;
+        break;
+
+    case nhNVT:
+        fprintf( stderr, "[WARNING] Nose-Hoover NVT is still under testing.\n" );
+        data->N_f = 3 * system->bigN + 1;
+        Cuda_Evolve = Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein;
+        control->virial = 0;
+        if ( !control->restart || (control->restart && control->random_vel) ) /* thermostat state is (re)initialised unless a restart keeps its velocities */
+        {
+            data->therm.G_xi = control->Tau_T *
+                               (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T );
+            data->therm.v_xi = data->therm.G_xi * control->dt;
+            data->therm.v_xi_old = 0;
+            data->therm.xi = 0;
+        }
+        break;
+
+    case sNPT: /* Semi-Isotropic NPT */
+        data->N_f = 3 * system->bigN + 4;
+        Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NPT;
+        control->virial = 1;
+        if ( !control->restart )
+        {
+            Reset_Pressures( data );
+        }
+        break;
+
+    case iNPT: /* Isotropic NPT */
+        data->N_f = 3 * system->bigN + 2;
+        Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NPT;
+        control->virial = 1;
+        if ( !control->restart )
+        {
+            Reset_Pressures( data );
+        }
+        break;
+
+    case NPT: /* Anisotropic NPT */
+        data->N_f = 3 * system->bigN + 9;
+        Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NPT;
+        control->virial = 1;
+
+        fprintf( stderr, "p%d: init_simulation_data: option not yet implemented\n",
+              system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD,  INVALID_INPUT ); /* this ensemble always aborts, despite the assignments above */
+        break;
+
+    default:
+        fprintf( stderr, "p%d: init_simulation_data: ensemble not recognized\n",
+              system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD,  INVALID_INPUT );
+    }
+
+    /* initialize the timer(s) */
+    MPI_Barrier( MPI_COMM_WORLD );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        data->timing.start = Get_Time( ); /* only the master rank records the start time */
+
+#if defined(LOG_PERFORMANCE)
+        Reset_Timing( &data->timing );
+#endif
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "data->N_f: %8.3f\n", data->N_f );
+#endif
+}
+
+
+void Cuda_Init_Workspace( reax_system *system, control_params *control,
+        storage *workspace, char *msg )
+{ /* allocates the global dev_workspace, zeroes workspace->realloc, resets the workspace, and initialises the taper data in dev_workspace */
+    dev_alloc_workspace( system, control, dev_workspace,
+            system->local_cap, system->total_cap, msg );
+
+    memset( &(workspace->realloc), 0, sizeof(reallocate_data) ); /* acts on the workspace argument, not on the global dev_workspace */
+    Cuda_Reset_Workspace( system, workspace );
+
+    /* Initialize the Taper function */
+    Init_Taper( control, dev_workspace );
+}
+
+
+void Cuda_Init_Lists( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        mpi_datatypes *mpi_data, char *msg )
+{ /* builds the lists held in the global dev_lists (far neighbors, hbonds when enabled, bonds) and the sparse matrix dev_workspace->H; the lists, mpi_data and msg parameters are not referenced in this body */
+    Cuda_Estimate_Neighbors( system );
+
+    Dev_Make_List( system->total_cap, system->total_far_nbrs,
+            TYP_FAR_NEIGHBOR, *dev_lists + FAR_NBRS );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d: allocated far_nbrs: num_far=%d, space=%dMB\n",
+            system->my_rank, system->total_far_nbrs,
+            (int)(system->total_far_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) );
+    fprintf( stderr, "N: %d and total_cap: %d \n", system->N, system->total_cap );
+#endif
+
+    Cuda_Init_Neighbor_Indices( system );
+
+    Cuda_Generate_Neighbor_Lists( system, data, workspace, dev_lists );
+
+    /* estimate storage for bonds, hbonds, and sparse matrix */
+    Cuda_Estimate_Storages( system, control, dev_lists,
+            TRUE, TRUE, TRUE, data->step );
+
+    dev_alloc_matrix( &(dev_workspace->H), system->total_cap, system->total_cm_entries );
+    Cuda_Init_Sparse_Matrix_Indices( system, &(dev_workspace->H) );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p:%d - allocated H matrix: max_entries: %d, space=%dMB\n",
+            system->my_rank, system->total_cm_entries,
+            (int)(system->total_cm_entries * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
+#endif
+
+    if ( control->hbond_cut > 0.0 &&  system->numH > 0 ) /* hbonds list is created only when a cutoff is set and hydrogens are present */
+    {
+        Dev_Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *dev_lists + HBONDS );
+        Cuda_Init_HBond_Indices( system );
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d: allocated hbonds: total_hbonds=%d, space=%dMB\n",
+                system->my_rank, system->total_hbonds,
+                (int)(system->total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
+#endif
+    }
+
+    /* bonds list */
+    Dev_Make_List( system->total_cap, system->total_bonds, TYP_BOND, *dev_lists + BONDS );
+    Cuda_Init_Bond_Indices( system );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d: allocated bonds: total_bonds=%d, space=%dMB\n",
+            system->my_rank, system->total_bonds, /* was a bare, undeclared total_bonds */
+            (int)(system->total_bonds * sizeof(bond_data) / (1024 * 1024)) );
+#endif
+
+    /* 3bodies list: since a more accurate estimate of the number of
+     * three body interactions requires that bond orders have
+     * been computed, delay estimation until bond orders are available */
+}
+
+
+void Cuda_Initialize( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
+{ /* top-level device initialisation: runs the labelled steps below in order and calls MPI_Abort with CANNOT_INITIALIZE if the MPI datatypes, system, output files or (when control->tabulate is set) lookup tables cannot be set up */
+    char msg[MAX_STR]; /* error text filled in by the init routines and printed on failure */
+
+    /* HOST/DEVICE SCRATCH */
+    Cuda_Init_ScratchArea( );
+
+    /* MPI_DATATYPES */
+    if ( Init_MPI_Datatypes( system, workspace, mpi_data, msg ) == FAILURE )
+    {
+        fprintf( stderr, "p%d: init_mpi_datatypes: could not create datatypes\n",
+                 system->my_rank );
+        fprintf( stderr, "p%d: mpi_data couldn't be initialized! terminating.\n",
+                 system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+    }
+
+    /* SYSTEM */
+    if ( Cuda_Init_System( system, control, data, workspace, mpi_data, msg ) == FAILURE )
+    {
+        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+        fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
+                 system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+    }
+
+    /* GRID */
+    dev_alloc_grid( system );
+    Sync_Grid( &system->my_grid, &system->d_my_grid );
+
+    //validate_grid( system );
+
+    /* SIMULATION_DATA */
+    Cuda_Init_Simulation_Data( system, control, data, msg );
+
+    /* WORKSPACE */
+    Cuda_Init_Workspace( system, control, workspace, msg );
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: initialized workspace\n", system->my_rank );
+#endif
+
+    /* CONTROL */
+    dev_alloc_control( control );
+
+    /* LISTS */
+    Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg );
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: initialized lists\n", system->my_rank );
+#endif
+
+    /* OUTPUT Files */
+    if ( Init_Output_Files( system, control, out_control, mpi_data, msg ) == FAILURE )
+    {
+        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+        fprintf( stderr, "p%d: could not open output files! terminating...\n",
+                 system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: output files opened\n", system->my_rank );
+#endif
+
+    /* Lookup Tables */
+    if ( control->tabulate ) /* tables are built only when tabulation is enabled */
+    {
+        if ( Init_Lookup_Tables( system, control, dev_workspace->Tap, mpi_data, msg ) == FAILURE )
+        {
+            fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+            fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n",
+                     system->my_rank );
+            MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+        }
+
+#if defined(DEBUG)
+        fprintf( stderr, "p%d: initialized lookup tables\n", system->my_rank );
+#endif
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: Device Initialization Done \n", system->my_rank );
+#endif
+}
+
+
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.h b/PG-PuReMD/src/cuda/cuda_init_md.h
new file mode 100644
index 0000000000000000000000000000000000000000..328674a5961fdc905b1b3f57ea085ab1d60e17af
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_init_md.h
@@ -0,0 +1,22 @@
+
+#ifndef __CUDA_INIT_MD_H__
+#define __CUDA_INIT_MD_H__
+
+#include "../reax_types.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Cuda_Init_ScratchArea( ); /* NOTE(review): defined elsewhere; purpose inferred from the name only -- confirm */
+/* Setup entry point; the visible tail of its definition initializes simulation data, workspace, control, lists, output files and lookup tables */
+void Cuda_Initialize( reax_system*, control_params*, simulation_data*,
+        storage*, reax_list**, output_controls*, mpi_datatypes* );
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e1de784a5f48ec52f92b066776421241e49c0005
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_integrate.cu
@@ -0,0 +1,755 @@
+
+#include "cuda_integrate.h"
+
+#include "cuda_allocate.h"
+#include "cuda_box.h"
+#include "cuda_forces.h"
+#include "cuda_integrate.h"
+#include "cuda_copy.h"
+#include "cuda_neighbors.h"
+#include "cuda_reduction.h"
+#include "cuda_reset_tools.h"
+#include "cuda_system_props.h"
+#include "cuda_utils.h"
+
+#include "../comm_tools.h"
+#include "../grid.h"
+#include "../vector.h"
+
+
+/* Velocity Verlet, first part; one thread per atom. Advances the position to
+ * x(t + dt) and the velocity to v(t + dt/2) from the current velocity and
+ * force. sbp is indexed by atom type to obtain the mass; threads whose index
+ * is >= n do nothing. */
+CUDA_GLOBAL void k_update_velocity_1( reax_atom *my_atoms,
+        single_body_parameters *sbp, real dt, int n )
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    reax_atom *a;
+    real mass_inv;
+    rvec shift;
+
+    if ( idx < n )
+    {
+        a = &(my_atoms[idx]);
+        mass_inv = 1.0 / sbp[a->type].mass;
+
+        /* Compute x(t + dt): displacement goes into shift, then x += shift */
+        rvec_ScaledSum( shift, dt, a->v, 0.5 * -F_CONV * mass_inv * SQR(dt), a->f );
+        rvec_Add( a->x, shift );
+        /* Compute v(t + dt/2) with the same force */
+        rvec_ScaledAdd( a->v, 0.5 * -F_CONV * mass_inv * dt, a->f );
+    }
+}
+
+
+/* Launch k_update_velocity_1 with one thread for each of the system->n atoms */
+void update_velocity_part1( reax_system *system, real dt )
+{
+    /* enough DEF_BLOCK_SIZE-thread blocks to cover n, rounding up */
+    int blocks = system->n / DEF_BLOCK_SIZE
+        + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
+    cudaDeviceSynchronize( ); /* replaces the deprecated cudaThreadSynchronize */
+    cudaCheckError( );
+}
+
+
+/* Velocity Verlet, second part; one thread per atom. Advances the velocity
+ * from v(t + dt/2) to v(t + dt) using the force currently stored in atom->f.
+ * Threads whose index is >= n do nothing. */
+CUDA_GLOBAL void k_update_velocity_2( reax_atom *my_atoms,
+        single_body_parameters *sbp, real dt, int n )
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    reax_atom *a;
+    real mass_inv;
+
+    if ( idx < n )
+    {
+        a = &(my_atoms[idx]);
+        /* the mass is looked up through the atom's type */
+        mass_inv = 1.0 / sbp[a->type].mass;
+
+        /* Compute v(t + dt) */
+        rvec_ScaledAdd( a->v, 0.5 * dt * -F_CONV * mass_inv, a->f );
+    }
+}
+
+
+/* Launch k_update_velocity_2 with one thread for each of the system->n atoms */
+void update_velocity_part2( reax_system *system, real dt )
+{
+    /* enough DEF_BLOCK_SIZE-thread blocks to cover n, rounding up */
+    int blocks = system->n / DEF_BLOCK_SIZE
+        + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
+    cudaDeviceSynchronize( ); /* replaces the deprecated cudaThreadSynchronize */
+    cudaCheckError( );
+}
+
+
+/* Nose-Hoover NVT, first part; one thread per atom. Advances the position by
+ * the velocity Verlet displacement and keeps a copy of the current force in
+ * atom->f_old; the velocity itself is not changed here. Threads >= n idle. */
+CUDA_GLOBAL void k_nhNVT_update_velocity_1( reax_atom *my_atoms,
+        single_body_parameters *sbp, real dt, int n )
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    reax_atom *a;
+    real mass_inv;
+    rvec shift;
+
+    if ( idx < n )
+    {
+        a = &(my_atoms[idx]);
+        mass_inv = 1.0 / sbp[a->type].mass;
+        /* velocity verlet, 1st part: displacement into shift, then x += shift */
+        rvec_ScaledSum( shift, dt, a->v, 0.5 * -F_CONV * mass_inv * SQR(dt), a->f );
+        rvec_Add( a->x, shift );
+        /* f_old is read again by k_nhNVT_update_velocity_2 */
+        rvec_Copy( a->f_old, a->f );
+    }
+}
+
+
+/* Launch k_nhNVT_update_velocity_1 with one thread for each of the system->n atoms */
+void nhNVT_update_velocity_part1( reax_system *system, real dt )
+{
+    /* enough DEF_BLOCK_SIZE-thread blocks to cover n, rounding up */
+    int blocks = system->n / DEF_BLOCK_SIZE
+        + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_nhNVT_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
+    cudaDeviceSynchronize( ); /* replaces the deprecated cudaThreadSynchronize */
+    cudaCheckError( );
+}
+
+
+/* Nose-Hoover NVT, second part; one thread per atom. Fills v_const[i] with
+ * the per-atom term that stays fixed while the thermostat velocity is
+ * iterated: the velocity scaled by (1 - dt/2 * v_xi) plus the contributions
+ * of the old and the current force. Threads whose index is >= n do nothing. */
+CUDA_GLOBAL void k_nhNVT_update_velocity_2( reax_atom *my_atoms, rvec * v_const,
+        single_body_parameters *sbp, real dt, real v_xi, int n )
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    reax_atom *a;
+    real mass_inv;
+
+    if ( idx < n )
+    {
+        a = &(my_atoms[idx]);
+        mass_inv = 1.0 / sbp[a->type].mass;
+
+        /* atom->v is only read here; the result is kept in v_const[idx] */
+        rvec_Scale( v_const[idx], 1.0 - 0.5 * dt * v_xi, a->v );
+        rvec_ScaledAdd( v_const[idx], 0.5 * dt * mass_inv * -F_CONV, a->f_old );
+        rvec_ScaledAdd( v_const[idx], 0.5 * dt * mass_inv * -F_CONV, a->f );
+    }
+}
+
+
+/* Launch k_nhNVT_update_velocity_2; the results land in workspace->v_const */
+void nhNVT_update_velocity_part2( reax_system *system, storage *workspace, real dt, real v_xi )
+{
+    /* enough DEF_BLOCK_SIZE-thread blocks to cover n, rounding up */
+    int blocks = system->n / DEF_BLOCK_SIZE
+        + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_nhNVT_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, workspace->v_const, system->reax_param.d_sbp, dt, v_xi, system->n );
+    cudaDeviceSynchronize( ); /* replaces the deprecated cudaThreadSynchronize */
+    cudaCheckError( );
+}
+
+
+/* Nose-Hoover NVT, third part; one thread per atom. Sets the velocity to
+ * v_const[i] / (1 + dt/2 * v_xi_old) and stores the atom's kinetic energy,
+ * 0.5 * mass * (v . v), in my_ekin[i]. Threads whose index is >= n do nothing. */
+CUDA_GLOBAL void k_nhNVT_update_velocity_3( reax_atom *my_atoms, rvec *v_const,
+        single_body_parameters *sbp, real dt, real v_xi_old, real * my_ekin, int n )
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    reax_atom *a;
+    real scale;
+
+    if ( idx < n )
+    {
+        a = &(my_atoms[idx]);
+        scale = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
+        /* the energy below is taken from the velocity written on this line */
+        rvec_Scale( a->v, scale, v_const[idx] );
+        my_ekin[idx] = (0.5 * sbp[a->type].mass * rvec_Dot(a->v, a->v));
+    }
+}
+
+
+/* Launch k_nhNVT_update_velocity_3, sum the per-atom kinetic energies with
+ * Cuda_Reduction_Sum and return the total. The total is a real: it used to be
+ * fetched and returned through an int, which copied only sizeof(int) bytes. */
+real nhNVT_update_velocity_part3( reax_system *system, storage *workspace,
+       real dt, real v_xi_old, real * d_my_ekin, real * d_total_my_ekin )
+{
+    real my_ekin;
+    int blocks = system->n / DEF_BLOCK_SIZE
+        + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_nhNVT_update_velocity_3 <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, workspace->v_const, system->reax_param.d_sbp, dt, v_xi_old, d_my_ekin, system->n );
+    cudaDeviceSynchronize( ); /* replaces the deprecated cudaThreadSynchronize */
+    cudaCheckError( );
+
+    Cuda_Reduction_Sum( d_my_ekin, d_total_my_ekin, system->n );
+    copy_host_device( &my_ekin, d_total_my_ekin, sizeof(real),
+            cudaMemcpyDeviceToHost, "nhNVT_update_velocity_part3::d_total_my_ekin" );
+    return my_ekin;
+}
+
+
+/* Berendsen NVT rescaling; one thread per atom. Multiplies the velocity by
+ * lambda; positions are not modified. Threads whose index is >= n do nothing. */
+CUDA_GLOBAL void k_bNVT_scale_velocities( reax_atom *my_atoms, real lambda, int n )
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    reax_atom *a;
+
+    if ( idx < n )
+    {
+        a = &(my_atoms[idx]);
+        /* in place: the velocity is both source and destination */
+        rvec_Scale( a->v, lambda, a->v );
+    }
+}
+
+
+/* Launch k_bNVT_scale_velocities with one thread for each of the system->n atoms */
+void bNVT_scale_velocities( reax_system *system, real lambda )
+{
+    /* enough DEF_BLOCK_SIZE-thread blocks to cover n, rounding up */
+    int blocks = system->n / DEF_BLOCK_SIZE
+        + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_bNVT_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, lambda, system->n );
+    cudaDeviceSynchronize( ); /* replaces the deprecated cudaThreadSynchronize */
+    cudaCheckError( );
+}
+
+
+/* Berendsen NPT rescaling; one thread per atom. Scales the velocity by lambda
+ * and the position component-wise by (mu0, mu1, mu2). The three position
+ * factors are passed as separate scalars rather than as an rvec.
+ * Threads whose index is >= n do nothing. */
+CUDA_GLOBAL void k_bNVP_scale_velocities( reax_atom *my_atoms, real lambda,
+        real mu0, real mu1, real mu2, int n )
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    reax_atom *a;
+
+    if ( idx < n )
+    {
+        a = &(my_atoms[idx]);
+
+        rvec_Scale( a->v, lambda, a->v );
+        /* written out per component in place of an rvec multiply helper */
+        a->x[0] = mu0 * a->x[0];
+        a->x[1] = mu1 * a->x[1];
+        a->x[2] = mu2 * a->x[2];
+    }
+}
+
+
+/* Launch k_bNVP_scale_velocities; mu is unpacked into three scalar kernel arguments */
+void bNVP_scale_velocities( reax_system *system, real lambda, rvec mu )
+{
+    /* enough DEF_BLOCK_SIZE-thread blocks to cover n, rounding up */
+    int blocks = system->n / DEF_BLOCK_SIZE
+        + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_bNVP_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, lambda, mu[0], mu[1], mu[2], system->n );
+    cudaDeviceSynchronize( ); /* replaces the deprecated cudaThreadSynchronize */
+    cudaCheckError( );
+}
+
+/* One velocity Verlet NVE step on the device; returns SUCCESS or the failing status of neighbor-list generation / force computation. */
+int Cuda_Velocity_Verlet_NVE( reax_system* system, control_params* control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
+{
+    int steps, renbr, ret;
+    static int verlet_part1_done = FALSE, far_nbrs_done = FALSE; /* kept across calls; cleared only after a fully successful step */
+    real dt;
+#if defined(DEBUG)
+    real t_over_start, t_over_elapsed;
+#endif
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step %d\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    dt = control->dt;
+    steps = data->step - data->prev_steps;
+    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+    ret = SUCCESS;
+    /* phase 1, skipped when an earlier call already completed it: first half-step, then atom exchange and host/device sync */
+    if ( verlet_part1_done == FALSE )
+    {
+        update_velocity_part1( system, dt );
+
+        verlet_part1_done = TRUE;
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        if ( renbr )
+        {
+            Update_Grid( system, control, mpi_data->world );
+        }
+
+        Output_Sync_Atoms( system );
+        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+        Sync_Atoms( system );
+
+        /* sync grid to device */
+        Sync_Grid( &system->my_grid, &system->d_my_grid );
+
+        init_blocks( system );
+    }
+
+    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+
+    Cuda_Reset( system, control, data, workspace, lists );
+    /* phase 2: neighbor lists are regenerated only on reneighboring steps and only if not already done for this step */
+    if ( renbr && far_nbrs_done == FALSE )
+    {
+#if defined(DEBUG)
+        t_over_start  = Get_Time( );
+#endif
+
+        ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+        /* NOTE(review): on failure the estimate is refreshed, presumably so Cuda_ReAllocate can resize on the next call -- confirm */
+        if ( ret != SUCCESS )
+        {
+            Cuda_Estimate_Neighbors( system );
+        }
+        if ( ret == SUCCESS )
+        {
+            far_nbrs_done = TRUE;
+        }
+    
+#if defined(DEBUG)
+        t_over_elapsed = Get_Timing_Info( t_over_start );
+        fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
+                system->my_rank, data->step, t_over_elapsed );
+#endif
+    }
+    /* phase 3: forces are computed only if everything so far succeeded; a failure status is returned unchanged */
+    if ( ret == SUCCESS )
+    {
+        ret = Cuda_Compute_Forces( system, control, data, workspace,
+                lists, out_control, mpi_data );
+    }
+
+    if ( ret == SUCCESS )
+    {
+        update_velocity_part2( system, dt );
+
+        verlet_part1_done = FALSE;
+        far_nbrs_done = FALSE;
+    }
+    
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    return ret;
+}
+
+
+/* One velocity Verlet step with a Nose-Hoover thermostat (NVT, Klein variant)
+ * on the device. Returns SUCCESS, or the failing status of neighbor-list
+ * generation or force computation. The static flags are cleared only after a
+ * fully successful step, so a repeated call skips the phases already done. */
+int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
+        control_params* control, simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control, mpi_datatypes *mpi_data )
+{
+    int itr, steps, renbr, ret;
+    real *d_my_ekin, *d_total_my_ekin;
+    static int verlet_part1_done = FALSE, far_nbrs_done = FALSE;
+    real dt, dt_sqr;
+    real my_ekin, new_ekin;
+    real G_xi_new, v_xi_new, v_xi_old;
+    thermostat *therm;
+#if defined(DEBUG)
+    /* used by the timing code below; they were undeclared, so DEBUG builds failed */
+    real t_over_start, t_over_elapsed;
+#endif
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    dt = control->dt;
+    dt_sqr = SQR(dt);
+    therm = &( data->therm );
+    steps = data->step - data->prev_steps;
+    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+    /* must be set here: the only other assignment before the first read is inside the reneighbor branch */
+    ret = SUCCESS;
+
+    if ( verlet_part1_done == FALSE )
+    {
+        nhNVT_update_velocity_part1( system, dt );
+
+        /* Compute xi(t + dt) */
+        therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
+        verlet_part1_done = TRUE;
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        if ( renbr )
+        {
+            Update_Grid( system, control, mpi_data->world );
+        }
+
+        Output_Sync_Atoms( system );
+        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+        Sync_Atoms( system );
+        /* sync grid to device */
+        Sync_Grid( &system->my_grid, &system->d_my_grid );
+
+        init_blocks( system );
+    }
+
+    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+    Cuda_Reset( system, control, data, workspace, lists );
+
+    if ( renbr && far_nbrs_done == FALSE )
+    {
+#if defined(DEBUG)
+        t_over_start  = Get_Time( );
+#endif
+        ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+        if ( ret != SUCCESS )
+        {
+            Cuda_Estimate_Neighbors( system );
+        }
+        if ( ret == SUCCESS )
+        {
+            far_nbrs_done = TRUE;
+        }
+#if defined(DEBUG)
+        t_over_elapsed = Get_Timing_Info( t_over_start );
+        fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
+                system->my_rank, data->step, t_over_elapsed );
+#endif
+    }
+
+    if ( ret == SUCCESS )
+    {
+        ret = Cuda_Compute_Forces( system, control, data, workspace,
+                lists, out_control, mpi_data );
+    }
+
+    if ( ret == SUCCESS )
+    {
+        /* Compute iteration constants for each atom's velocity */
+        nhNVT_update_velocity_part2( system, dev_workspace, dt, therm->v_xi );
+        v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
+        my_ekin = G_xi_new = v_xi_old = 0;
+        itr = 0;
+
+        cuda_malloc( (void **) &d_my_ekin, sizeof(real) * system->n, FALSE,
+                "Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein::d_my_ekin" );
+        cuda_malloc( (void **) &d_total_my_ekin, sizeof(real), FALSE,
+                "Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein::d_total_my_ekin" );
+
+        /* repeat until v_xi changes by at most 1e-5 between iterations */
+        do
+        {
+            itr++;
+            /* new values become old in this iteration */
+            v_xi_old = v_xi_new;
+            my_ekin = nhNVT_update_velocity_part3( system, dev_workspace, dt, v_xi_old,
+                    d_my_ekin, d_total_my_ekin );
+            MPI_Allreduce( &my_ekin, &new_ekin, 1, MPI_DOUBLE, MPI_SUM,
+                    mpi_data->comm_mesh3D  );
+            G_xi_new = control->Tau_T * ( 2.0 * new_ekin - data->N_f * K_B * control->T );
+            v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
+        }
+        while ( FABS(v_xi_new - v_xi_old) > 1e-5 );
+        therm->v_xi_old = therm->v_xi;
+        therm->v_xi = v_xi_new;
+        therm->G_xi = G_xi_new;
+
+        cuda_free( d_total_my_ekin,
+                "Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein::d_total_my_ekin" );
+        cuda_free( d_my_ekin,
+                "Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein::d_my_ekin" );
+
+        verlet_part1_done = FALSE;
+        far_nbrs_done = FALSE;
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    return ret;
+}
+
+
+/* One velocity Verlet step with Berendsen-type temperature coupling (NVT):
+   after the step, velocities are rescaled by sqrt(lambda); positions and the
+   box are not scaled. Returns SUCCESS or the failing status of an earlier phase. */
+int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
+{
+    int steps, renbr, ret;
+    static int verlet_part1_done = FALSE, far_nbrs_done = FALSE; /* kept across calls; cleared only after a fully successful step */
+    real dt, lambda;
+#if defined(DEBUG)
+    real t_over_start, t_over_elapsed;
+#endif
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    dt = control->dt;
+    steps = data->step - data->prev_steps;
+    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+    ret = SUCCESS;
+
+    if ( verlet_part1_done == FALSE )
+    {
+        /* velocity verlet, 1st part */
+        update_velocity_part1( system, dt );
+
+        verlet_part1_done = TRUE;
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+
+        if ( renbr )
+        {
+            Update_Grid( system, control, mpi_data->world );
+        }
+
+        Output_Sync_Atoms( system );
+        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+        Sync_Atoms( system );
+
+        /* sync grid to device */
+        Sync_Grid( &system->my_grid, &system->d_my_grid );
+
+        init_blocks( system );
+    
+        Cuda_Reset( system, control, data, workspace, lists );
+    }
+    else
+    {
+        Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+    
+        Cuda_Reset( system, control, data, workspace, lists );
+    }
+    /* neighbor lists are regenerated only on reneighboring steps and only if not already done for this step */
+    if ( renbr && far_nbrs_done == FALSE )
+    {
+#if defined(DEBUG)
+        t_over_start  = Get_Time( );
+#endif
+
+        ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+
+        if ( ret != SUCCESS )
+        {
+            Cuda_Estimate_Neighbors( system );
+        }
+        if ( ret == SUCCESS )
+        {
+            far_nbrs_done = TRUE;
+        }
+        
+#if defined(DEBUG)
+        t_over_elapsed  = Get_Timing_Info( t_over_start );
+        fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
+                system->my_rank, data->step, t_over_elapsed );
+#endif
+    }
+    /* forces are computed only if everything so far succeeded; a failure status is returned unchanged */
+    if ( ret == SUCCESS )
+    {
+        ret = Cuda_Compute_Forces( system, control, data, workspace,
+                lists, out_control, mpi_data );
+    }
+
+    if ( ret == SUCCESS )
+    {
+        /* velocity verlet, 2nd part */
+        update_velocity_part2( system, dt );
+
+#if defined(DEBUG_FOCUS)
+        fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step);
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        /* temperature scaler */
+        Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+        /* lambda is clamped to [MIN_dT, MAX_dT] before the square root is taken */
+        lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
+        if ( lambda < MIN_dT )
+        {
+            lambda = MIN_dT;
+        }
+        else if (lambda > MAX_dT )
+        {
+            lambda = MAX_dT;
+        }
+        lambda = SQRT( lambda );
+
+        /* Scale velocities at t+dt (the kernel does not touch positions) */
+        bNVT_scale_velocities( system, lambda );
+        /* second kinetic energy call, made after the rescaling */
+        Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: scaled velocities\n",
+                 system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        verlet_part1_done = FALSE;
+        far_nbrs_done = FALSE;
+    }
+
+    return ret;
+}
+
+
+/* One velocity Verlet step using Berendsen-type coupling for both T and P.
+ * All box dimensions are scaled by the same amount, there is no change in the
+ * angles between axes. Returns SUCCESS or the failing status of an earlier phase. */
+int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
+{
+    int steps, renbr, ret;
+    static int verlet_part1_done = FALSE, far_nbrs_done = FALSE; /* kept across calls; cleared only after a fully successful step */
+    real dt;
+#if defined(DEBUG)
+    real t_over_start, t_over_elapsed;
+#endif
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step %d\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    dt = control->dt;
+    steps = data->step - data->prev_steps;
+    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+    ret = SUCCESS;
+    /* phase 1, skipped when an earlier call already completed it: first half-step, then atom exchange and host/device sync */
+    if ( verlet_part1_done == FALSE )
+    {
+        update_velocity_part1( system, dt );
+
+        verlet_part1_done = TRUE;
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        if ( renbr )
+        {
+            Update_Grid( system, control, mpi_data->world );
+        }
+
+        Output_Sync_Atoms( system );
+        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+        Sync_Atoms( system );
+
+        /* sync grid to device */
+        Sync_Grid( &system->my_grid, &system->d_my_grid );
+
+        init_blocks( system );
+    }
+
+    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+
+    Cuda_Reset( system, control, data, workspace, lists );
+    /* phase 2: neighbor lists are regenerated only on reneighboring steps and only if not already done for this step */
+    if ( renbr && far_nbrs_done == FALSE )
+    {
+#if defined(DEBUG)
+        t_over_start  = Get_Time( );
+#endif
+
+        ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+
+        if ( ret != SUCCESS )
+        {
+            Cuda_Estimate_Neighbors( system );
+        }
+        if ( ret == SUCCESS )
+        {
+            far_nbrs_done = TRUE;
+        }
+    
+#if defined(DEBUG)
+        t_over_elapsed = Get_Timing_Info( t_over_start );
+        fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
+                system->my_rank, data->step, t_over_elapsed );
+#endif
+    }
+    /* phase 3: forces are computed only if everything so far succeeded; a failure status is returned unchanged */
+    if ( ret == SUCCESS )
+    {
+        ret = Cuda_Compute_Forces( system, control, data, workspace,
+                lists, out_control, mpi_data );
+    }
+
+    if ( ret == SUCCESS )
+    {
+        update_velocity_part2( system, dt );
+        /* kinetic energy, then pressure, then box scaling, in this order */
+        Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+        Cuda_Compute_Pressure( system, control, data, mpi_data );
+        Cuda_Scale_Box( system, control, data, mpi_data );
+
+        verlet_part1_done = FALSE;
+        far_nbrs_done = FALSE;
+    }
+    
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    return ret;
+}
diff --git a/PG-PuReMD/src/cuda/cuda_integrate.h b/PG-PuReMD/src/cuda/cuda_integrate.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d893419abd1c0c4397641cbad9fd0f42f888696
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_integrate.h
@@ -0,0 +1,55 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_INTEGRATE_H_
+#define __CUDA_INTEGRATE_H_
+
+#include "../reax_types.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void bNVP_scale_velocities( reax_system *, real, rvec ); /* scales velocities by the real and positions per axis by the rvec */
+/* NVE velocity Verlet step; returns the status code of the step */
+int Cuda_Velocity_Verlet_NVE( reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*,
+        mpi_datatypes* );
+/* Nose-Hoover NVT (Klein) velocity Verlet step; returns the status code of the step */
+int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*,
+        mpi_datatypes* );
+/* Berendsen NVT velocity Verlet step (velocity rescaling); returns the status code of the step */
+int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*,
+        mpi_datatypes* );
+/* Berendsen NPT velocity Verlet step (pressure computation and box scaling); returns the status code of the step */
+int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*,
+        mpi_datatypes* );
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bb35d181168f0bfd25950fd5bc1f8cce0a538f7b
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
@@ -0,0 +1,1110 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "cuda_lin_alg.h"
+
+#include "cuda_shuffle.h"
+#include "cuda_utils.h"
+#include "cuda_reduction.h"
+
+#include "../basic_comm.h"
+
+
+/* Sparse matrix-vector product, one thread per row:
+ *   results[i] = sum of H.entries[c].val * vec[ H.entries[c].j ]
+ * over the entries c in [H.start[i], H.end[i]).
+ * Threads whose index is >= rows do nothing. */
+CUDA_GLOBAL void k_matvec( sparse_matrix H, real *vec, real *results,
+        int rows )
+{
+    const int row = blockIdx.x * blockDim.x + threadIdx.x;
+    int k, col;
+    real val, acc;
+
+    if ( row < rows )
+    {
+        /* accumulate in a local and store the row result once at the end */
+        acc = 0;
+
+        for ( k = H.start[row]; k < H.end[row]; k++ )
+        {
+            col = H.entries[k].j;
+            val = H.entries[k].val;
+
+            acc += val * vec[col];
+        }
+
+        results[row] = acc;
+    }
+}
+
+
+//one group of MATVEC_KER_THREADS_PER_ROW consecutive threads (a 32 thread warp per the original note) per matrix row.
+//invoked as follows
+// <<< system->N, 32 >>>
+//CUDA_GLOBAL void __launch_bounds__(384, 16) k_matvec_csr(sparse_matrix H, real *vec, real *results, int num_rows)
+CUDA_GLOBAL void k_matvec_csr( sparse_matrix H, real *vec, real *results,
+        int num_rows )
+{
+#if defined(__SM_35__)
+    real vals;
+    int x;
+#else
+    extern __shared__ real vals[]; // indexed by threadIdx.x; its size is supplied at launch
+#endif
+    int jj;
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
+    int lane = thread_id & ( MATVEC_KER_THREADS_PER_ROW - 1); // position inside the group; the mask form needs a power-of-two group size
+    int row_start;
+    int row_end;
+    // one warp per row
+    int row = warp_id;
+    
+#if defined(__SM_35__)
+    vals = 0;
+#else
+    vals[threadIdx.x] = 0;
+#endif
+    // groups past the last row keep a zero partial sum and still execute the reduction code below
+    if (row < num_rows)
+    {
+        row_start = H.start[row];
+        row_end = H.end[row];
+
+        // compute running sum per thread
+        for ( jj = row_start + lane; jj < row_end;
+                jj += MATVEC_KER_THREADS_PER_ROW )
+#if defined(__SM_35__)
+        {
+            vals += H.entries[jj].val * vec[ H.entries[jj].j ];
+        }
+    }
+#else
+        {
+            vals[threadIdx.x] += H.entries[jj].val * vec[ H.entries[jj].j ];
+        }
+    }
+
+    __syncthreads( );
+#endif
+
+    // reduce the per-lane partial sums: through shfl when __SM_35__ is defined, otherwise in shared memory with a barrier after each step
+    // NOTE(review): the shared-memory path uses fixed offsets 16..1, i.e. it presumes MATVEC_KER_THREADS_PER_ROW == 32 -- confirm
+#if defined(__SM_35__)
+    for (x = MATVEC_KER_THREADS_PER_ROW >> 1; x >= 1; x/=2)
+    {
+        vals += shfl( vals, x ); // NOTE(review): shfl is a project helper; presumably reads the value held x lanes away -- confirm
+    }
+
+    if (lane == 0 && row < num_rows)
+    {
+        results[row] = vals;
+    }
+#else
+    if (lane < 16)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 16];
+    }
+    __syncthreads( );
+    if (lane < 8)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 8];
+    }
+    __syncthreads( );
+    if (lane < 4)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 4];
+    }
+    __syncthreads( );
+    if (lane < 2)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 2];
+    }
+    __syncthreads( );
+    if (lane < 1)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 1];
+    }
+    __syncthreads( );
+
+    // lane 0 of each group writes the row result
+    if (lane == 0 && row < num_rows)
+    {
+        results[row] = vals[threadIdx.x];
+    }
+#endif
+}
+
+
+/* Sparse matrix product applied to two vectors at once, one thread per row:
+ * for each component d in {0, 1}, results[i][d] is the sum over the row's
+ * entries of val * vec[j][d]. Threads whose index is >= rows do nothing. */
+CUDA_GLOBAL void k_dual_matvec( sparse_matrix H, rvec2 *vec, rvec2 *results,
+        int rows )
+{
+    const int row = blockIdx.x * blockDim.x + threadIdx.x;
+    int k, col;
+    real val;
+    rvec2 acc;
+
+    if ( row < rows )
+    {
+        acc[0] = 0.0;
+        acc[1] = 0.0;
+
+        for ( k = H.start[row]; k < H.end[row]; k++ )
+        {
+            /* each matrix entry is loaded once and used for both components */
+            col = H.entries[k].j;
+            val = H.entries[k].val;
+
+            acc[0] += val * vec[col][0];
+            acc[1] += val * vec[col][1];
+        }
+
+        results[row][0] = acc[0];
+        results[row][1] = acc[1];
+    }
+}
+
+
+//two-vector (rvec2) variant of k_matvec_csr: one group of threads (a 32 thread warp per the original note) per matrix row.
+//invoked as follows
+// <<< system->N, 32 >>>
+//CUDA_GLOBAL void __launch_bounds__(384, 8) k_dual_matvec_csr(sparse_matrix H, rvec2 *vec, rvec2 *results, int num_rows)
+CUDA_GLOBAL void  k_dual_matvec_csr( sparse_matrix H, rvec2 *vec,
+        rvec2 *results, int num_rows )
+{
+#if defined(__SM_35__)
+    rvec2 rvals;
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
+    int lane = thread_id & (MATVEC_KER_THREADS_PER_ROW - 1);
+    int row_start;
+    int row_end;
+    // one warp per row
+    int row = warp_id;
+
+    rvals[0] = 0;
+    rvals[1] = 0;
+    // groups past the last row keep zero partial sums and still execute the shfl loop below
+    if (row < num_rows)
+    {
+        row_start = H.start[row];
+        row_end = H.end[row];
+
+        for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW)
+        {
+            rvals[0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
+            rvals[1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
+        }
+    }
+    // NOTE(review): shfl is a project helper; presumably reads the value held s lanes away -- confirm
+    for (int s = MATVEC_KER_THREADS_PER_ROW >> 1; s >= 1; s /= 2)
+    {
+        rvals[0] += shfl( rvals[0], s);
+        rvals[1] += shfl( rvals[1], s);
+    }
+
+    if (lane == 0 && row < num_rows)
+    {
+        results[row][0] = rvals[0];
+        results[row][1] = rvals[1];
+    }
+
+#else
+    extern __shared__ rvec2 rvals[]; // indexed by threadIdx.x; its size is supplied at launch
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int warp_id = thread_id / 32;
+    int lane = thread_id & (32 - 1);
+    int row_start;
+    int row_end;
+    // one warp per row
+    // NOTE(review): this branch hard-codes a group size of 32 where the __SM_35__ branch uses MATVEC_KER_THREADS_PER_ROW -- confirm they agree
+    int row = warp_id;
+
+    rvals[threadIdx.x][0] = 0;
+    rvals[threadIdx.x][1] = 0;
+
+    if (row < num_rows)
+    {
+        row_start = H.start[row];
+        row_end = H.end[row];
+
+        // compute running sum per thread
+        for(int jj = row_start + lane; jj < row_end; jj += 32)
+        {
+            rvals[threadIdx.x][0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
+            rvals[threadIdx.x][1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
+        }
+    }
+
+    __syncthreads( );
+
+    // parallel reduction in shared memory over offsets 16, 8, 4, 2, 1
+    // every step is followed by a block-wide __syncthreads( ) barrier
+    if (lane < 16)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 16][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 16][1]; 
+    }
+    __syncthreads( );
+    if (lane < 8)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 8][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 8][1]; 
+    }
+    __syncthreads( );
+    if (lane < 4)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 4][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 4][1]; 
+    }
+    __syncthreads( );
+    if (lane < 2)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 2][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 2][1]; 
+    }
+    __syncthreads( );
+    if (lane < 1)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 1][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 1][1]; 
+    }
+    __syncthreads( );
+
+    // lane 0 of each group writes both components of the row result
+    if (lane == 0 && row < num_rows)
+    {
+        results[row][0] = rvals[threadIdx.x][0];
+        results[row][1] = rvals[threadIdx.x][1];
+    }
+
+#endif
+}
+
+
+void Cuda_Vector_Sum( real *res, real a, real *x, real b, real *y, int count )
+{
+    /* Host wrapper: launches k_vector_sum with at least count threads; per the
+     * original note res = a * x + b * y (TODO from original: use cuBLAS). */
+    int blocks;
+
+    /* a grid of 0 blocks is an invalid launch configuration, so skip empty input */
+    if ( count <= 0 ) { return; }
+    blocks = count / DEF_BLOCK_SIZE + ((count % DEF_BLOCK_SIZE != 0) ? 1 : 0);
+
+    k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>> ( res, a, x, b, y, count );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+}
+
+
+void Cuda_CG_Preconditioner( real *res, real *a, real *b, int count )
+{
+    /* Host wrapper: launches k_vector_mul with at least count threads; per the
+     * original note res = a * b element-wise (TODO from original: use cuBLAS). */
+    int blocks;
+
+    /* a grid of 0 blocks is an invalid launch configuration, so skip empty input */
+    if ( count <= 0 ) { return; }
+    blocks = count / DEF_BLOCK_SIZE + ((count % DEF_BLOCK_SIZE != 0) ? 1 : 0);
+
+    k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>> ( res, a, b, count );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n)
+{
+    /* Kernel, one thread per index j < n: forms the residual r2 = b - q2 and
+     * the pre-conditioned vector d2 = r2 * Hdia_inv for rvec2 components 0
+     * and 1. p_workspace is received by value; its pointer members are
+     * dereferenced directly here. */
+    const int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int c;
+
+    if ( j < n )
+    {
+        // residual
+        for ( c = 0; c < 2; ++c )
+        {
+            p_workspace.r2[j][c] = b[j][c] - p_workspace.q2[j][c];
+        }
+
+        // apply diagonal pre-conditioner
+        for ( c = 0; c < 2; ++c )
+        {
+            p_workspace.d2[j][c] = p_workspace.r2[j][c] * p_workspace.Hdia_inv[j];
+        }
+    }
+}
+
+
+void Cuda_CG_Diagonal_Preconditioner( storage *workspace, rvec2 *b, int n )
+{
+    /* Host wrapper for k_diagonal_preconditioner; *workspace is passed to the
+     * kernel by value and its r2, q2, d2, Hdia_inv members are dereferenced there. */
+    int blocks;
+
+    /* a grid of 0 blocks is an invalid launch configuration, so skip empty input */
+    if ( n <= 0 ) { return; }
+    blocks = n / DEF_BLOCK_SIZE + ((n % DEF_BLOCK_SIZE != 0) ? 1 : 0);
+    k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>> ( *workspace, b, n );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x,
+        real alpha_0, real alpha_1, int n, rvec2 *my_dot )
+{
+    /* Kernel, one thread per index j < n, for the fused dual-CG update:
+     *   x  += alpha * d2            (solution update)
+     *   r2 -= alpha * q2            (residual update)
+     *   p2  = r2 * Hdia_inv         (diagonal pre-conditioner)
+     *   my_dot[j] = r2 * p2         (per-entry term of the r.p dot product)
+     * with alpha_0 / alpha_1 applied to rvec2 components 0 / 1.
+     * my_dot[j] is written exactly once: a separate zeroing store would be
+     * dead because the value is unconditionally overwritten below. */
+    const int j = blockIdx.x * blockDim.x + threadIdx.x;
+    real r_0, r_1, p_0, p_1;
+
+    if ( j >= n )
+    {
+        return;
+    }
+
+    // update x
+    x[j][0] += alpha_0 * p_workspace.d2[j][0];
+    x[j][1] += alpha_1 * p_workspace.d2[j][1];
+
+    // update residual (values are reused below without re-reading r2)
+    r_0 = p_workspace.r2[j][0] - alpha_0 * p_workspace.q2[j][0];
+    r_1 = p_workspace.r2[j][1] - alpha_1 * p_workspace.q2[j][1];
+    p_workspace.r2[j][0] = r_0;
+    p_workspace.r2[j][1] = r_1;
+
+    // apply diagonal pre-conditioner
+    p_0 = r_0 * p_workspace.Hdia_inv[j];
+    p_1 = r_1 * p_workspace.Hdia_inv[j];
+    p_workspace.p2[j][0] = p_0;
+    p_workspace.p2[j][1] = p_1;
+    my_dot[j][0] = r_0 * p_0; // dot product: r.p
+    my_dot[j][1] = r_1 * p_1;
+}
+
+
+void Cuda_DualCG_Preconditioner( storage *workspace, rvec2 *x, rvec2 alpha,
+        int n, rvec2 result )
+{
+    /* Launches the fused dual-CG update over n entries, reduces the per-entry
+     * r.p terms in two passes and copies the 2-component sum into host result.
+     * NOTE(review): the last pass is a single block of BLOCKS_POW_2 threads, so
+     * presumably blocks <= BLOCKS_POW_2 is required -- confirm. */
+    int blocks;
+    rvec2 *tmp = (rvec2 *) scratch;
+
+    /* empty input: the sum is 0 and a 0-block launch would be invalid */
+    if ( n <= 0 ) { result[0] = result[1] = 0; return; }
+    cuda_memset( tmp, 0, sizeof(rvec2) * (2 * n + 1), "cuda_dualcg_preconditioner" );
+    blocks = n / DEF_BLOCK_SIZE + ((n % DEF_BLOCK_SIZE != 0) ? 1 : 0);
+
+    k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
+        ( *workspace, x, alpha[0], alpha[1], n, tmp );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+
+    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
+        ( tmp, tmp + n, n );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+
+    k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
+        ( tmp + n, tmp + 2 * n, blocks );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( result, (tmp + 2 * n), sizeof(rvec2),
+            cudaMemcpyDeviceToHost, "my_dot" );
+}
+
+
+void Cuda_Norm( rvec2 *arr, int n, rvec2 result )
+{
+    /* Two k_norm_rvec2 passes (INITIAL over `blocks` blocks, then one FINAL block);
+     * the 2-component result is copied to host. Cuda_dual_CG takes SQRT of its global sum. */
+    int blocks;
+    rvec2 *tmp = (rvec2 *) scratch;
+
+    if ( n <= 0 ) { result[0] = result[1] = 0; return; } /* empty: avoid 0-block launch */
+    blocks = n / DEF_BLOCK_SIZE + ((n % DEF_BLOCK_SIZE != 0) ? 1 : 0);
+    k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
+        ( arr, tmp, n, INITIAL );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+
+    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
+        ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2),
+            cudaMemcpyDeviceToHost, "cuda_norm_rvec2" );
+}
+
+
+void Cuda_Dot( rvec2 *a, rvec2 *b, rvec2 result, int n )
+{
+    /* Two-pass device reduction: k_dot_rvec2 over `blocks` blocks, then a
+     * single-block k_norm_rvec2 FINAL pass; the 2-component result goes to host. */
+    int blocks;
+    rvec2 *tmp = (rvec2 *) scratch;
+
+    if ( n <= 0 ) { result[0] = result[1] = 0; return; } /* empty: avoid 0-block launch */
+    blocks = n / DEF_BLOCK_SIZE + ((n % DEF_BLOCK_SIZE != 0) ? 1 : 0);
+
+    k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
+        ( a, b, tmp, n );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+
+    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
+        ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2),
+            cudaMemcpyDeviceToHost, "cuda_dot" );
+}
+
+
+void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n)
+{
+    /* Host wrapper for k_rvec2_pbetad with b[0], b[1] passed by value; presumably
+     * x = a + b * c per component (Cuda_dual_CG uses it for d = p + beta * d). */
+    int blocks;
+
+    if ( n <= 0 ) { return; } /* a 0-block launch is an invalid configuration */
+    blocks = n / DEF_BLOCK_SIZE + ((n % DEF_BLOCK_SIZE != 0) ? 1 : 0);
+    k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>>
+        ( x, a, b[0], b[1], c, n );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n )
+{
+    /* Kernel: the thread with global id t copies component `index` of src[t]
+     * into dst[t]; threads with t >= n do nothing. */
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( tid < n )
+    {
+        dst[tid] = src[tid][index];
+    }
+}
+
+
+void Cuda_RvecCopy_From( real *dst, rvec2 *src, int index, int n )
+{
+    /* Host wrapper for k_rvec2_to_real_copy: dst[i] = src[i][index] for i < n. */
+    int blocks;
+
+    if ( n <= 0 ) { return; } /* a 0-block launch is an invalid configuration */
+    blocks = n / DEF_BLOCK_SIZE + ((n % DEF_BLOCK_SIZE != 0) ? 1 : 0);
+    k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>>
+        ( dst, src, index, n );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n)
+{
+    /* Kernel: the thread with global id t stores src[t] into component
+     * `index` of dst[t]; threads with t >= n do nothing. */
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( tid < n )
+    {
+        dst[tid][index] = src[tid];
+    }
+}
+
+
+void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n)
+{
+    /* Host wrapper for k_real_to_rvec2_copy: dst[i][index] = src[i] for i < n. */
+    int blocks;
+
+    if ( n <= 0 ) { return; } /* a 0-block launch is an invalid configuration */
+    blocks = n / DEF_BLOCK_SIZE + ((n % DEF_BLOCK_SIZE != 0) ? 1 : 0);
+
+    k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>>
+        ( dst, src, index, n );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+}
+
+
+void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size )
+{
+    /* Sparse matrix times rvec2 vector on the device: zeroes the first `size`
+     * entries of b, then launches k_dual_matvec_csr (the "one warp per row"
+     * implementation per the original note) for n rows of *H, which is passed
+     * by value, with input vector a.
+     *
+     * The launch uses the fixed MATVEC_BLOCKS x MATVEC_BLOCK_SIZE configuration.
+     * NOTE(review): whether that covers all n rows depends on MATVEC_BLOCKS and
+     * on how k_dual_matvec_csr maps rows to warps -- confirm. */
+#if defined(__SM_35__)
+    /* this build launches the kernel without dynamic shared memory */
+    const size_t shmem_bytes = 0;
+#else
+    /* one rvec2 of dynamic shared memory per thread of a block */
+    const size_t shmem_bytes = sizeof(rvec2) * MATVEC_BLOCK_SIZE;
+#endif
+
+    cuda_memset( b, 0, sizeof(rvec2) * size, "dual_matvec:result" );
+
+    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, shmem_bytes >>>
+        ( *H, a, b, n );
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+void Cuda_Matvec( sparse_matrix *H, real *a, real *b, int n, int size )
+{
+    /* Sparse matrix times real vector on the device: zeroes the first `size`
+     * entries of b, then launches k_matvec_csr for n rows of *H, which is
+     * passed by value, with input vector a.
+     *
+     * The launch uses the fixed MATVEC_BLOCKS x MATVEC_BLOCK_SIZE configuration.
+     * NOTE(review): whether that covers all n rows depends on MATVEC_BLOCKS and
+     * on how k_matvec_csr maps rows to threads -- confirm. */
+#if defined(__SM_35__)
+    /* this build launches the kernel without dynamic shared memory */
+    const size_t shmem_bytes = 0;
+#else
+    /* one real of dynamic shared memory per thread of a block */
+    const size_t shmem_bytes = sizeof(real) * MATVEC_BLOCK_SIZE;
+#endif
+
+    /* the label names this routine so diagnostics are not attributed to the dual variant */
+    cuda_memset( b, 0, sizeof(real) * size, "matvec:result" );
+
+    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, shmem_bytes >>>
+        ( *H, a, b, n );
+
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+}
+
+
+int Cuda_dual_CG( reax_system *system, control_params *control, storage *workspace,
+        sparse_matrix *H, rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data,
+        FILE *fout, simulation_data *data )
+{
+    /* Dual CG with diagonal pre-conditioning: iterates on both components of the rvec2
+     * arrays b / x at once; when one has converged, the other is finished with Cuda_CG. */
+    int i, n, matvecs, scale;
+    rvec2 tmp, alpha, beta;
+    rvec2 my_sum, norm_sqr, b_norm, my_dot;
+    rvec2 sig_old, sig_new;
+    MPI_Comm comm;
+    rvec2 *spad = (rvec2 *) host_scratch;
+
+    n = system->n;
+    comm = mpi_data->world;
+    matvecs = 0;
+    scale = sizeof(rvec2); /* == sizeof(rvec2) / sizeof(void) under GNU C, without that extension */
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        matvecs = 0;
+        t_start = matvec_time = dot_time = 0;
+        t_start = Get_Time( );
+    }
+#endif
+
+    /* The commented-out snippets kept below operate on the host `workspace`
+     * and appear to be the reference code mirrored by each device call.
+     * Returns (i + 1) + matvecs, where matvecs is the follow-up Cuda_CG count.
+     * NOTE(review): MPI_DOUBLE presumably assumes real is double -- confirm. */
+
+    // stage x through the host buffer around the Dist exchange
+
+    copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:x:get" );
+    Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
+    copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "CG:x:put" );
+
+//  check_zeros_device( x, system->N, "x" );
+
+//  compare_rvec2 (workspace->x, x, N, "x");
+//  if (data->step > 0) {
+//      compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
+//      compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
+//
+//      exit (0);
+//  }
+
+
+//#ifdef __CUDA_DEBUG__
+//  dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N );
+//#endif
+    //originally we were using only H->n which was system->n (init_md.c)
+    //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap);
+
+    Cuda_Dual_Matvec( H, x, dev_workspace->q2, system->N, system->total_cap );
+
+//  compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
+
+//  if (data->step > 0) exit (0);
+
+    // tryQEq
+    //MVAPICH2
+//#ifdef __CUDA_DEBUG__
+//  Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
+//#endif
+
+    copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
+            cudaMemcpyDeviceToHost, "CG:q2:get" );
+    Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker );
+    copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
+            cudaMemcpyHostToDevice,"CG:q2:put" );
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        Update_Timing_Info( &t_start, &matvec_time );
+    }
+#endif
+
+//#ifdef __CUDA_DEBUG__
+//  for( j = 0; j < system->n; ++j ) {
+//    // residual
+//    workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0];
+//    workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1];
+//    // apply diagonal pre-conditioner
+//    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+//    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+//  }
+//#endif
+
+    Cuda_CG_Diagonal_Preconditioner( dev_workspace, b, system->n );
+
+//  compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2");
+//  compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2");
+
+    /* norm of b */
+//#ifdef __CUDA_DEBUG__
+//  my_sum[0] = my_sum[1] = 0;
+//  for( j = 0; j < n; ++j ) {
+//    my_sum[0] += SQR( workspace->b[j][0] );
+//    my_sum[1] += SQR( workspace->b[j][1] );
+//  }
+//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
+//#endif
+
+    my_sum[0] = 0;
+    my_sum[1] = 0;
+    Cuda_Norm( b, n, my_sum );
+
+//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
+
+    MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
+    b_norm[0] = SQRT( norm_sqr[0] );
+    b_norm[1] = SQRT( norm_sqr[1] );
+    //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
+
+    /* dot product: r.d */
+//#ifdef __CUDA_DEBUG__
+//  my_dot[0] = my_dot[1] = 0;
+//  for( j = 0; j < n; ++j ) {
+//    my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
+//    my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
+//  }
+//  fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
+//#endif
+
+    my_dot[0] = 0;
+    my_dot[1] = 0;
+    Cuda_Dot( dev_workspace->r2, dev_workspace->d2, my_dot, n );
+
+// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
+
+    MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
+
+    //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        Update_Timing_Info( &t_start, &dot_time );
+    }
+#endif
+
+    for ( i = 1; i < control->cm_solver_max_iters; ++i )
+    {
+        //MVAPICH2
+//#ifdef __CUDA_DEBUG__
+//    Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer);
+//#endif
+
+        copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
+                cudaMemcpyDeviceToHost, "cg:d2:get" );
+        Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
+        copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
+                cudaMemcpyHostToDevice, "cg:d2:put" );
+
+        //print_device_rvec2 (dev_workspace->d2, N);
+
+//#ifdef __CUDA_DEBUG__
+//    dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N );
+//#endif
+
+        Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N,
+                system->total_cap );
+
+        /*
+        fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n );
+        fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n");
+        print_sparse_matrix_host (&workspace->H);
+        fprintf (stderr, " ******* HOST Vector ***************\n");
+        print_host_rvec2 (workspace->d2, system->N);
+        fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n");
+        print_sparse_matrix (&dev_workspace->H);
+        fprintf (stderr, " ******* Device Vector ***************\n");
+        print_device_rvec2 (dev_workspace->d2, system->N);
+        */
+        //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
+
+        // tryQEq
+        // MVAPICH2
+//#ifdef __CUDA_DEBUG__
+//    Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
+//#endif
+
+        copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
+                cudaMemcpyDeviceToHost, "cg:q2:get" );
+        Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker );
+        copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
+                cudaMemcpyHostToDevice, "cg:q2:put" );
+
+//       compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
+
+#if defined(CG_PERFORMANCE)
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &matvec_time );
+        }
+#endif
+
+        /* dot product: d.q */
+//#ifdef __CUDA_DEBUG__
+//    my_dot[0] = my_dot[1] = 0;
+//    for( j = 0; j < n; ++j ) {
+//      my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
+//      my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
+//    }
+//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
+//#endif
+
+        my_dot[0] = my_dot[1] = 0;
+        Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n);
+        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
+
+        MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
+        //fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
+
+        alpha[0] = sig_new[0] / tmp[0];
+        alpha[1] = sig_new[1] / tmp[1];
+        my_dot[0] = 0;
+        my_dot[1] = 0;
+
+//#ifdef __CUDA_DEBUG__
+//    for( j = 0; j < system->n; ++j ) {
+//      // update x
+//      workspace->x[j][0] += alpha[0] * workspace->d2[j][0];
+//      workspace->x[j][1] += alpha[1] * workspace->d2[j][1];
+//      // update residual
+//      workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
+//      workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
+//      // apply diagonal pre-conditioner
+//      workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+//      workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+//      // dot product: r.p
+//      my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
+//      my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
+//    }
+//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
+//#endif
+
+        my_dot[0] = 0;
+        my_dot[1] = 0;
+        Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot );
+
+        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
+
+//   compare_rvec2 (workspace->x, dev_workspace->x, N, "x");
+//   compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2");
+//   compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2");
+
+        sig_old[0] = sig_new[0];
+        sig_old[1] = sig_new[1];
+        MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
+
+        //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
+#if defined(CG_PERFORMANCE)
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &dot_time );
+        }
+#endif
+
+        if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
+        {
+            break;
+        }
+
+        beta[0] = sig_new[0] / sig_old[0];
+        beta[1] = sig_new[1] / sig_old[1];
+
+//#ifdef __CUDA_DEBUG__
+//    for( j = 0; j < system->n; ++j ) {
+//      // d = p + beta * d
+//      workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
+//      workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
+//    }
+//#endif
+
+        Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta,
+                dev_workspace->d2, system->n );
+
+//       compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2");
+    }
+
+
+    if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
+    {
+        //for( j = 0; j < n; ++j )
+        //  workspace->t[j] = workspace->x[j][1];
+        //fprintf (stderr, "Getting started with Cuda_CG1 \n");
+
+        Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n );
+
+        //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
+        //compare_array (workspace->t, dev_workspace->t, system->n, "t");
+
+        matvecs = Cuda_CG( system, control, workspace, H, dev_workspace->b_t, tol, dev_workspace->t,
+                mpi_data );
+
+        //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs );
+        //for( j = 0; j < n; ++j )
+        //  workspace->x[j][1] = workspace->t[j];
+
+        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n );
+    }
+    else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
+    {
+        //for( j = 0; j < n; ++j )
+        //  workspace->s[j] = workspace->x[j][0];
+
+        Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n );
+
+        //compare_array (workspace->s, dev_workspace->s, system->n, "s");
+        //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
+
+        //fprintf (stderr, "Getting started with Cuda_CG2 \n");
+
+        matvecs = Cuda_CG( system, control, workspace, H, dev_workspace->b_s, tol, dev_workspace->s,
+                mpi_data );
+
+        //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs );
+        //for( j = 0; j < system->n; ++j )
+        //  workspace->x[j][0] = workspace->s[j];
+
+        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n );
+    }
+
+    if ( i >= control->cm_solver_max_iters )
+    {
+        fprintf( stderr, "[WARNING] p%d: dual CG convergence failed! (%d steps)\n",
+                system->my_rank, i );
+        fprintf( stderr, "    [INFO] s lin solve error: %f\n", SQRT(sig_new[0]) / b_norm[0] );
+        fprintf( stderr, "    [INFO] t lin solve error: %f\n", SQRT(sig_new[1]) / b_norm[1] );
+    }
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        fprintf( fout, "QEq %d + %d iters. matvecs: %f  dot: %f\n",
+                i + 1, matvecs, matvec_time, dot_time );
+    }
+#endif
+
+    return (i + 1) + matvecs;
+}
+
+
+int Cuda_CG( reax_system *system, control_params *control, storage *workspace,
+        sparse_matrix *H, real *b, real tol, real *x, mpi_datatypes* mpi_data )
+{
+    /* Diagonally pre-conditioned CG for H x = b with b and x on the device.
+     * Vectors are staged through host_scratch for the Dist / Coll exchanges and
+     * the Parallel_Norm / Parallel_Dot sums. Returns the loop counter i (from 1). */
+    int i, scale;
+    real tmp, alpha, beta, b_norm;
+    real sig_old, sig_new;
+    real *spad = (real *) host_scratch;
+
+    /* == sizeof(real) / sizeof(void) under GNU C, without the sizeof(void) extension */
+    scale = sizeof(real);
+    memset( spad, 0, sizeof(real) * system->total_cap );
+    copy_host_device( spad, x, sizeof(real) * system->total_cap,
+            cudaMemcpyDeviceToHost, "cuda_cg:x:get" );
+    Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
+
+    // put the exchanged x back on the device, then q = H x
+    copy_host_device( spad, x, sizeof(real) * system->total_cap,
+            cudaMemcpyHostToDevice, "cuda_cg:x:put" );
+    Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap );
+
+    // tryQEq
+    // stage q through the host buffer around the Coll exchange
+    copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
+            cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
+    Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
+
+    // put the collected q back on the device
+    copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
+            cudaMemcpyHostToDevice, "cuda_cg:q:put" );
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        Update_Timing_Info( &t_start, &matvec_time );
+    }
+#endif
+
+    Cuda_Vector_Sum( dev_workspace->r , 1.,  b, -1., dev_workspace->q,
+            system->n );
+    // r = b - q was formed above; now d = r * Hdia_inv (diagonal pre-conditioner),
+    // element-wise over system->n entries
+    Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r,
+            dev_workspace->Hdia_inv, system->n );
+
+    //TODO do the parallel_norm on the device for the local sum
+    copy_host_device( spad, b, sizeof(real) * system->n,
+            cudaMemcpyDeviceToHost, "cuda_cg:b:get" );
+    b_norm = Parallel_Norm( spad, system->n, mpi_data->world );
+
+    //TODO do the parallel dot on the device for the local sum (only system->n entries are read)
+    copy_host_device( spad, dev_workspace->r, sizeof(real) * system->n,
+            cudaMemcpyDeviceToHost, "cuda_cg:r:get" );
+    copy_host_device( spad + system->n, dev_workspace->d, sizeof(real) * system->n,
+            cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
+    sig_new = Parallel_Dot( spad, spad + system->n, system->n,
+            mpi_data->world );
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        Update_Timing_Info( &t_start, &dot_time );
+    }
+#endif
+
+    for ( i = 1; i < control->cm_solver_max_iters && SQRT(sig_new) / b_norm > tol; ++i )
+    {
+        // stage d through the host buffer around the Dist exchange
+        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap,
+                cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
+        Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
+        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap,
+                cudaMemcpyHostToDevice, "cuda_cg:d:put" );
+
+        Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap );
+
+        //tryQEq
+        copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
+                cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
+        Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
+        copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
+                cudaMemcpyHostToDevice, "cuda_cg:q:put" );
+
+#if defined(CG_PERFORMANCE)
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &matvec_time );
+        }
+#endif
+
+        //TODO do the parallel dot on the device for the local sum
+        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->n,
+                cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
+        copy_host_device( spad + system->n, dev_workspace->q, sizeof(real) * system->n,
+                cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
+        tmp = Parallel_Dot( spad, spad + system->n, system->n, mpi_data->world );
+
+        alpha = sig_new / tmp;
+        // x = alpha * d + x
+        Cuda_Vector_Sum( x, alpha, dev_workspace->d, 1.0, x, system->n );
+
+        // r = -alpha * q + r
+        Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0,
+                dev_workspace->r, system->n );
+        /* pre-conditioning */
+        //for( j = 0; j < system->n; ++j )
+        //  workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r,
+                dev_workspace->Hdia_inv, system->n );
+
+        sig_old = sig_new;
+
+        //TODO do the parallel dot on the device for the local sum
+        copy_host_device( spad, dev_workspace->r, sizeof(real) * system->n,
+                cudaMemcpyDeviceToHost, "cuda_cg:r:get" );
+        copy_host_device( spad + system->n, dev_workspace->p, sizeof(real) * system->n,
+                cudaMemcpyDeviceToHost, "cuda_cg:p:get" );
+        sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world );
+        //fprintf (stderr, "Device: sig_new: %f \n", sig_new );
+
+        beta = sig_new / sig_old;
+        Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta,
+                dev_workspace->d, system->n );
+
+#if defined(CG_PERFORMANCE)
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &dot_time );
+        }
+#endif
+    }
+
+    return i;
+}
diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.h b/PG-PuReMD/src/cuda/cuda_lin_alg.h
new file mode 100644
index 0000000000000000000000000000000000000000..768c0a36a1582a585f5d45161844a23640a68212
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_lin_alg.h
@@ -0,0 +1,65 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_LIN_ALG_H_
+#define __CUDA_LIN_ALG_H_
+
+#include "../reax_types.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Cuda_Vector_Sum( real *, real, real *, real, real *, int ); /* (res, a, x, b, y, count): launches k_vector_sum */
+/* (res, a, b, count): launches k_vector_mul */
+void Cuda_CG_Preconditioner( real *, real *, real *, int );
+/* (workspace, b, n): launches k_diagonal_preconditioner (r2 = b - q2, d2 = r2 * Hdia_inv) */
+void Cuda_CG_Diagonal_Preconditioner( storage *, rvec2 *, int );
+/* (workspace, x, alpha, n, result): dual-CG update kernel plus reduction; sum copied to host `result` */
+void Cuda_DualCG_Preconditioner( storage *, rvec2 *, rvec2, int, rvec2 );
+/* (arr, n, result): two k_norm_rvec2 passes; 2-component result copied to host */
+void Cuda_Norm( rvec2 *, int, rvec2 );
+/* (a, b, result, n): k_dot_rvec2 then k_norm_rvec2 (FINAL); result copied to host */
+void Cuda_Dot( rvec2 *, rvec2 *, rvec2, int );
+/* (x, a, b, c, n): launches k_rvec2_pbetad with b[0], b[1] passed by value */
+void Cuda_Vector_Sum_Rvec2( rvec2 *, rvec2 *, rvec2, rvec2 *, int );
+/* (dst, src, index, n): dst[i] = src[i][index] for i < n */
+void Cuda_RvecCopy_From( real *, rvec2 *, int, int );
+/* (dst, src, index, n): dst[i][index] = src[i] for i < n */
+void Cuda_RvecCopy_To( rvec2 *, real *, int, int );
+/* (H, a, b, n, size): zeroes `size` rvec2 entries of b, then launches k_dual_matvec_csr */
+void Cuda_Dual_Matvec( sparse_matrix *, rvec2 *, rvec2 *, int , int );
+/* (H, a, b, n, size): zeroes `size` real entries of b, then launches k_matvec_csr */
+void Cuda_Matvec( sparse_matrix *, real *, real *, int , int );
+/* (system, control, workspace, H, b, tol, x, mpi_data, fout, data): dual CG solve; returns an iteration count */
+int Cuda_dual_CG( reax_system*, control_params*, storage*, sparse_matrix*,
+        rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data * );
+/* (system, control, workspace, H, b, tol, x, mpi_data): CG solve; returns its loop counter */
+int Cuda_CG( reax_system*, control_params*, storage*, sparse_matrix*,
+        real*, real, real*, mpi_datatypes* );
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_list.cu b/PG-PuReMD/src/cuda/cuda_list.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9d0626f126eff2b87ea5fef72eb7ca6ec036adc8
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_list.cu
@@ -0,0 +1,114 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "cuda_utils.h"
+
+#if defined(PURE_REAX)
+  #include "../list.h"
+  #include "../tool_box.h"
+#elif defined(LAMMPS_REAX)
+  #include "../reax_list.h"
+  #include "../reax_tool_box.h"
+#endif
+
+
+extern "C" {
+
+
+/************* allocate list space ******************/
+void Dev_Make_List( int n, int num_intrs, int type, reax_list *l )
+{
+    /* Fills in the bookkeeping fields of *l and allocates, via cuda_malloc, its
+     * index and end_index arrays (n ints each) plus one interaction array of
+     * num_intrs entries selected by `type`; an unknown type aborts the MPI job. */
+    l->n = n;
+    l->num_intrs = num_intrs;
+    l->type = type;
+    l->allocated = TRUE;
+
+    cuda_malloc( (void **) &l->index, sizeof(int) * n, TRUE, "dev_list:index" );
+    cuda_malloc( (void **) &l->end_index, sizeof(int) * n, TRUE, "dev_list:end_index" );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "dev_list: n=%d num_intrs=%d type=%d\n", n, num_intrs, type );
+#endif
+    switch ( type )
+    {
+    case TYP_FAR_NEIGHBOR:
+        cuda_malloc( (void **) &l->select.far_nbr_list,
+                sizeof(far_neighbor_data) * num_intrs, TRUE, "dev_list:far_nbrs" );
+        break;
+
+    case TYP_THREE_BODY:
+        cuda_malloc( (void **) &l->select.three_body_list,
+                sizeof(three_body_interaction_data) * num_intrs, TRUE,
+                "dev_list:three_bodies" );
+        break;
+
+    case TYP_HBOND:
+        cuda_malloc( (void **) &l->select.hbond_list,
+                sizeof(hbond_data) * num_intrs, TRUE, "dev_list:hbonds" );
+        break;
+
+    case TYP_BOND:
+        cuda_malloc( (void **) &l->select.bond_list,
+                sizeof(bond_data) * num_intrs, TRUE, "dev_list:bonds" );
+        break;
+
+    default:
+        fprintf( stderr, "[ERROR] no %d dev_list type defined!\n", type );
+        MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+    }
+}
+
+
+void Dev_Delete_List( reax_list *l )
+{
+    /* Frees the index arrays and the type-specific interaction array of *l and clears
+     * its allocated flag; a no-op if the flag is already FALSE. Unknown types abort. */
+    if ( l->allocated != FALSE )
+    {
+        l->allocated = FALSE;
+        cuda_free( l->index, "dev_index" );
+        cuda_free( l->end_index, "dev_end_index" );
+
+        switch ( l->type )
+        {
+        case TYP_FAR_NEIGHBOR:
+            cuda_free( l->select.far_nbr_list, "dev_list:far_nbrs" );
+            break;
+        case TYP_THREE_BODY:
+            cuda_free( l->select.three_body_list, "dev_list:three_bodies" );
+            break;
+        case TYP_HBOND:
+            cuda_free( l->select.hbond_list, "dev_list:hbonds" );
+            break;
+        case TYP_BOND:
+            cuda_free( l->select.bond_list, "dev_list:bonds" );
+            break;
+        default:
+            fprintf (stderr, "[ERROR] no %d dev_list type defined !\n", l->type);
+            MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+        }
+    }
+}
+
+}
diff --git a/PG-PuReMD/src/dev_list.h b/PG-PuReMD/src/cuda/cuda_list.h
similarity index 58%
rename from PG-PuReMD/src/dev_list.h
rename to PG-PuReMD/src/cuda/cuda_list.h
index a8d1f11a8214d247d496b4464cce93e87203c0a5..fe06f4ce92c114842d8cc5f3c65ab7c9b0683661 100644
--- a/PG-PuReMD/src/dev_list.h
+++ b/PG-PuReMD/src/cuda/cuda_list.h
@@ -19,54 +19,49 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __LIST_H_
-#define __LIST_H_
+#ifndef __CUDA_LIST_H_
+#define __CUDA_LIST_H_
+
+#include "../reax_types.h"
 
-#include "reax_types.h"
-#include "reax_types.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-int  Dev_Make_List( int, int, int, reax_list*);
-void Dev_Delete_List( reax_list*);
+void Dev_Make_List( int, int, int, reax_list* );
+
+void Dev_Delete_List( reax_list* );
 
 #ifdef __cplusplus
 }
 #endif
 
-/*
-CUDA_HOST_DEVICE int  Dev_Num_Entries(int,reax_list*);
-CUDA_HOST_DEVICE int  Dev_Start_Index( int, reax_list* );
-CUDA_HOST_DEVICE int  Dev_End_Index( int, reax_list* );
-CUDA_HOST_DEVICE void Dev_Set_Start_Index(int,int,reax_list*);
-CUDA_HOST_DEVICE void Dev_Set_End_Index(int,int,reax_list*);
-*/
 
-inline CUDA_HOST_DEVICE int Dev_Num_Entries( int i, reax_list *l )
+static inline CUDA_HOST_DEVICE int Dev_Num_Entries( int i, reax_list *l ) /* entry count of slot i: end_index[i] - index[i] */
 {
     return l->end_index[i] - l->index[i];
 }
 
-inline CUDA_HOST_DEVICE int Dev_Start_Index( int i, reax_list *l )
+static inline CUDA_HOST_DEVICE int Dev_Start_Index( int i, reax_list *l ) /* returns index[i], the position of the first entry of slot i */
 {
     return l->index[i];
 }
 
-inline CUDA_HOST_DEVICE int Dev_End_Index( int i, reax_list *l )
+static inline CUDA_HOST_DEVICE int Dev_End_Index( int i, reax_list *l ) /* returns end_index[i]; callers in this code base loop while pj < Dev_End_Index, i.e. treat it as exclusive */
 {
     return l->end_index[i];
 }
 
-inline CUDA_HOST_DEVICE void Dev_Set_Start_Index( int i, int val, reax_list *l )
+static inline CUDA_HOST_DEVICE void Dev_Set_Start_Index( int i, int val, reax_list *l ) /* stores val into index[i] */
 {
     l->index[i] = val;
 }
 
-inline CUDA_HOST_DEVICE void Dev_Set_End_Index( int i, int val, reax_list *l )
+static inline CUDA_HOST_DEVICE void Dev_Set_End_Index( int i, int val, reax_list *l ) /* stores val into end_index[i] */
 {
     l->end_index[i] = val;
 }
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_lookup.cu b/PG-PuReMD/src/cuda/cuda_lookup.cu
new file mode 100644
index 0000000000000000000000000000000000000000..01bc8a79e9689f538dfb0bf6421b815998997b27
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_lookup.cu
@@ -0,0 +1,88 @@
+
+#include "cuda_lookup.h"
+
+#include "cuda_utils.h"
+
+#include "../index_utils.h"
+
+
+/* Mirror the host long-range lookup table LR into the device copy d_LR.
+ * The flat table is copied first (its entries still carry host pointers);
+ * then, for every type pair (i, j), i <= j, with both types flagged in
+ * aggregated, each per-pair array (control->tabulate + 1 elements) is
+ * duplicated on the device and the matching pointer field of the device-side
+ * entry is overwritten. The type count is system->reax_param.num_atom_types. */
+void copy_LR_table_to_device( reax_system *system, control_params *control,
+        int *aggregated )
+{
+    int i, j, idx, num_atom_types;
+    size_t y_bytes, spline_bytes;
+    LR_data *d_y;
+    cubic_spline_coef *d_spline;
+
+    num_atom_types = system->reax_param.num_atom_types;
+    y_bytes = sizeof(LR_data) * (control->tabulate + 1);
+    spline_bytes = sizeof(cubic_spline_coef) * (control->tabulate + 1);
+
+    fprintf( stderr, "Copying the LR Lookup Table to the device ... \n" );
+
+    cuda_malloc( (void **) &d_LR,
+            sizeof(LR_lookup_table) * ( num_atom_types * num_atom_types ),
+            FALSE, "LR_lookup:table" );
+
+    copy_host_device( LR, d_LR,
+            sizeof(LR_lookup_table) * (num_atom_types * num_atom_types),
+            cudaMemcpyHostToDevice, "LR_lookup:table" );
+
+    for ( i = 0; i < num_atom_types; ++i )
+    {
+        for ( j = i; j < num_atom_types; ++j )
+        {
+            if ( !aggregated[i] || !aggregated[j] )
+            {
+                continue;
+            }
+
+            idx = index_lr( i, j, num_atom_types );
+
+            cuda_malloc( (void **) &d_y, y_bytes, FALSE, "LR_lookup:d_y" );
+            copy_host_device( LR[idx].y, d_y, y_bytes,
+                    cudaMemcpyHostToDevice, "LR_lookup:y" );
+            copy_host_device( &d_y, &d_LR[idx].y, sizeof(LR_data *),
+                    cudaMemcpyHostToDevice, "LR_lookup:y" );
+
+            cuda_malloc( (void **) &d_spline, spline_bytes, FALSE, "LR_lookup:h" );
+            copy_host_device( LR[idx].H, d_spline, spline_bytes,
+                    cudaMemcpyHostToDevice, "LR_lookup:h" );
+            copy_host_device( &d_spline, &d_LR[idx].H, sizeof(cubic_spline_coef *),
+                    cudaMemcpyHostToDevice, "LR_lookup:h" );
+
+            cuda_malloc( (void **) &d_spline, spline_bytes, FALSE, "LR_lookup:vdW" );
+            copy_host_device( LR[idx].vdW, d_spline, spline_bytes,
+                    cudaMemcpyHostToDevice, "LR_lookup:vdW" );
+            copy_host_device( &d_spline, &d_LR[idx].vdW, sizeof(cubic_spline_coef *),
+                    cudaMemcpyHostToDevice, "LR_lookup:vdW" );
+
+            cuda_malloc( (void **) &d_spline, spline_bytes, FALSE, "LR_lookup:CEvd" );
+            copy_host_device( LR[idx].CEvd, d_spline, spline_bytes,
+                    cudaMemcpyHostToDevice, "LR_lookup:CEvd" );
+            copy_host_device( &d_spline, &d_LR[idx].CEvd, sizeof(cubic_spline_coef *),
+                    cudaMemcpyHostToDevice, "LR_lookup:CEvd" );
+
+            cuda_malloc( (void **) &d_spline, spline_bytes, FALSE, "LR_lookup:ele" );
+            copy_host_device( LR[idx].ele, d_spline, spline_bytes,
+                    cudaMemcpyHostToDevice, "LR_lookup:ele" );
+            copy_host_device( &d_spline, &d_LR[idx].ele, sizeof(cubic_spline_coef *),
+                    cudaMemcpyHostToDevice, "LR_lookup:ele" );
+
+            cuda_malloc( (void **) &d_spline, spline_bytes, FALSE, "LR_lookup:ceclmb" );
+            copy_host_device( LR[idx].CEclmb, d_spline, spline_bytes,
+                    cudaMemcpyHostToDevice, "LR_lookup:ceclmb" );
+            copy_host_device( &d_spline, &d_LR[idx].CEclmb, sizeof(cubic_spline_coef *),
+                    cudaMemcpyHostToDevice, "LR_lookup:ceclmb" );
+        }
+    }
+
+    fprintf( stderr, "Copy of the LR Lookup Table to the device complete ... \n" );
+}
+
diff --git a/PG-PuReMD/src/cuda_lookup.h b/PG-PuReMD/src/cuda/cuda_lookup.h
similarity index 56%
rename from PG-PuReMD/src/cuda_lookup.h
rename to PG-PuReMD/src/cuda/cuda_lookup.h
index 88f5cfce17d54995431f356440e6250d40209b01..87026f7deab8464867471e5373bac70018895f0a 100644
--- a/PG-PuReMD/src/cuda_lookup.h
+++ b/PG-PuReMD/src/cuda/cuda_lookup.h
@@ -2,16 +2,18 @@
 #ifndef __CUDA_LOOKUP_H__
 #define __CUDA_LOOKUP_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void copy_LR_table_to_device (reax_system *, control_params *, int *);
+void copy_LR_table_to_device( reax_system *, control_params *, int * );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_multi_body.cu b/PG-PuReMD/src/cuda/cuda_multi_body.cu
similarity index 81%
rename from PG-PuReMD/src/cuda_multi_body.cu
rename to PG-PuReMD/src/cuda/cuda_multi_body.cu
index 24a510050523fc88081904fa416b4d9f8391726a..9a328061f4524a8a3c1ea93a2ba3974fdac8bde6 100644
--- a/PG-PuReMD/src/cuda_multi_body.cu
+++ b/PG-PuReMD/src/cuda/cuda_multi_body.cu
@@ -19,25 +19,18 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "reax_types.h"
 #include "cuda_multi_body.h"
-#include "index_utils.h"
+
 #include "cuda_helpers.h"
-#include "dev_list.h"
-
-
-CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, 
-        global_parameters gp, 
-        single_body_parameters *sbp, 
-        two_body_parameters *tbp, 
-        storage p_workspace, 
-        reax_list p_bonds, 
-        int n, 
-        int num_atom_types,
-        real *data_elp,
-        real *data_eov, 
-        real *data_eun
-        )
+#include "cuda_list.h"
+
+#include "../index_utils.h"
+
+
+CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp, 
+        single_body_parameters *sbp, two_body_parameters *tbp, 
+        storage p_workspace, reax_list p_bonds, int n, int num_atom_types,
+        real *data_elp, real *data_eov, real *data_eun )
 {
     int i, j, pj, type_i, type_j;
     real Delta_lpcorr, dfvl;
@@ -48,22 +41,24 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms,
     real exp_ovun2n, exp_ovun6, exp_ovun8;
     real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
     real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
-    real p_lp1, p_lp2, p_lp3;
+    real p_lp2, p_lp3;
     real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
-
-    single_body_parameters *sbp_i, *sbp_j;
+    single_body_parameters *sbp_i;
     two_body_parameters *twbp;
     bond_data *pbond;
     bond_order_data *bo_ij; 
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= n) return;
+
+    if ( i >= n )
+    {
+        return;
+    }
 
     reax_list *bonds = &( p_bonds );
     storage *workspace = &( p_workspace );
 
     /* Initialize parameters */
-    p_lp1 = gp.l[15];
     p_lp3 = gp.l[5];
     p_ovun3 = gp.l[32];
     p_ovun4 = gp.l[31];
@@ -82,8 +77,8 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms,
     inv_expvd2 = 1. / (1. + expvd2 );
 
     /* calculate the energy */
-    data_elp [i] += e_lp = 
-        p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
+    e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
+    data_elp[i] += e_lp;
 
     dElp = p_lp2 * inv_expvd2 + 
         75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
@@ -99,38 +94,48 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms,
             system->my_atoms[i].orig_id, workspace->nlp[i], 
             e_lp, data->my_en.e_lp );
 #endif
+
 #ifdef TEST_FORCES
     Add_dDelta( system, lists, i, CElp, workspace->f_lp );  // lp - 1st term
 #endif
 
     /* correction for C2 */
-    if( gp.l[5] > 0.001 &&
+    if ( gp.l[5] > 0.001 &&
             !cuda_strcmp( sbp[type_i].name, "C", 1 ) )
-        for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj )
-            if( my_atoms[i].orig_id < 
-                    my_atoms[bonds->select.bond_list[pj].nbr].orig_id ) {
+    {
+        for ( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj )
+        {
+            if ( my_atoms[i].orig_id < 
+                    my_atoms[bonds->select.bond_list[pj].nbr].orig_id )
+            {
                 j = bonds->select.bond_list[pj].nbr;
                 type_j = my_atoms[j].type;
 
-                if( !cuda_strcmp( sbp[type_j].name, "C", 1 ) ) {
+                if ( !cuda_strcmp( sbp[type_j].name, "C", 1 ) )
+                {
                     twbp = &( tbp[index_tbp (type_i,type_j, num_atom_types) ]);
                     bo_ij = &( bonds->select.bond_list[pj].bo_data );
                     Di = workspace->Delta[i];
-                    vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
+                    vov3 = bo_ij->BO - Di - 0.040 * POW(Di, 4.);
 
-                    if( vov3 > 3. ) {
-                        data_elp [i] += e_lph = p_lp3 * SQR(vov3-3.0);
+                    if ( vov3 > 3. )
+                    {
+                        e_lph = p_lp3 * SQR( vov3 - 3.0 );
+                        data_elp[i] += e_lph;
 
-                        deahu2dbo = 2.*p_lp3*(vov3 - 3.);
-                        deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
+                        deahu2dbo = 2. * p_lp3 * (vov3 - 3.);
+                        deahu2dsbo = 2. * p_lp3 * (vov3 - 3.) *
+                            (-1. - 0.16 * POW(Di, 3.));
 
                         bo_ij->Cdbo += deahu2dbo;
                         workspace->CdDelta[i] += deahu2dsbo;
+
 #ifdef TEST_ENERGY
                         fprintf(out_control->elp,"C2cor%6d%6d%12.6f%12.6f%12.6f\n",
                                 system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
                                 e_lph, deahu2dbo, deahu2dsbo );
 #endif
+
 #ifdef TEST_FORCES
                         Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp);
                         Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp);
@@ -138,36 +143,36 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms,
                     }
                 }    
             }
+        }
+    }
     //}
 
-
     //for( i = 0; i < system->n; ++i ) {
     type_i = my_atoms[i].type;
     sbp_i = &(sbp[ type_i ]);
 
     /* over-coordination energy */
     if( sbp_i->mass > 21.0 ) 
+    {
         dfvl = 0.0;
-    else dfvl = 1.0; // only for 1st-row elements
+    }
+    else
+    {
+        dfvl = 1.0; // only for 1st-row elements
+    }
 
     p_ovun2 = sbp_i->p_ovun2;
     sum_ovun1 = sum_ovun2 = 0;
-    for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) {
+    for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj )
+    {
         j = bonds->select.bond_list[pj].nbr;
         type_j = my_atoms[j].type;
         bo_ij = &(bonds->select.bond_list[pj].bo_data);
-        sbp_j = &(sbp[ type_j ]);
-        twbp = &(tbp[ index_tbp (type_i, type_j, num_atom_types )]);
+        twbp = &(tbp[ index_tbp(type_i, type_j, num_atom_types )]);
 
         sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
         sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
             ( bo_ij->BO_pi + bo_ij->BO_pi2 );
-
-        /*fprintf( stdout, "%4d%4d%12.6f%12.6f%12.6f\n",
-          i+1, j+1,      
-          dfvl * workspace->Delta_lp_temp[j], 
-          sbp_j->nlp_opt,
-          workspace->nlp_temp[j] );*/
     }
 
     exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
@@ -181,7 +186,8 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms,
     DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8);
     CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
 
-    data_eov [i] += e_ov = sum_ovun1 * CEover1;
+    e_ov = sum_ovun1 * CEover1;
+    data_eov[i] += e_ov;
 
     CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
         (1.0 - Delta_lpcorr * ( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ));
@@ -191,7 +197,6 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms,
     CEover4 = CEover2 * (dfvl * workspace->Delta_lp_temp[i]) * 
         p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
 
-
     /* under-coordination potential */
     p_ovun2 = sbp_i->p_ovun2;
     p_ovun5 = sbp_i->p_ovun5;
@@ -202,8 +207,8 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms,
     inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
     inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
 
-    data_eun [i] += e_un =
-        -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
+    e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
+    data_eun[i] += e_un;
 
     CEunder1 = inv_exp_ovun2n * 
         ( p_ovun5 * p_ovun6 * exp_ovun6 * inv_exp_ovun8 +
@@ -213,7 +218,6 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms,
     CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * 
         p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
 
-
     /* forces */
     workspace->CdDelta[i] += CEover3;   // OvCoor - 2nd term
     workspace->CdDelta[i] += CEunder3;  // UnCoor - 1st term
@@ -223,14 +227,14 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms,
     Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor 1st
 #endif
 
-    for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) {
+    for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj )
+    {
         pbond = &(bonds->select.bond_list[pj]);
         j = pbond->nbr;
         bo_ij = &(pbond->bo_data);
-        twbp  = &(tbp[ index_tbp (my_atoms[i].type, my_atoms[pbond->nbr].type, 
+        twbp  = &(tbp[ index_tbp(my_atoms[i].type, my_atoms[pbond->nbr].type, 
                     num_atom_types) ]);
 
-
         bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s;// OvCoor-1st 
         //workspace->CdDelta[j] += CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
         pbond->ae_CdDelta += CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
@@ -240,7 +244,6 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms,
         bo_ij->Cdbopi2 += CEover4 * 
             (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);  // OvCoor-3b
 
-
         //workspace->CdDelta[j] += CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) *
         pbond->ae_CdDelta += CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) *
             (bo_ij->BO_pi + bo_ij->BO_pi2);   // UnCoor - 2a
@@ -249,7 +252,6 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms,
         bo_ij->Cdbopi2 += CEunder4 * 
             (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);  // UnCoor-2b
 
-
 #ifdef TEST_ENERGY
         /*      fprintf( out_control->eov, "%6d%12.6f\n", 
               workspace->reverse_map[j], 
@@ -319,30 +321,35 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms,
     //}
 }
 
-CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess ( reax_list p_bonds, 
+/* Kernel, one thread per atom i < n: for every bond pj of i, adds the ae_CdDelta of the bond entry at index sym_index to workspace->CdDelta[i]. */
+CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess( reax_list p_bonds, 
         storage p_workspace, int n )
 {
-    int i,pj;
-    bond_data *pbond, *sbond;
+    int i, pj;
+    bond_data *sbond;
+//    bond_data *pbond;
     bond_data *sym_index_bond;
-
-    reax_list *bonds = &p_bonds;
-    storage *workspace = &p_workspace;
+    reax_list *bonds;
+    storage *workspace;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= n) return;
 
-    for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){
+    if ( i >= n ) /* threads past the last atom have nothing to do */
+    {
+        return;
+    }
 
-        /*
-           pbond = &(bonds->select.bond_list[pj]);
-           dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] );
-           workspace->CdDelta [i] += dbond_index_bond->ae_CdDelta;
-         */
+    bonds = &p_bonds;
+    workspace = &p_workspace;
 
-        sbond = &(bonds->select.bond_list [pj]);
-        sym_index_bond = &( bonds->select.bond_list[ sbond->sym_index ]); 
-        workspace->CdDelta [i] += sym_index_bond->ae_CdDelta;
+    for ( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj )
+    {
+//        pbond = &(bonds->select.bond_list[pj]);
+//        dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] );
+//        workspace->CdDelta[i] += dbond_index_bond->ae_CdDelta;
 
+        sbond = &(bonds->select.bond_list[pj]);
+        sym_index_bond = &( bonds->select.bond_list[ sbond->sym_index ]); 
+        workspace->CdDelta[i] += sym_index_bond->ae_CdDelta; /* only CdDelta[i] is written by thread i */
     }
 }
diff --git a/PG-PuReMD/src/cuda/cuda_multi_body.h b/PG-PuReMD/src/cuda/cuda_multi_body.h
new file mode 100644
index 0000000000000000000000000000000000000000..06014b3ae777aff567e7eda0c5b6bb6584074eef
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_multi_body.h
@@ -0,0 +1,35 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_MULTI_BODY_H_
+#define __CUDA_MULTI_BODY_H_
+
+#include "../reax_types.h"
+
+/* Kernel over atoms: adds each atom's e_lp, e_ov and e_un energy terms to the three trailing real arrays (data_elp, data_eov, data_eun in the definition). */
+CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *, global_parameters,
+        single_body_parameters *, two_body_parameters *, storage,
+        reax_list, int, int, real *, real *, real *);
+/* Kernel over atoms: adds to CdDelta[i] the ae_CdDelta values reached through sym_index for each bond of atom i. */
+CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess( reax_list, storage, int );
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1ba30fa08aed74a20517d3716f2870aa953e0b1c
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu
@@ -0,0 +1,659 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "cuda_neighbors.h"
+
+#include "cuda_list.h"
+#include "cuda_utils.h"
+#include "cuda_reduction.h"
+
+#include "../index_utils.h"
+#include "../tool_box.h"
+#include "../vector.h"
+
+/* Squared distance between cp and x, summed only over the components i for which cp[i] > NEG_INF. */
+CUDA_DEVICE real Dev_DistSqr_to_Special_Point( rvec cp, rvec x ) 
+{
+    int  i;  
+    real d_sqr = 0.0;
+
+    for( i = 0; i < 3; ++i )
+    {
+        if( cp[i] > NEG_INF ) /* components not above NEG_INF contribute nothing to the sum */
+        {
+            d_sqr += SQR( cp[i] - x[i] );
+        }
+    }
+
+    return d_sqr;
+}
+
+
+/* Generate far neighbor lists, one thread per atom l < N: scan the grid cells neighboring l's cell, store every atom within the cutoff starting at Dev_Start_Index(l), then set the end index; the far_nbrs argument is not read. */
+CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms, 
+        simulation_box my_ext_box, grid g, reax_list far_nbrs_list, int n, int N,
+        int *far_nbrs, int *max_far_nbrs, int *realloc_far_nbrs )
+{
+    int i, j, k, l, m, itr, num_far, my_num_far;
+    real d, cutoff;
+    ivec c, nbrs_x;
+    rvec dvec;
+    far_neighbor_data *nbr_data;
+    reax_atom *atom1, *atom2;
+
+    l = blockIdx.x * blockDim.x  + threadIdx.x;
+
+    if ( l >= N )
+    {
+        return;
+    }
+
+    atom1 = &( my_atoms[l] );
+    num_far = Dev_Start_Index( l, &far_nbrs_list ); /* running write position inside the list */
+
+    /* compute the grid cell of the atom: atoms l < n are clamped to [native_str, native_end - 1], all others to [0, ncells - 1] */
+    if ( l < n )
+    {
+        for ( i = 0; i < 3; i++ )
+        {
+            c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);   
+            if ( c[i] >= g.native_end[i] )
+            {
+                c[i] = g.native_end[i] - 1;
+            }
+            else if ( c[i] < g.native_str[i] )
+            {
+                c[i] = g.native_str[i];
+            }
+        }
+    }
+    else
+    {
+        for ( i = 0; i < 3; i++ )
+        {
+            c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
+            if ( c[i] < 0 )
+            {
+                c[i] = 0;
+            }
+            else if ( c[i] >= g.ncells[i] )
+            {
+                c[i] = g.ncells[i] - 1;
+            }
+        }
+    }
+
+    i = c[0];
+    j = c[1];
+    k = c[2];
+
+    cutoff = SQR( g.cutoff[index_grid_3d(i, j, k, &g)] ); /* pass 1 compares squared distances against the own cell's cutoff */
+
+    /* pass 1: neighbor cells whose str is >= the own cell's str; keeps pairs with m > l */
+    itr = 0;
+    while ( g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)][0] >= 0 ) /* the scan stops at the first entry with a negative first component */
+    { 
+        ivec_Copy( nbrs_x, g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)] );
+
+        /* if neighboring grid cell is further in the "positive" direction AND within cutoff */
+        if ( g.str[index_grid_3d(i, j, k, &g)] <= g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
+                Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs(i, j, k, itr, &g)], atom1->x) <= cutoff )
+        {
+            /* pick up another atom from the neighbor cell */
+            for ( m = g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
+                    m < g.end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
+            {
+                /* prevent recounting same pairs within a gcell */
+                if ( l < m )
+                {
+                    atom2 = &(my_atoms[m]);
+                    dvec[0] = atom2->x[0] - atom1->x[0];
+                    dvec[1] = atom2->x[1] - atom1->x[1];
+                    dvec[2] = atom2->x[2] - atom1->x[2];
+                    d = rvec_Norm_Sqr( dvec );
+
+                    if ( d <= cutoff )
+                    { 
+                        /* commit far neighbor to list */
+                        nbr_data = &(far_nbrs_list.select.far_nbr_list[num_far]);
+                        nbr_data->nbr = m;
+                        nbr_data->d = SQRT( d );
+                        rvec_Copy( nbr_data->dvec, dvec );
+                        ivec_ScaledSum( nbr_data->rel_box, 1,
+                                g.rel_box[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
+                                -1, g.rel_box[index_grid_3d(i, j, k, &g)] );
+
+                        ++num_far;
+                    }
+                }
+            }
+        }
+
+        ++itr;
+    }   
+
+    /* pass 2: neighbor cells whose str is <= the own cell's str; keeps pairs with m < l, using the neighbor cell's cutoff */
+    itr = 0;
+    while ( g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)][0] >= 0 )
+    { 
+        ivec_Copy( nbrs_x, g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)] );
+        cutoff = SQR( g.cutoff[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] );
+
+        /* if neighboring grid cell is further in the "negative" direction AND within cutoff */
+        if ( g.str[index_grid_3d(i, j, k, &g)] >= g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
+                Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs(i, j, k, itr, &g)], atom1->x) <= cutoff )
+        {
+            /* pick up another atom from the neighbor cell */
+            for ( m = g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
+                    m < g.end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
+            {
+                /* prevent recounting same pairs within a gcell */
+                if ( l > m )
+                {
+                    atom2 = &(my_atoms[m]);
+                    dvec[0] = atom1->x[0] - atom2->x[0];
+                    dvec[1] = atom1->x[1] - atom2->x[1];
+                    dvec[2] = atom1->x[2] - atom2->x[2];
+                    d = rvec_Norm_Sqr( dvec );
+
+                    if ( d <= cutoff )
+                    {
+                        /* commit far neighbor to list */
+                        nbr_data = &(far_nbrs_list.select.far_nbr_list[num_far]);
+                        nbr_data->nbr = m;
+                        nbr_data->d = SQRT( d );
+                        rvec_Copy( nbr_data->dvec, dvec );
+                        ivec_ScaledSum( nbr_data->rel_box, 1,
+                                g.rel_box[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
+                                -1, g.rel_box[index_grid_3d(i, j, k, &g)] );
+
+                        ++num_far;
+                    }
+                }   
+            }
+        }
+
+        ++itr;
+    }   
+
+    Dev_Set_End_Index( l, num_far, &far_nbrs_list );
+
+    /* reallocation check: raise *realloc_far_nbrs when this atom stored more neighbors than max_far_nbrs[l] */
+    my_num_far = num_far - Dev_Start_Index( l, &far_nbrs_list );
+    if ( my_num_far > max_far_nbrs[l] )
+    {
+        *realloc_far_nbrs = TRUE;
+    }
+}
+
+
+//CUDA_GLOBAL void __launch_bounds__ (1024) k_mt_generate_neighbor_lists( reax_atom *my_atoms, 
+CUDA_GLOBAL void k_mt_generate_neighbor_lists( reax_atom *my_atoms, 
+        simulation_box my_ext_box, grid g, reax_list far_nbrs, int n, int N )
+{
+    extern __shared__ int __nbr[];
+    bool nbrgen;
+    int __THREADS_PER_ATOM__, thread_id, group_id, lane_id, my_bucket;
+    int *tnbr, *nbrssofar;
+    int max, leader, loopcount, iterations;
+    int i, j, k, l, m, itr, num_far, ll;
+    real d, cutoff, cutoff_ji;
+    ivec c, nbrs_x;
+    rvec dvec;
+    far_neighbor_data *nbr_data, *my_start;
+    reax_atom *atom1, *atom2;
+
+    __THREADS_PER_ATOM__ = NB_KER_THREADS_PER_ATOM;
+    thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    group_id = thread_id / __THREADS_PER_ATOM__;
+
+    if ( group_id >= N )
+    {
+        return;
+    }
+
+    lane_id = thread_id & (__THREADS_PER_ATOM__ - 1); 
+    my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
+    tnbr = __nbr;
+    nbrssofar = __nbr + blockDim.x;
+    l = group_id;
+    atom1 = &(my_atoms[l]);
+    num_far = Dev_Start_Index( l, &far_nbrs );
+    my_start = &( far_nbrs.select.far_nbr_list[num_far] );
+
+    //get the coordinates of the atom and 
+    //compute the grid cell
+    if ( l < n )
+    {
+        for ( i = 0; i < 3; i++ )
+        {
+            c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
+            if ( c[i] >= g.native_end[i] )
+            {
+                c[i] = g.native_end[i] - 1;
+            }
+            else if ( c[i] < g.native_str[i] )
+            {
+                c[i] = g.native_str[i];
+            }
+        }
+    }
+    else
+    {
+        for ( i = 0; i < 3; i++ )
+        {
+            c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
+            if ( c[i] < 0 )
+            {
+                c[i] = 0;
+            }
+            else if ( c[i] >= g.ncells[i] )
+            {
+                c[i] = g.ncells[i] - 1;
+            }
+        }
+    }
+
+    i = c[0];
+    j = c[1];
+    k = c[2];
+
+    tnbr[threadIdx.x] = 0;
+    if ( lane_id == 0 )
+    {
+        nbrssofar[my_bucket] = 0;
+    }
+    __syncthreads( );
+
+    itr = 0;
+    while ( g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)][0] >= 0 )
+    { 
+        tnbr[threadIdx.x] = 0;
+        nbrgen = false;
+
+        ivec_Copy( nbrs_x, g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)] );
+
+        cutoff = SQR( g.cutoff[index_grid_3d(i, j, k, &g)] );
+        cutoff_ji = SQR( g.cutoff[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] );
+
+        if ( (g.str[index_grid_3d(i, j, k, &g)] <= g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] 
+                && Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs(i, j, k, itr, &g)], atom1->x) <= cutoff) 
+                || (g.str[index_grid_3d(i, j, k, &g)] >= g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] 
+                && Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs(i, j, k, itr, &g)], atom1->x) <= cutoff_ji) )
+        {
+            max = g.end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]
+                    - g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)];
+            tnbr[threadIdx.x] = 0;
+            nbrgen = false;
+            m = lane_id  + g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; //0-31
+            loopcount = max / __THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 0 : 1);
+            iterations = 0;
+
+            // pick up another atom from the neighbor cell
+            while ( iterations < loopcount )
+            {
+                tnbr[threadIdx.x] = 0;
+                nbrgen = false;
+
+                // prevent recounting same pairs within a gcell 
+                if ( l < m  && m < g.end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] )
+                {
+                    atom2 = &(my_atoms[m]);
+                    dvec[0] = atom2->x[0] - atom1->x[0];
+                    dvec[1] = atom2->x[1] - atom1->x[1];
+                    dvec[2] = atom2->x[2] - atom1->x[2];
+                    d = rvec_Norm_Sqr( dvec );
+
+                    if ( d <= cutoff )
+                    { 
+                        tnbr [threadIdx.x] = 1;
+                        nbrgen = true;
+                    }
+                }
+
+                if ( l > m  && m < g.end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] )
+                {
+                    atom2 = &(my_atoms[m]);
+                    dvec[0] = atom1->x[0] - atom2->x[0];
+                    dvec[1] = atom1->x[1] - atom2->x[1];
+                    dvec[2] = atom1->x[2] - atom2->x[2];
+                    d = rvec_Norm_Sqr( dvec );
+
+                    if ( d <= cutoff_ji )
+                    {
+                        tnbr [threadIdx.x] = 1;
+                        nbrgen = true;
+                    }
+                } 
+
+                //is neighbor generated
+                if ( nbrgen )
+                {
+                    //do leader selection here
+                    leader = -1;
+                    for ( ll = my_bucket *__THREADS_PER_ATOM__;
+                            ll < (my_bucket) * __THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; ll++ )
+                    {
+                        if ( tnbr[ll] )
+                        {
+                            leader = ll;
+                            break;
+                        }
+                    }
+
+                    //do the reduction;
+                    if ( threadIdx.x == leader )
+                    {
+                        for ( ll = 1; ll < __THREADS_PER_ATOM__; ll++ )
+                        {
+                            tnbr[my_bucket * __THREADS_PER_ATOM__ + ll]
+                                    += tnbr[my_bucket * __THREADS_PER_ATOM__ + (ll-1)];
+                        }
+                    }
+                }
+
+                if ( nbrgen )
+                {
+                    //got the indices
+                    nbr_data = my_start + nbrssofar[my_bucket] + tnbr[threadIdx.x] - 1;
+                    nbr_data->nbr = m;
+
+                    if ( l < m )
+                    {
+                        dvec[0] = atom2->x[0] - atom1->x[0];
+                        dvec[1] = atom2->x[1] - atom1->x[1];
+                        dvec[2] = atom2->x[2] - atom1->x[2];
+                        d = rvec_Norm_Sqr( dvec );
+                        nbr_data->d = SQRT( d );
+                        rvec_Copy( nbr_data->dvec, dvec );
+                        ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
+                                -1, g.rel_box[index_grid_3d(i, j, k, &g)] );
+                    } 
+                    else
+                    {
+                        dvec[0] = atom1->x[0] - atom2->x[0];
+                        dvec[1] = atom1->x[1] - atom2->x[1];
+                        dvec[2] = atom1->x[2] - atom2->x[2];
+                        d = rvec_Norm_Sqr( dvec );
+                        nbr_data->d = SQRT( d );
+                        rvec_Copy( nbr_data->dvec, dvec );
+                        /*
+                           CHANGE ORIGINAL
+                           This is a bug in the original code 
+                        ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
+                                -1, g.rel_box[index_grid_3d( i, j, k, &g)] );
+                         */
+                        ivec_ScaledSum( nbr_data->rel_box, -1, g.rel_box[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
+                                1, g.rel_box[index_grid_3d(i, j, k, &g)] );
+                    }
+
+                    if ( threadIdx.x == leader )
+                    {
+                        nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)];
+                    }
+                }
+
+                m += __THREADS_PER_ATOM__;
+                iterations++;
+
+                //cleanup
+                nbrgen = false;
+                tnbr[threadIdx.x] = 0;
+            }
+        }
+        ++itr;
+    }
+
+    if ( lane_id == 0 )
+    {
+        Dev_Set_End_Index( l, num_far + nbrssofar[my_bucket], &far_nbrs );
+    }
+}
+
+/* Generate the far neighbor list on the GPU (one thread per atom); returns SUCCESS, or FAILURE if the device reallocation flag was raised */
+int Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, 
+        storage *workspace, reax_list **lists )
+{
+    int blocks, ret, ret_far_nbr;
+#if defined(LOG_PERFORMANCE)
+    real t_start = 0, t_elapsed = 0;
+
+    if ( system->my_rank == MASTER_NODE )
+    {
+        t_start = Get_Time( );
+    }
+#endif
+    /* NOTE: the workspace and lists arguments are not referenced below; the globals dev_workspace and dev_lists are used instead */
+    /* reset reallocation flag on device */
+    /* careful: this wrapper around cudaMemset(...) performs a byte-wide assignment
+     * to the provided literal */
+    cuda_memset( system->d_realloc_far_nbrs, FALSE, sizeof(int), 
+            "Cuda_Generate_Neighbor_Lists::d_realloc_far_nbrs" );
+
+    /* one thread per atom implementation */
+    blocks = (system->N / NBRS_BLOCK_SIZE) +
+        ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
+
+    k_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
+          *(*dev_lists + FAR_NBRS), system->n, system->N,
+          system->d_far_nbrs, system->d_max_far_nbrs, system->d_realloc_far_nbrs );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+
+    /* multiple threads per atom implementation */
+//    blocks = ((system->N * NB_KER_THREADS_PER_ATOM) / NBRS_BLOCK_SIZE) + 
+//        (((system->N * NB_KER_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
+//    k_mt_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE, 
+//        //sizeof(int) * (NBRS_BLOCK_SIZE + NBRS_BLOCK_SIZE / NB_KER_THREADS_PER_ATOM) >>>
+//        sizeof(int) * 2 * NBRS_BLOCK_SIZE >>>
+//            ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
+//              *(*dev_lists + FAR_NBRS), system->n, system->N );
+//    cudaDeviceSynchronize( );
+//    cudaCheckError( );
+
+    /* check reallocation flag on device */
+    copy_host_device( &ret_far_nbr, system->d_realloc_far_nbrs, sizeof(int), 
+            cudaMemcpyDeviceToHost, "Cuda_Generate_Neighbor_Lists::d_realloc_far_nbrs" );
+    /* the flag was handed to the kernel above; any value other than FALSE is reported as FAILURE and kept for the caller */
+    ret = (ret_far_nbr == FALSE) ? SUCCESS : FAILURE;
+    dev_workspace->realloc.far_nbrs = ret_far_nbr;
+
+#if defined(LOG_PERFORMANCE)
+    if( system->my_rank == MASTER_NODE )
+    {
+        t_elapsed = Get_Timing_Info( t_start );
+        data->timing.nbrs += t_elapsed;
+    }
+#endif
+
+#if defined(DEBUG_FOCUS)  
+    fprintf( stderr, "p%d @ step%d: nbrs done\n", 
+            system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    return ret;
+}
+
+
+/* Estimate the number of far neighbors per atom (GPU): one thread per index l < total_cap, writing far_nbrs[l] and max_far_nbrs[l] */
+CUDA_GLOBAL void k_estimate_neighbors( reax_atom *my_atoms, 
+        simulation_box my_ext_box, grid g, int n, int N, int total_cap,
+        int *far_nbrs, int *max_far_nbrs )
+{
+    int i, j, k, l, m, itr, num_far;
+    real d, cutoff;
+    ivec c, nbrs_x;
+    rvec dvec;
+    reax_atom *atom1, *atom2;
+    /* l: global thread index, used both as the atom index and as the output slot */
+    l = blockIdx.x * blockDim.x  + threadIdx.x;
+    /* threads with l >= total_cap write nothing */
+    if ( l >= total_cap )
+    {
+        return;
+    }
+    /* indices in [N, total_cap) skip the counting and take the else branch at the bottom */
+    if ( l < N )
+    {
+        num_far = 0;
+        atom1 = &(my_atoms[l]);
+
+        /* get the coordinates of the atom and compute the grid cell
+         * if atom is locally owned by processor AND not ghost atom */
+        if ( l < n )
+        {
+            for ( i = 0; i < 3; i++ )
+            {
+                c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);   
+                if ( c[i] >= g.native_end[i] )
+                {
+                    c[i] = g.native_end[i] - 1;
+                }
+                else if ( c[i] < g.native_str[i] )
+                {
+                    c[i] = g.native_str[i];
+                }
+            }
+        }
+        /* same as above, but for ghost atoms */
+        else
+        {
+            for ( i = 0; i < 3; i++ )
+            {
+                c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
+                if ( c[i] < 0 )
+                {
+                    c[i] = 0;
+                }
+                else if ( c[i] >= g.ncells[i] )
+                {
+                    c[i] = g.ncells[i] - 1;
+                }
+            }
+        }
+        /* (i, j, k): clamped grid cell of atom l; i is reused here after serving as the loop counter above */
+        i = c[0];
+        j = c[1];
+        k = c[2];
+        /* squared cutoff of the atom's own cell, used throughout the first pass */
+        cutoff = SQR( g.cutoff[ index_grid_3d(i, j, k, &g) ] );
+        /* pass 1: count partners m with l < m; the neighbor cell list ends at the first entry whose [0] component is negative */
+        itr = 0;
+        while ( g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)][0] >= 0 )
+        { 
+            ivec_Copy( nbrs_x, g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)] );
+            /* cell filter on g.nbrs_cp (presumably the squared distance from the atom to the neighbor cell's closest point -- confirm) */
+            if ( //(g.str[index_grid_3d(i, j, k, &g)] <= g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) &&  
+                    Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs(i, j, k, itr, &g)], atom1->x) <= cutoff ) 
+            {
+                /* pick up another atom from the neighbor cell */
+                for ( m = g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
+                        m < g.end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
+                {
+                    /* prevent recounting same pairs within a gcell */
+                    if ( l < m )
+                    {
+                        atom2 = &(my_atoms[m]);
+                        dvec[0] = atom2->x[0] - atom1->x[0];
+                        dvec[1] = atom2->x[1] - atom1->x[1];
+                        dvec[2] = atom2->x[2] - atom1->x[2];
+                        d = rvec_Norm_Sqr( dvec );
+
+                        if( d <= cutoff )
+                        { 
+                            num_far++;
+                        }
+                    }   
+                }
+            }
+            ++itr;
+
+        }   
+        /* pass 2: count partners m with l > m, using the neighbor cell's cutoff and only cells whose g.str is <= the own cell's g.str */
+        itr = 0;
+        while ( g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)][0] >= 0 )
+        {
+            ivec_Copy( nbrs_x, g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)] );
+            cutoff = SQR( g.cutoff[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] );
+
+            if ( g.str[index_grid_3d(i, j, k, &g)] >= g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
+                    Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs(i, j, k, itr, &g)],atom1->x) <= cutoff ) 
+            {
+                /* pick up another atom from the neighbor cell */
+                for ( m = g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
+                        m < g.end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
+                {
+                    /* prevent recounting same pairs within a gcell */
+                    if ( l > m )
+                    {
+                        atom2 = &(my_atoms[m]);
+                        dvec[0] = atom2->x[0] - atom1->x[0];
+                        dvec[1] = atom2->x[1] - atom1->x[1];
+                        dvec[2] = atom2->x[2] - atom1->x[2];
+                        d = rvec_Norm_Sqr( dvec );
+
+                        if ( d <= cutoff )
+                        { 
+                            num_far++;
+                        }
+                    }   
+                }
+            }
+            ++itr;
+        }   
+    }
+    else
+    {
+        /* used to trigger assignment of max_far_nbrs below */
+        num_far = MIN_NBRS;
+    }
+    /* store the raw count and a padded per-atom limit: num_far * SAFE_ZONE truncated to int, but never below MIN_NBRS */
+    far_nbrs[l] = num_far;
+    max_far_nbrs[l] = MAX( (int)(num_far * SAFE_ZONE), MIN_NBRS );
+}
+
+
+/* Estimate the number of far neighbors for each atom
+ *
+ * system: atomic system info; the kernel fills system->d_far_nbrs and
+ *  system->d_max_far_nbrs for all total_cap entries
+ * returns: nothing; the reduced total is copied into system->total_far_nbrs */
+void Cuda_Estimate_Neighbors( reax_system *system )
+{
+    int blocks;
+    /* one thread per entry of the total_cap-sized arrays, rounded up to whole blocks */
+    blocks = system->total_cap / DEF_BLOCK_SIZE + 
+        ((system->total_cap % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_estimate_neighbors <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
+          system->n, system->N, system->total_cap,
+          system->d_far_nbrs, system->d_max_far_nbrs );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+    /* reduce the padded per-atom limits (not the raw counts) on the device, then fetch the single result */
+    Cuda_Reduction_Sum( system->d_max_far_nbrs, system->d_total_far_nbrs,
+            system->total_cap );
+    copy_host_device( &(system->total_far_nbrs), system->d_total_far_nbrs, sizeof(int), 
+            cudaMemcpyDeviceToHost, "Cuda_Estimate_Neighbors::d_total_far_nbrs" );
+}
diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.h b/PG-PuReMD/src/cuda/cuda_neighbors.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7bfc4b96f4a015e53840aa3d4ba8b62d91969cc
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.h
@@ -0,0 +1,23 @@
+
+#ifndef __CUDA_NEIGHBORS_H__
+#define __CUDA_NEIGHBORS_H__
+
+#include "../reax_types.h"
+
+/* host-side entry points for far neighbor handling on the GPU */
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* generate the far neighbor list; returns SUCCESS, or FAILURE when the device reallocation flag was raised */
+int Cuda_Generate_Neighbor_Lists( reax_system *, simulation_data *, storage *, reax_list ** );
+/* estimate per-atom far neighbor counts and store the total of the padded limits in the system's total_far_nbrs */
+void Cuda_Estimate_Neighbors( reax_system * );
+/* NOTE(review): defined outside the part of the patch reviewed here -- presumably sets up the far neighbor list indices; confirm */
+void Cuda_Init_Neighbor_Indices( reax_system * );
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
new file mode 100644
index 0000000000000000000000000000000000000000..473ef77e6c1259eed6e7fd204fc79dcb36784b6d
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
@@ -0,0 +1,634 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "cuda_nonbonded.h"
+
+#include "cuda_list.h"
+#include "cuda_utils.h"
+#include "cuda_reduction.h"
+#include "cuda_shuffle.h"
+
+#include "../index_utils.h"
+#include "../vector.h"
+
+
+/* vdW and Coulomb energies/forces from the far neighbor list (GPU); VDW_KER_THREADS_PER_ATOM lanes share the neighbors of one atom */
+CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, 
+        two_body_parameters *tbp, global_parameters gp, control_params *control, 
+        storage p_workspace, reax_list p_far_nbrs, int n, int N, int num_atom_types, 
+        real *data_e_vdW, real *data_e_ele, rvec *data_ext_press )
+{
+#if defined(__SM_35__)
+    real sh_vdw;
+    real sh_ele;
+    rvec sh_force;
+#else
+    extern __shared__ real _vdw[];
+    extern __shared__ real _ele[];
+    extern __shared__ rvec _force[];
+    real *sh_vdw;
+    real *sh_ele;
+    rvec *sh_force;
+#endif
+    int i, j, pj, natoms;
+    int start_i, end_i, orig_i, orig_j;
+    real p_vdW1, p_vdW1i;
+    real powr_vdW1, powgi_vdW1;
+    real tmp, r_ij, fn13, exp1, exp2;
+    real Tap, dTap, dfn13, CEvd, CEclmb, de_core;
+    real dr3gamij_1, dr3gamij_3;
+    real e_ele, e_vdW, e_core;
+    rvec temp, ext_press;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    reax_list *far_nbrs;
+    storage *workspace = &( p_workspace );
+    int thread_id;
+    int warpid;
+    int laneid; 
+    /* warpid: atom index served by this group of lanes; laneid: position in the group (mask form assumes a power-of-two group size) */
+    thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    warpid = thread_id / VDW_KER_THREADS_PER_ATOM;
+    laneid = thread_id & (VDW_KER_THREADS_PER_ATOM -1); 
+#if defined(__SM_35__)
+    sh_vdw = 0.0;
+    sh_ele = 0.0;
+    rvec_MakeZero ( sh_force );
+#else
+    sh_vdw = _vdw;
+    sh_ele = _vdw + blockDim.x;
+    sh_force = (rvec *)( _vdw + 2*blockDim.x);
+
+    sh_vdw[threadIdx.x] = 0.0;
+    sh_ele[threadIdx.x] = 0.0;
+    rvec_MakeZero ( sh_force [threadIdx.x] );
+#endif
+    //i = blockIdx.x * blockDim.x + threadIdx.x;
+    //if (i >= N) return;
+    i = warpid;
+    /* no early return for i >= N: every thread must still reach the reduction (and its __syncthreads) below */
+    if ( i < N )
+    {
+        natoms = n;
+        far_nbrs = &( p_far_nbrs );
+        p_vdW1 = gp.l[28];
+        p_vdW1i = 1.0 / p_vdW1;
+        e_core = 0;
+        e_vdW = 0;
+
+        data_e_vdW[i] = 0;
+        data_e_ele[i] = 0;
+
+        //for( i = 0; i < natoms; ++i ) {
+        start_i = Dev_Start_Index(i, far_nbrs);
+        end_i = Dev_End_Index(i, far_nbrs);
+        orig_i = my_atoms[i].orig_id;
+        //fprintf( stderr, "i:%d, start_i: %d, end_i: %d\n", i, start_i, end_i );
+        /* each lane starts at its own offset and strides over atom i's neighbors by the group size */
+        //for( pj = start_i; pj < end_i; ++pj )
+        pj = start_i + laneid;
+        while ( pj < end_i )
+        {
+
+            nbr_pj = &(far_nbrs->select.far_nbr_list[pj]);
+            j = nbr_pj->nbr;
+            orig_j  = my_atoms[j].orig_id;
+            /* pair filter: within nonb_cut, then selected by index order, index < natoms tests and orig_id order */
+            if( nbr_pj->d <= control->nonb_cut && 
+                    (((i < j) && (i < natoms) && (j < natoms || orig_i < orig_j))
+                     || ((i > j) && (i < natoms) && (j < natoms)) 
+                     || (i > j && i >= natoms && j < natoms && orig_j < orig_i)))
+            { // ji with j >= n
+                r_ij = nbr_pj->d;
+                twbp = &(tbp[ index_tbp (my_atoms[i].type, my_atoms[j].type, num_atom_types) ]);
+
+                /* Calculate Taper and its derivative */
+                // Tap = nbr_pj->Tap;   -- precomputed during compte_H
+                Tap = workspace->Tap[7] * r_ij + workspace->Tap[6];
+                Tap = Tap * r_ij + workspace->Tap[5];
+                Tap = Tap * r_ij + workspace->Tap[4];
+                Tap = Tap * r_ij + workspace->Tap[3];
+                Tap = Tap * r_ij + workspace->Tap[2];
+                Tap = Tap * r_ij + workspace->Tap[1];
+                Tap = Tap * r_ij + workspace->Tap[0];
+
+                dTap = 7 * workspace->Tap[7] * r_ij + 6 * workspace->Tap[6];
+                dTap = dTap * r_ij + 5*workspace->Tap[5];
+                dTap = dTap * r_ij + 4*workspace->Tap[4];
+                dTap = dTap * r_ij + 3*workspace->Tap[3];
+                dTap = dTap * r_ij + 2*workspace->Tap[2];
+                dTap += workspace->Tap[1] / r_ij;
+
+                /* shielding vdWaals Calculations */
+                if ( gp.vdw_type == 1 || gp.vdw_type == 3 )
+                {
+                    powr_vdW1 = POW( r_ij, p_vdW1 );
+                    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1 );
+
+                    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+                    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+
+                    e_vdW = twbp->D * (exp1 - 2.0 * exp2);      
+
+                    //data_e_vdW[i] += Tap * e_vdW;
+                    //data_e_vdW[i] += Tap * e_vdW / 2.0;
+#if defined(__SM_35__)
+                    sh_vdw += Tap * e_vdW / 2.0;
+#else
+                    sh_vdw[threadIdx.x] += Tap * e_vdW / 2.0;
+#endif
+
+                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
+                        POW( r_ij, p_vdW1 - 2.0 );
+
+                    CEvd = dTap * e_vdW - 
+                        Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+                }
+                /* no shielding */
+                else
+                {
+                    exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+
+                    e_vdW = twbp->D * (exp1 - 2.0 * exp2);
+
+                    //data_e_vdW[i] += Tap * e_vdW;
+                    //data_e_vdW[i] += Tap * e_vdW / 2.0;
+#if defined(__SM_35__)
+                    sh_vdw += Tap * e_vdW / 2.0;
+#else
+                    sh_vdw[threadIdx.x] += Tap * e_vdW / 2.0;
+#endif
+
+                    CEvd = dTap * e_vdW - 
+                        Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
+                }
+
+                /* inner wall */
+                if ( gp.vdw_type == 2 || gp.vdw_type == 3 )
+                {
+                    e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+
+                    //data_e_vdW[i] += Tap * e_core;
+                    //data_e_vdW[i] += Tap * e_core / 2.0;
+#if defined(__SM_35__)
+                    sh_vdw += Tap * e_core / 2.0;
+#else
+                    sh_vdw[ threadIdx.x ] += Tap * e_core / 2.0;
+#endif
+
+                    de_core = -(twbp->acore/twbp->rcore) * e_core;
+                    CEvd += dTap * e_core + Tap * de_core;
+                }
+
+                /*Coulomb Calculations*/
+                dr3gamij_1 = r_ij * r_ij * r_ij + twbp->gamma;
+                dr3gamij_3 = POW( dr3gamij_1, 1.0 / 3.0 );
+
+                tmp = Tap / dr3gamij_3;
+                //data_e_ele[i] += e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp;
+                e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp;
+                //data_e_ele[i] += e_ele;
+                //data_e_ele[i] += e_ele  / 2.0;
+#if defined(__SM_35__)
+                sh_ele += e_ele  / 2.0;
+#else
+                sh_ele[ threadIdx.x ] += e_ele  / 2.0;
+#endif
+
+                CEclmb = C_ele * my_atoms[i].q * my_atoms[j].q * 
+                    ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+
+                // fprintf( fout, "%5d %5d %10.6f %10.6f\n",
+                //   MIN( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ),
+                //   MAX( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ), 
+                //   CEvd, CEclmb );                  
+
+                if ( control->virial == 0 )
+                {
+                    if ( i < j ) 
+                    {
+                        //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
+#if defined (__SM_35__)
+                        rvec_ScaledAdd( sh_force, -(CEvd + CEclmb), nbr_pj->dvec );
+#else
+                        rvec_ScaledAdd( sh_force[ threadIdx.x ], -(CEvd + CEclmb), nbr_pj->dvec );
+#endif
+                    }
+                    else 
+                    {
+                        //rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec );
+#if defined (__SM_35__)
+                        rvec_ScaledAdd( sh_force , +(CEvd + CEclmb), nbr_pj->dvec );
+#else
+                        rvec_ScaledAdd( sh_force[ threadIdx.x ], +(CEvd + CEclmb), nbr_pj->dvec );
+#endif
+                        //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec );
+                    }
+                }
+                /* NPT, iNPT or sNPT */
+                else
+                {
+                    /* for pressure coupling, terms not related to bond order 
+                       derivatives are added directly into pressure vector/tensor */
+                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+                    /* NOTE(review): with several lanes per atom these direct updates of f[i], f[j] and data_ext_press[i] look unsynchronized -- confirm */
+                    rvec_ScaledAdd( workspace->f[i], -1., temp );
+                    rvec_Add( workspace->f[j], temp );
+
+                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                    rvec_Add( data_ext_press [i], ext_press );
+
+                    // fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)
+                    //   force(%f %f %f) ext_press (%12.6f %12.6f %12.6f)\n", 
+                    //   i, j, nbr_pj->rel_box[0], nbr_pj->rel_box[1], nbr_pj->rel_box[2],
+                    //   temp[0], temp[1], temp[2],
+                    //   data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+                }
+
+#ifdef TEST_ENERGY
+                // fprintf( out_control->evdw, 
+                // "%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f\n", 
+                // workspace->Tap[7],workspace->Tap[6],workspace->Tap[5],
+                // workspace->Tap[4],workspace->Tap[3],workspace->Tap[2], 
+                // workspace->Tap[1], Tap );
+                //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
+                fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n",
+                        system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
+                        r_ij, e_vdW, data->my_en.e_vdW );
+                //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
+                        system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
+                        r_ij, system->my_atoms[i].q, system->my_atoms[j].q, 
+                        e_ele, data->my_en.e_ele );
+#endif
+
+#ifdef TEST_FORCES
+                rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
+#endif
+            }
+
+            pj += VDW_KER_THREADS_PER_ATOM;
+
+        }
+        //  }
+    } // if i < N
+    /* combine the per-lane partial sums of each atom; lane 0 ends up holding the totals */
+#if defined( __SM_35__)
+    for ( int x = VDW_KER_THREADS_PER_ATOM >> 1; x >= 1; x/=2 )
+    {
+        sh_vdw += shfl( sh_vdw, x);
+        sh_ele += shfl( sh_ele, x );
+        sh_force[0] += shfl( sh_force[0], x );
+        sh_force[1] += shfl( sh_force[1], x );
+        sh_force[2] += shfl( sh_force[2], x );
+    }
+    /* publish once per atom; lane groups whose atom index is out of range (i >= N) must not touch the output arrays */
+    if ( laneid == 0 && i < N )
+    {
+        data_e_vdW[i] += sh_vdw;
+        data_e_ele[i] += sh_ele;
+        rvec_Add( workspace->f[i], sh_force );
+    }
+
+#else
+    /* NOTE: the fixed offsets 16, 8, 4, 2, 1 below assume VDW_KER_THREADS_PER_ATOM == 32 */
+    __syncthreads( );
+
+    if (laneid < 16)
+    {
+        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16];
+        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16];
+        rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 16] );
+    }
+    __syncthreads( );
+    if (laneid < 8)
+    {
+        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8];
+        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8];
+        rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 8] );
+    }
+    __syncthreads( );
+    if (laneid < 4)
+    {
+        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4];
+        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4];
+        rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 4] );
+    }
+    __syncthreads( );
+    if (laneid < 2)
+    {
+        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2];
+        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2];
+        rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 2] );
+    }
+    __syncthreads( );
+    if (laneid < 1)
+    {
+        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1];
+        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1];
+        rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 1] );
+    }
+    __syncthreads( );
+    if (laneid == 0 && i < N)
+    {
+        data_e_vdW[i] += sh_vdw[threadIdx.x];
+        data_e_ele[i] += sh_ele[threadIdx.x];
+        rvec_Add( workspace->f[i], sh_force[ threadIdx.x ] );
+    }
+#endif
+
+}
+
+/* Tabulated (lookup table) vdW and Coulomb energies/forces from the far neighbor list (GPU), one thread per atom index i < N */
+CUDA_GLOBAL void k_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, 
+        global_parameters gp, control_params *control, 
+        storage p_workspace, reax_list p_far_nbrs, 
+        LR_lookup_table *t_LR, int n, int N, int num_atom_types, 
+        int step, int prev_steps, int energy_update_freq, 
+        real *data_e_vdW, real *data_e_ele, rvec *data_ext_press )
+{
+    int i, j, pj, r, natoms, steps, update_freq, update_energies;
+    int type_i, type_j, tmin, tmax;
+    int start_i, end_i, orig_i, orig_j;
+    real r_ij, base, dif;
+    real e_vdW, e_ele;
+    real CEvd, CEclmb;
+    rvec temp, ext_press;
+    far_neighbor_data *nbr_pj;
+    reax_list *far_nbrs;
+    LR_lookup_table *t;
+    storage *workspace;
+    /* i: atom index handled by this thread */
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= N )
+    {
+        return;
+    }
+    /* energies are accumulated only when update_freq > 0 and (step - prev_steps) is a multiple of it */
+    workspace = &( p_workspace );
+    natoms = n;
+    far_nbrs = &( p_far_nbrs );
+    steps = step - prev_steps;
+    update_freq = energy_update_freq;
+    update_energies = update_freq > 0 && steps % update_freq == 0;
+    e_ele = e_vdW = 0;
+    data_e_vdW[i] = 0;
+    data_e_ele[i] = 0;
+
+    //for( i = 0; i < natoms; ++i ) {
+    type_i = my_atoms[i].type;
+    start_i = Dev_Start_Index(i,far_nbrs);
+    end_i = Dev_End_Index(i,far_nbrs);
+    orig_i = my_atoms[i].orig_id;
+    /* walk every far neighbor entry of atom i */
+    for( pj = start_i; pj < end_i; ++pj )
+    {
+        nbr_pj = &(far_nbrs->select.far_nbr_list[pj]);
+        j = nbr_pj->nbr;
+        orig_j  = my_atoms[j].orig_id;
+        /* pair filter: within nonb_cut, then selected by index order, index < natoms tests and orig_id order */
+        //if( nbr_pj->d <= control->nonb_cut && (j < natoms || orig_i < orig_j) ) {
+        if( nbr_pj->d <= control->nonb_cut && 
+                (((i < j) && (i < natoms) && (j < natoms || orig_i < orig_j))
+                 || ((i > j) && (i < natoms) && (j < natoms)) 
+                 || (i > j && i >= natoms && j < natoms && orig_j < orig_i)))
+        { // ji with j >= n
+            j = nbr_pj->nbr; /* same value as assigned above */
+            type_j = my_atoms[j].type;
+            r_ij   = nbr_pj->d;
+            tmin  = MIN( type_i, type_j );
+            tmax  = MAX( type_i, type_j );
+            /* the table is looked up with the ordered type pair (tmin, tmax) */
+            t = &( t_LR[ index_lr(tmin, tmax, num_atom_types) ]);    
+
+            //table = &( LR[type_i][type_j] ); 
+
+            /* Cubic Spline Interpolation: r is the table interval (raised to 1 if it comes out 0), dif = r_ij - (r+1)*dx */
+            r = (int)(r_ij * t->inv_dx);
+            if( r == 0 )
+            {
+                ++r;
+            }
+            base = (real)(r+1) * t->dx;
+            dif = r_ij - base;
+            //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif);
+            /* each quantity is the cubic ((d*dif + c)*dif + b)*dif + a of interval r; half of each pair energy is credited to atom i */
+            if ( update_energies )
+            {
+                e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
+                    t->vdW[r].a;
+
+                e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
+                    t->ele[r].a;
+                e_ele *= my_atoms[i].q * my_atoms[j].q;
+
+                //data_e_vdW[i] += e_vdW;
+                data_e_vdW[i] += e_vdW / 2.0;
+                //data_e_ele[i] += e_ele;
+                data_e_ele[i] += e_ele / 2.0;
+            }    
+            /* force coefficients are evaluated on every step, regardless of update_energies */
+            CEvd = ((t->CEvd[r].d * dif + t->CEvd[r].c) * dif + t->CEvd[r].b) * dif + 
+                t->CEvd[r].a;
+
+            CEclmb = ((t->CEclmb[r].d * dif + t->CEclmb[r].c) * dif + t->CEclmb[r].b) * dif + 
+                t->CEclmb[r].a;
+            CEclmb *= my_atoms[i].q * my_atoms[j].q;
+            /* without virial: only f[i] is updated by this thread, with the sign chosen by the order of i and j */
+            if( control->virial == 0 )
+            {
+                if ( i < j ) 
+                {
+                    rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
+                }
+                else 
+                {
+                    rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec );
+                }
+                //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
+                //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec );
+            }
+            /* NPT, iNPT or sNPT */
+            else
+            {
+                /* for pressure coupling, terms not related to bond order derivatives
+                   are added directly into pressure vector/tensor */
+                rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+
+                rvec_ScaledAdd( workspace->f[i], -1., temp );
+                rvec_Add( workspace->f[j], temp ); /* NOTE(review): f[j] belongs to another thread's atom; this update looks unsynchronized -- confirm */
+
+                rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                rvec_Add( data_ext_press [i], ext_press );
+            }
+
+#ifdef TEST_ENERGY
+            //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
+            fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n",
+                    system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
+                    r_ij, e_vdW, data->my_en.e_vdW );
+            //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+            fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
+                    system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
+                    r_ij, system->my_atoms[i].q, system->my_atoms[j].q, 
+                    e_ele, data->my_en.e_ele );
+#endif
+
+#ifdef TEST_FORCES
+            rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
+            rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
+            rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
+            rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
+#endif
+        }
+    }
+    //  }
+}
+
+/* Kernel, one thread per atom: stores atom i's polarization-energy term in data_e_pol[i]. */
+CUDA_GLOBAL void k_pol_energy( reax_atom *my_atoms, 
+        single_body_parameters *sbp, int n, real *data_e_pol )
+{
+    int i, type_i;
+    real q;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x; /* global thread index, used as the atom index */
+
+    if ( i >= n ) /* threads beyond the n atoms have nothing to do */
+    {
+        return;
+    }
+
+    q = my_atoms[i].q;
+    type_i = my_atoms[i].type; /* selects this atom's entry in the per-type table sbp */
+    /* KCALpMOL_to_EV * (chi * q + (eta / 2) * q^2), with chi and eta read from sbp[type_i] */
+    data_e_pol[i] = 
+        KCALpMOL_to_EV * (sbp[type_i].chi * q + 
+                (sbp[type_i].eta / 2.) * SQR(q));
+}
+
+/* Host wrapper: runs k_pol_energy over system->n atoms, then sums the per-atom terms into my_en.e_pol of data->d_simulation_data. */
+void Cuda_Compute_Polarization_Energy( reax_system *system, simulation_data *data )
+{
+    int blocks;
+    real *spad = (real *) scratch; /* scratch buffer viewed as an array of real */
+
+    cuda_memset( spad, 0, sizeof(real) * 2 * system->n, "pol_energy" ); /* clears 2*n reals; only the first n are used below */
+
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); /* ceil( n / DEF_BLOCK_SIZE ) */
+
+    k_pol_energy <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp, 
+          system->n, spad );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    /* sum spad[0..n) into e_pol; the destination is handed to Cuda_Reduction_Sum as its device result pointer */
+    Cuda_Reduction_Sum( spad,
+            &((simulation_data *)data->d_simulation_data)->my_en.e_pol,
+            system->n );
+}
+
+/* Host driver for the vdW/Coulomb kernels: launches the plain or tabulated kernel, then reduces energies and ext_press into data->d_simulation_data. */
+void Cuda_NonBonded_Energy( reax_system *system, control_params *control, 
+        storage *workspace, simulation_data *data,  reax_list **lists,
+        output_controls *out_control, bool isTabulated )
+{
+    int blocks, rblocks, update_energy;
+    int size = (2 * system->N + 2 * system->N ) * sizeof(real) +
+        2 * system->N * sizeof(rvec);
+    rvec *spad_rvec;
+    real *spad = (real *) scratch;
+    /* scratch layout: [0, 2N) reals for vdW | [2N, 4N) reals for ele | from 4N on: 2N rvecs (ext_press, then block partials) */
+    update_energy = (out_control->energy_update_freq > 0
+            && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
+    rblocks = system->N / DEF_BLOCK_SIZE + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); /* ceil( N / DEF_BLOCK_SIZE ) */
+    blocks = ((system->N * VDW_KER_THREADS_PER_ATOM) / DEF_BLOCK_SIZE) 
+        + (((system->N * VDW_KER_THREADS_PER_ATOM) % DEF_BLOCK_SIZE == 0) ? 0 : 1); /* VDW_KER_THREADS_PER_ATOM threads per atom */
+
+    cuda_memset( spad, 0, size, "Cuda_NonBonded_Energy::spad" ); /* tag names this function (was a copy-pasted "pol_energy") */
+
+    if ( !isTabulated )
+    {
+        k_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE, DEF_BLOCK_SIZE * (2 * sizeof(real) + sizeof(rvec)) >>>
+            ( system->d_my_atoms, system->reax_param.d_tbp, 
+              system->reax_param.d_gp, (control_params *)control->d_control_params, 
+              *(dev_workspace), *(*dev_lists + FAR_NBRS), 
+              system->n, system->N, system->reax_param.num_atom_types, 
+              spad, spad + 2 * system->N, (rvec *)(spad + 4 * system->N));
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+    }
+    else
+    {
+        k_tabulated_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->reax_param.d_gp, 
+              (control_params *)control->d_control_params, 
+              *(dev_workspace), *(*dev_lists + FAR_NBRS), 
+              d_LR, system->n, system->N,
+              system->reax_param.num_atom_types, 
+              data->step, data->prev_steps, 
+              out_control->energy_update_freq,
+              spad, spad + 2 * system->N, 
+              (rvec *)(spad + 4 * system->N));
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+    }
+
+    /* reduction for vdw (only on energy-update steps) */
+    if ( update_energy == TRUE )
+    {
+        Cuda_Reduction_Sum( spad,
+                &((simulation_data *)data->d_simulation_data)->my_en.e_vdW,
+                system->N );
+    }
+
+    /* reduction for ele (only on energy-update steps) */
+    if ( update_energy == TRUE )
+    {
+        Cuda_Reduction_Sum( spad + 2 * system->N,
+                &((simulation_data *)data->d_simulation_data)->my_en.e_ele,
+                system->N );
+    }
+
+    /* reduction for ext_press, stage 1: one partial rvec per block, written to spad_rvec + N */
+    spad_rvec = (rvec *) (spad + 4 * system->N);
+    k_reduction_rvec <<< rblocks, DEF_BLOCK_SIZE, sizeof(rvec) * DEF_BLOCK_SIZE >>>
+        ( spad_rvec, spad_rvec + system->N, system->N);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    /* stage 2: a single block combines the rblocks partials into my_ext_press */
+    k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof(rvec) * BLOCKS_POW_2_N>>>
+        ( spad_rvec + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, rblocks);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    if ( update_energy == TRUE )
+    {
+        Cuda_Compute_Polarization_Energy( system, data );
+    }
+}
diff --git a/PG-PuReMD/src/cuda_nonbonded.h b/PG-PuReMD/src/cuda/cuda_nonbonded.h
similarity index 79%
rename from PG-PuReMD/src/cuda_nonbonded.h
rename to PG-PuReMD/src/cuda/cuda_nonbonded.h
index 1c9916bfba8821ec353f6d1571ac24aa711348a4..238d49d748289da4152b70c5a1440cc8ffd611dd 100644
--- a/PG-PuReMD/src/cuda_nonbonded.h
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.h
@@ -19,15 +19,17 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __NONBONDED_H_
-#define __NONBONDED_H_
+#ifndef __CUDA_NONBONDED_H_
+#define __CUDA_NONBONDED_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 void Cuda_Compute_Polarization_Energy( reax_system *, simulation_data *);
-void Cuda_NonBonded_Energy ( reax_system *, control_params *,
-                             storage *, simulation_data *, reax_list **,
-                             output_controls *, bool );
+
+void Cuda_NonBonded_Energy( reax_system *, control_params *,
+        storage *, simulation_data *, reax_list **,
+        output_controls *, bool );
+
 
 #endif
diff --git a/PG-PuReMD/src/cuda_post_evolve.cu b/PG-PuReMD/src/cuda/cuda_post_evolve.cu
similarity index 70%
rename from PG-PuReMD/src/cuda_post_evolve.cu
rename to PG-PuReMD/src/cuda/cuda_post_evolve.cu
index b8008e85545742e8588060c112dcbb058238a185..828a0e4beff46f591b34b8727357a5ccb4ddf742 100644
--- a/PG-PuReMD/src/cuda_post_evolve.cu
+++ b/PG-PuReMD/src/cuda/cuda_post_evolve.cu
@@ -1,15 +1,21 @@
 
 #include "cuda_post_evolve.h"
-#include "reax_types.h"
-#include "vector.h"
+
 #include "cuda_utils.h"
 
-CUDA_GLOBAL void ker_post_evolve (reax_atom *my_atoms, 
-        simulation_data *data, int n)
+#include "../vector.h"
+
+
+CUDA_GLOBAL void ker_post_evolve( reax_atom *my_atoms, 
+        simulation_data *data, int n )
 {
     rvec diff, cross;
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= n) return;
+
+    if (i >= n)
+    {
+        return;
+    }
 
     //for( i = 0; i < system->n; i++ ) { 
     /* remove translational vel */
@@ -22,7 +28,8 @@ CUDA_GLOBAL void ker_post_evolve (reax_atom *my_atoms,
     //}  
 }
 
-void post_evolve_velocities (reax_system *system, simulation_data *data)
+
+void post_evolve_velocities( reax_system *system, simulation_data *data )
 {
     int blocks;
 
@@ -30,6 +37,6 @@ void post_evolve_velocities (reax_system *system, simulation_data *data)
         ((system->n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
     ker_post_evolve <<< blocks, DEF_BLOCK_SIZE >>>
         (system->d_my_atoms, (simulation_data *)data->d_simulation_data, system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 }
diff --git a/PG-PuReMD/src/cuda_post_evolve.h b/PG-PuReMD/src/cuda/cuda_post_evolve.h
similarity index 60%
rename from PG-PuReMD/src/cuda_post_evolve.h
rename to PG-PuReMD/src/cuda/cuda_post_evolve.h
index dcdcd50cadef4db2c0c403c604bb4ecf33acd56b..a1a0571a9a9825f613bd01477f881201c621afa8 100644
--- a/PG-PuReMD/src/cuda_post_evolve.h
+++ b/PG-PuReMD/src/cuda/cuda_post_evolve.h
@@ -2,16 +2,18 @@
 #ifndef __CUDA_POST_EVOLVE_H__
 #define __CUDA_POST_EVOLVE_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void post_evolve_velocities (reax_system *, simulation_data *);
+void post_evolve_velocities( reax_system *, simulation_data * );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_random.cu b/PG-PuReMD/src/cuda/cuda_random.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d0de37009d890c9e17afbb20b4dc114c872998bd
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_random.cu
@@ -0,0 +1,65 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "cuda_random.h"
+
+
+/* Intended contract: return a pseudo random number scaled so that
+   0 <= Cuda_Random(range) < range.
+   NOTE: the generator itself is not implemented yet (see TODO below);
+   this placeholder ignores range and always returns 0.0. */
+CUDA_DEVICE double Cuda_Random( double range )
+{
+    //TODO: use cuRAND
+//    return (random( ) * range) / 2147483647L;
+    return 0.0;
+}
+
+
+/* Intended to seed the pseudo random number generator with the current
+   time; meant to be called once at the beginning to initialize the system.
+   NOTE: currently a no-op, the cuRAND seeding below is commented out. */
+void Cuda_Randomize( )
+{
+    //TODO: use cuRAND
+//    curandState_t state;
+//
+//    curand_init( time(NULL), 0, 0, &state );
+}
+
+
+/* GRandom returns a random number with Gaussian distribution with the given mean
+   and standard deviation "sigma"; it rejects (v1, v2) pairs until 0 < rsq < 1 */
+CUDA_DEVICE double Cuda_GRandom( double mean, double sigma )
+{
+    double v1 = Cuda_Random(2.0) - 1.0;
+    double v2 = Cuda_Random(2.0) - 1.0;
+    double rsq = v1 * v1 + v2 * v2;
+    /* NOTE(review): Cuda_Random currently always returns 0.0, so rsq == 2.0 and this loop never exits */
+    while (rsq >= 1.0 || rsq == 0.0)
+    {
+        v1 = Cuda_Random(2.0) - 1.0;
+        v2 = Cuda_Random(2.0) - 1.0;
+        rsq = v1 * v1 + v2 * v2;
+    }
+
+    return mean + v1 * sigma * SQRT(-2.0 * LOG(rsq) / rsq);
+}
diff --git a/PG-PuReMD/src/cuda_torsion_angles.h b/PG-PuReMD/src/cuda/cuda_random.h
similarity index 57%
rename from PG-PuReMD/src/cuda_torsion_angles.h
rename to PG-PuReMD/src/cuda/cuda_random.h
index 235e91b0dfe3bc634491bdf9a484a8fe851c2013..388359c72fcfbead153c083fb2c2675531df84da 100644
--- a/PG-PuReMD/src/cuda_torsion_angles.h
+++ b/PG-PuReMD/src/cuda/cuda_random.h
@@ -19,24 +19,26 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __TORSION_ANGLES_H_
-#define __TORSION_ANGLES_H_
-
-#include "reax_types.h"
-#include "reax_types.h"
-
-CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *,
-                                      global_parameters ,
-                                      four_body_header *,
-                                      control_params *,
-                                      reax_list , reax_list ,
-                                      storage ,
-                                      int , int ,
-                                      real *, real *,
-                                      rvec *);
-
-CUDA_GLOBAL void Cuda_Torsion_Angles_PostProcess ( reax_atom *,
-        storage ,
-        reax_list , int );
+#ifndef __CUDA_RANDOM_H_
+#define __CUDA_RANDOM_H_
+
+#include "../reax_types.h"
+
+
+/* Intended contract: return a pseudo random number scaled so that
+   0 <= Cuda_Random(range) < range.
+   NOTE(review): the definition in cuda_random.cu is still a placeholder
+   that always returns 0.0 (cuRAND support is a TODO there). */
+CUDA_DEVICE double Cuda_Random( double );
+
+/* Intended to seed the pseudo random number generator with the current
+   time; call once at the beginning to initialize the system.
+   NOTE(review): the current definition is an empty placeholder. */
+void Cuda_Randomize( );
+
+/* GRandom returns a random number with Gaussian distribution with the
+   given mean and standard deviation "sigma" */
+CUDA_DEVICE double Cuda_GRandom( double, double );
+
 
 #endif
diff --git a/PG-PuReMD/src/reduction.cu b/PG-PuReMD/src/cuda/cuda_reduction.cu
similarity index 58%
rename from PG-PuReMD/src/reduction.cu
rename to PG-PuReMD/src/cuda/cuda_reduction.cu
index 676d41647ab6690b25864055e6ea07e623f0befe..01bd3c8199a76144703738188713190e02b35e9f 100644
--- a/PG-PuReMD/src/reduction.cu
+++ b/PG-PuReMD/src/cuda/cuda_reduction.cu
@@ -1,12 +1,183 @@
 
-#include "reduction.h"
-#include "vector.h"
+#include "cuda_reduction.h"
 
 #include "cuda_shuffle.h"
+#include "cuda_utils.h"
 
+#include "../vector.h"
 
-CUDA_GLOBAL void k_reduction(const real *input, real *per_block_results,
-        const size_t n)
+#include "../cub/cub/device/device_reduce.cuh"
+#include "../cub/cub/device/device_scan.cuh"
+
+
+//struct RvecSum
+//{
+//    template <typename T>
+//    __device__ __forceinline__
+//    T operator()(const T &a, const T &b) const
+//    {
+//        b[0] = a[0] + b[0];
+//        b[1] = a[1] + b[1];
+//        b[2] = a[2] + b[2];
+//        return b;
+//    }
+//};
+
+
+/* Perform a device-wide reduction (sum operation) over n ints
+ *
+ * d_array: device array to reduce (n elements)
+ * d_dest: device pointer to hold result of reduction */
+void Cuda_Reduction_Sum( int *d_array, int *d_dest, size_t n )
+{
+    void *d_temp_storage = NULL;
+    size_t temp_storage_bytes = 0;
+
+    /* determine temporary device storage requirements (NULL storage: size query only) */
+    cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
+            d_array, d_dest, n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    /* allocate temporary storage (allocated and released again on every call) */
+    cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE,
+            "Cuda_Reduction_Sum::temp_storage" );
+
+    /* run sum-reduction */
+    cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
+            d_array, d_dest, n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    /* deallocate temporary storage */
+    cuda_free( d_temp_storage, "Cuda_Reduction_Sum::temp_storage" );
+}
+
+
+/* Perform a device-wide reduction (sum operation) over n reals
+ *
+ * d_array: device array to reduce (n elements)
+ * d_dest: device pointer to hold result of reduction */
+void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
+{
+    void *d_temp_storage = NULL;
+    size_t temp_storage_bytes = 0;
+
+    /* determine temporary device storage requirements (NULL storage: size query only) */
+    cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
+            d_array, d_dest, n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    /* allocate temporary storage (allocated and released again on every call) */
+    cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE,
+            "Cuda_Reduction_Sum::temp_storage" );
+
+    /* run sum-reduction */
+    cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
+            d_array, d_dest, n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    /* deallocate temporary storage */
+    cuda_free( d_temp_storage, "Cuda_Reduction_Sum::temp_storage" );
+}
+
+
+///* Perform a device-wide reduction (sum operation)
+// *
+// * d_array: device array to reduce
+// * d_dest: device pointer to hold result of reduction */
+//void Cuda_Reduction_Sum( rvec *d_array, rvec *d_dest, size_t n )
+//{
+//    void *d_temp_storage = NULL;
+//    size_t temp_storage_bytes = 0;
+//    RvecSum sum_op;
+//    rvec init = {0.0, 0.0, 0.0};
+//
+//    /* determine temporary device storage requirements */
+//    cub::DeviceReduce::Reduce( d_temp_storage, temp_storage_bytes,
+//            d_array, d_dest, n, sum_op, init );
+//    cudaThreadSynchronize( );
+//    cudaCheckError( );
+//
+//    /* allocate temporary storage */
+//    cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE,
+//            "cub::reduce::temp_storage" );
+//
+//    /* run sum-reduction */
+//    cub::DeviceReduce::Reduce( d_temp_storage, temp_storage_bytes,
+//            d_array, d_dest, n, sum_op, init );
+//    cudaThreadSynchronize( );
+//    cudaCheckError( );
+//
+//    /* deallocate temporary storage */
+//    cuda_free( d_temp_storage, "cub::reduce::temp_storage" );
+//}
+
+
+/* Perform a device-wide reduction (max operation) over n ints
+ *
+ * d_array: device array to reduce (n elements)
+ * d_dest: device pointer to hold result of reduction */
+void Cuda_Reduction_Max( int *d_array, int *d_dest, size_t n )
+{
+    void *d_temp_storage = NULL;
+    size_t temp_storage_bytes = 0;
+
+    /* determine temporary device storage requirements (NULL storage: size query only) */
+    cub::DeviceReduce::Max( d_temp_storage, temp_storage_bytes,
+            d_array, d_dest, n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    /* allocate temporary storage (allocated and released again on every call) */
+    cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE,
+            "Cuda_Reduction_Max::temp_storage" );
+
+    /* run max-reduction */
+    cub::DeviceReduce::Max( d_temp_storage, temp_storage_bytes,
+            d_array, d_dest, n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    /* deallocate temporary storage */
+    cuda_free( d_temp_storage, "Cuda_Reduction_Max::temp_storage" );
+}
+
+
+/* Perform a device-wide scan (exclusive partial sum operation) over n ints
+ *
+ * d_src: device array to scan (n elements)
+ * d_dest: device array to hold result of scan */
+void Cuda_Scan_Excl_Sum( int *d_src, int *d_dest, size_t n )
+{
+    void *d_temp_storage = NULL;
+    size_t temp_storage_bytes = 0;
+
+    /* determine temporary device storage requirements (NULL storage: size query only) */
+    cub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes,
+            d_src, d_dest, n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    /* allocate temporary storage (allocated and released again on every call) */
+    cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE,
+            "Cuda_Scan_Excl_Sum::temp_storage" );
+
+    /* run exclusive prefix sum */
+    cub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes,
+            d_src, d_dest, n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    /* deallocate temporary storage */
+    cuda_free( d_temp_storage, "Cuda_Scan_Excl_Sum::temp_storage" );
+}
+
+
+CUDA_GLOBAL void k_reduction( const real *input, real *per_block_results,
+        const size_t n )
 {
 #if defined(__SM_35__)
     extern __shared__ real my_results[];
@@ -79,12 +250,12 @@ CUDA_GLOBAL void k_reduction(const real *input, real *per_block_results,
 }
 
 
-CUDA_GLOBAL void k_reduction_rvec(rvec *input, rvec *results, size_t n)
+CUDA_GLOBAL void k_reduction_rvec( rvec *input, rvec *results, size_t n )
 {
 #if defined(__SM_35__)
     extern __shared__ rvec my_rvec[];
     rvec sdata;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x, z, offset;
 
     rvec_MakeZero( sdata );
 
@@ -93,9 +264,9 @@ CUDA_GLOBAL void k_reduction_rvec(rvec *input, rvec *results, size_t n)
         rvec_Copy( sdata, input[i] );
     }
 
-    __syncthreads();
+    __syncthreads( );
 
-    for( int z = 16; z >=1; z/=2 )
+    for( z = 16; z >=1; z/=2 )
     {
         sdata[0] += shfl( sdata[0], z);
         sdata[1] += shfl( sdata[1], z);
@@ -107,16 +278,16 @@ CUDA_GLOBAL void k_reduction_rvec(rvec *input, rvec *results, size_t n)
         rvec_Copy( my_rvec[threadIdx.x >> 5] , sdata );
     }
 
-    __syncthreads ();
+    __syncthreads( );
 
-    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 )
+    for( offset = blockDim.x >> 6; offset > 0; offset >>= 1 )
     {
         if( threadIdx.x < offset )
         {
             rvec_Add( my_rvec[threadIdx.x], my_rvec[threadIdx.x + offset] );
         }
 
-        __syncthreads();
+        __syncthreads( );
     }
 
     if( threadIdx.x == 0 )
@@ -126,39 +297,39 @@ CUDA_GLOBAL void k_reduction_rvec(rvec *input, rvec *results, size_t n)
 
 #else
     extern __shared__ rvec svec_data[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x, offset;
     rvec x;
 
     rvec_MakeZero( x );
 
-    if(i < n)
+    if ( i < n )
     {
         rvec_Copy( x, input[i] );
     }
 
-    rvec_Copy(svec_data[threadIdx.x], x);
-    __syncthreads();
+    rvec_Copy( svec_data[threadIdx.x], x );
+    __syncthreads( );
 
-    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    for ( offset = blockDim.x / 2; offset > 0; offset >>= 1 )
     {
-        if(threadIdx.x < offset)
+        if ( threadIdx.x < offset )
         {
-            rvec_Add (svec_data[threadIdx.x], svec_data[threadIdx.x + offset]);
+            rvec_Add( svec_data[threadIdx.x], svec_data[threadIdx.x + offset] );
         }
 
-        __syncthreads();
+        __syncthreads( );
     }
 
-    if(threadIdx.x == 0)
+    if ( threadIdx.x == 0 )
     {
-        //rvec_Copy (results[blockIdx.x], svec_data[0]);
-        rvec_Add (results[blockIdx.x], svec_data[0]);
+        //rvec_Copy( results[blockIdx.x], svec_data[0] );
+        rvec_Add( results[blockIdx.x], svec_data[0] );
     }
 #endif
 }
 
 
-CUDA_GLOBAL void k_reduction_rvec2 (rvec2 *input, rvec2 *results, size_t n)
+CUDA_GLOBAL void k_reduction_rvec2( rvec2 *input, rvec2 *results, size_t n )
 {
 #if defined(__SM_35__)
     extern __shared__ rvec2 my_rvec2[];
@@ -168,27 +339,32 @@ CUDA_GLOBAL void k_reduction_rvec2 (rvec2 *input, rvec2 *results, size_t n)
     sdata[0] = 0.0;
     sdata[1] = 0.0;
 
-    if(i < n){
+    if ( i < n )
+    {
         sdata[0] = input[i][0];
         sdata[1] = input[i][1];
     }
 
     __syncthreads();
 
-    for(int z = 16; z >=1; z/=2){
+    for(int z = 16; z >=1; z/=2)
+    {
         sdata[0] += shfl ( sdata[0], z);
         sdata[1] += shfl ( sdata[1], z);
     }
 
-    if (threadIdx.x % 32 == 0){
+    if (threadIdx.x % 32 == 0)
+    {
         my_rvec2[threadIdx.x >> 5][0] = sdata[0];
         my_rvec2[threadIdx.x >> 5][1] = sdata[1];
     }
 
     __syncthreads ();
 
-    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
-        if(threadIdx.x < offset){
+    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
             my_rvec2[threadIdx.x][0] += my_rvec2[threadIdx.x + offset][0];
             my_rvec2[threadIdx.x][1] += my_rvec2[threadIdx.x + offset][1];
         }
@@ -196,7 +372,8 @@ CUDA_GLOBAL void k_reduction_rvec2 (rvec2 *input, rvec2 *results, size_t n)
         __syncthreads();
     }
 
-    if(threadIdx.x == 0){
+    if(threadIdx.x == 0)
+    {
         results[blockIdx.x][0] = my_rvec2[0][0];
         results[blockIdx.x][1] = my_rvec2[0][1];
     }
@@ -240,7 +417,7 @@ CUDA_GLOBAL void k_reduction_rvec2 (rvec2 *input, rvec2 *results, size_t n)
 }
 
 
-CUDA_GLOBAL void k_dot (const real *a, const real *b, real *per_block_results,
+CUDA_GLOBAL void k_dot( const real *a, const real *b, real *per_block_results,
         const size_t n )
 {
 #if defined(__SM_35__)
@@ -314,7 +491,8 @@ CUDA_GLOBAL void k_dot (const real *a, const real *b, real *per_block_results,
 }
 
 
-CUDA_GLOBAL void k_norm (const real *input, real *per_block_results, const size_t n, int pass)
+CUDA_GLOBAL void k_norm( const real *input, real *per_block_results,
+        const size_t n, int pass )
 {
 #if defined(__SM_35__)
     extern __shared__ real my_norm[];
@@ -323,14 +501,14 @@ CUDA_GLOBAL void k_norm (const real *input, real *per_block_results, const size_
 
     if( i < n )
     {
-        snorm = SQR (input[i]);
+        snorm = SQR( input[i] );
     }
 
     __syncthreads();
 
     for(int z = 16; z >=1; z/=2)
     {
-        snorm += shfl ( snorm, z);
+        snorm += shfl ( snorm, z );
     }
 
     if (threadIdx.x % 32 == 0)
@@ -338,7 +516,7 @@ CUDA_GLOBAL void k_norm (const real *input, real *per_block_results, const size_
         my_norm[threadIdx.x >> 5] = snorm;
     }
 
-    __syncthreads ();
+    __syncthreads( );
 
     for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
     {
@@ -362,7 +540,7 @@ CUDA_GLOBAL void k_norm (const real *input, real *per_block_results, const size_
 
     if(i < n)
     {
-        x = SQR (input[i]);
+        x = SQR( input[i] );
     }
 
     snorm[threadIdx.x] = x;
@@ -386,8 +564,8 @@ CUDA_GLOBAL void k_norm (const real *input, real *per_block_results, const size_
 }
 
 
-CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results,
-        const size_t n, int pass)
+CUDA_GLOBAL void k_norm_rvec2( const rvec2 *input, rvec2 *per_block_results,
+        const size_t n, int pass )
 {
 #if defined(__SM_35__)
     extern __shared__ rvec2 my_norm2[];
@@ -397,20 +575,23 @@ CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results,
 
     if(i < n)
     {
-        if (pass == INITIAL) {    
-            snorm2[0] = SQR (input[i][0]);
-            snorm2[1] = SQR (input[i][1]);
-        } else {
+        if (pass == INITIAL)
+        {
+            snorm2[0] = SQR( input[i][0] );
+            snorm2[1] = SQR( input[i][1] );
+        }
+        else
+        {
             snorm2[0] = input[i][0];
             snorm2[1] = input[i][1];
         }
     }
-    __syncthreads();
+    __syncthreads( );
 
     for(int z = 16; z >=1; z/=2)
     {
-        snorm2[0] += shfl ( snorm2[0], z);
-        snorm2[1] += shfl ( snorm2[1], z);
+        snorm2[0] += shfl( snorm2[0], z );
+        snorm2[1] += shfl( snorm2[1], z );
     }
 
     if (threadIdx.x % 32 == 0){
@@ -418,18 +599,21 @@ CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results,
         my_norm2[threadIdx.x >> 5][1] = snorm2[1];
     }
 
-    __syncthreads ();
+    __syncthreads( );
 
-    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
-        if(threadIdx.x < offset){
+    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
             my_norm2[threadIdx.x][0] += my_norm2[threadIdx.x + offset][0];
             my_norm2[threadIdx.x][1] += my_norm2[threadIdx.x + offset][1];
         }
 
-        __syncthreads();
+        __syncthreads( );
     }
 
-    if(threadIdx.x == 0) {
+    if(threadIdx.x == 0)
+    {
         per_block_results[blockIdx.x][0] = my_norm2[0][0];
         per_block_results[blockIdx.x][1] = my_norm2[0][1];
     }
@@ -444,8 +628,8 @@ CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results,
     {
         if( pass == INITIAL )
         {
-            x[0] = SQR (input[i][0]);
-            x[1] = SQR (input[i][1]);
+            x[0] = SQR( input[i][0] );
+            x[1] = SQR( input[i][1] );
         }
         else
         {
@@ -456,7 +640,7 @@ CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results,
 
     snorm2[threadIdx.x][0] = x[0];
     snorm2[threadIdx.x][1] = x[1];
-    __syncthreads();
+    __syncthreads( );
 
     for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
     {
@@ -466,7 +650,7 @@ CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results,
             snorm2[threadIdx.x][1] += snorm2[threadIdx.x + offset][1];
         }
 
-        __syncthreads();
+        __syncthreads( );
     }
 
     if(threadIdx.x == 0)
@@ -507,8 +691,10 @@ CUDA_GLOBAL void k_dot_rvec2(const rvec2 *a, rvec2 *b, rvec2 *res,
 
     __syncthreads ();
 
-    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
-        if(threadIdx.x < offset){
+    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
             my_dot2[threadIdx.x][0] += my_dot2[threadIdx.x + offset][0];
             my_dot2[threadIdx.x][1] += my_dot2[threadIdx.x + offset][1];
         }
@@ -600,9 +786,9 @@ CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int k )
 }
 
 
-CUDA_GLOBAL void k_rvec2_pbetad (rvec2 *dest, rvec2 *a, 
+CUDA_GLOBAL void k_rvec2_pbetad( rvec2 *dest, rvec2 *a, 
         real beta0, real beta1, 
-        rvec2 *b, int n)
+        rvec2 *b, int n )
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
diff --git a/PG-PuReMD/src/cuda/cuda_reduction.h b/PG-PuReMD/src/cuda/cuda_reduction.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf9efc5de885852a4c9909154ae01d27bfccc29a
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_reduction.h
@@ -0,0 +1,47 @@
+
+#ifndef __CUDA_REDUCTION_H__
+#define __CUDA_REDUCTION_H__
+
+#include "../reax_types.h"
+
+#define  INITIAL  0
+#define  FINAL    1
+
+
+void Cuda_Reduction_Sum( int *, int *, size_t );
+
+void Cuda_Reduction_Sum( real *, real *, size_t );
+
+//void Cuda_Reduction_Sum( rvec *, rvec *, size_t );
+
+void Cuda_Reduction_Max( int *, int *, size_t );
+
+void Cuda_Scan_Excl_Sum( int *, int *, size_t );
+
+CUDA_GLOBAL void k_reduction( const real *, real *, const size_t );
+
+CUDA_GLOBAL void k_reduction_rvec( rvec *, rvec *, size_t );
+
+CUDA_GLOBAL void k_reduction_rvec2( rvec2 *, rvec2 *, size_t );
+
+CUDA_GLOBAL void k_norm( const real *, real *, const size_t, int );
+
+CUDA_GLOBAL void k_dot( const real *, const real *, real *,
+        const size_t );
+
+CUDA_GLOBAL void k_vector_sum( real*, real, real*, real,
+        real*, int );
+
+CUDA_GLOBAL void k_rvec2_pbetad( rvec2 *, rvec2 *, real, real,
+        rvec2 *, int );
+
+CUDA_GLOBAL void k_rvec2_mul( rvec2*, rvec2*, rvec2*, int );
+
+CUDA_GLOBAL void k_vector_mul( real*, real*, real*, int );
+
+CUDA_GLOBAL void k_norm_rvec2( const rvec2 *, rvec2 *, const size_t, int );
+
+CUDA_GLOBAL void k_dot_rvec2( const rvec2 *, rvec2 *, rvec2 *, const size_t );
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_reset_tools.cu b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ca435269b8576ce74220ed442e75fb9f9a12819c
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
@@ -0,0 +1,101 @@
+
+#include "cuda_reset_tools.h"
+
+#include "cuda_list.h"
+#include "cuda_utils.h"
+#include "cuda_reduction.h"
+
+#include "../reset_tools.h"
+
+
+extern "C"
+{
+/* Zero four arrays of the global dev_workspace, system->total_cap entries each; the workspace parameter is not referenced. */
+void Cuda_Reset_Workspace( reax_system *system, storage *workspace )
+{
+    cuda_memset( dev_workspace->total_bond_order, 0,
+            system->total_cap * sizeof(real), "total_bond_order" );
+    cuda_memset( dev_workspace->dDeltap_self, 0,
+            system->total_cap * sizeof(rvec), "dDeltap_self" );
+    cuda_memset( dev_workspace->CdDelta, 0,
+            system->total_cap * sizeof(real), "CdDelta" );
+    cuda_memset( dev_workspace->f, 0,
+            system->total_cap * sizeof(rvec), "f" );
+}
+
+/* Kernel, one thread per atom: hindex[i] = 1 if the atom type's p_hbond is H_ATOM or H_BONDING_ATOM, else 0; also sets my_atoms[i].Hindex = i. */
+CUDA_GLOBAL void k_reset_hindex( reax_atom *my_atoms, single_body_parameters *sbp,
+        int * hindex, int N )
+{
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x; /* global thread index, used as the atom index */
+
+    if ( i >= N ) /* threads beyond the N atoms have nothing to do */
+    {
+        return;
+    }
+
+    if ( sbp[ my_atoms[i].type ].p_hbond == H_ATOM ||
+      sbp[ my_atoms[i].type ].p_hbond == H_BONDING_ATOM )
+    {
+        hindex[i] = 1;
+    }
+    else
+    {
+        hindex[i] = 0;
+    }
+
+//    my_atoms[i].Hindex = hindex[i];
+    my_atoms[i].Hindex = i; /* Hindex is the atom's own index, not the 0/1 flag (see disabled line above) */
+}
+
+/* Count the atoms flagged by k_reset_hindex: writes a 0/1 flag per atom into scratch, sums the N flags into system->numH, then sets system->Hcap. */
+void Cuda_Reset_Atoms( reax_system* system, control_params *control )
+{
+    int blocks;
+    int *hindex;
+
+    hindex = (int *) scratch; /* scratch buffer viewed as N ints */
+    cuda_memset( scratch, 0, system->N * sizeof(int),
+           "Cuda_Reset_Atoms::scratch" );
+
+    blocks = system->N / DEF_BLOCK_SIZE + 
+        ((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); /* ceil( N / DEF_BLOCK_SIZE ) */
+    /* flags must land in hindex[0..N-1], the range summed below; an offset of +1 dropped the last atom's flag */
+    k_reset_hindex <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp, hindex, system->N );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    Cuda_Reduction_Sum( hindex, system->d_numH, system->N );
+
+    copy_host_device( &(system->numH), system->d_numH, sizeof(int), 
+            cudaMemcpyDeviceToHost, "Cuda_Reset_Atoms::d_numH" );
+
+    system->Hcap = MAX( system->numH * SAFER_ZONE, MIN_CAP ); /* padded count, never below MIN_CAP */
+}
+
+/* Reset driver: resets atoms, simulation data (and pressures when control->virial is nonzero), then the device workspace; lists is not referenced. */
+void Cuda_Reset( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists )
+{
+    Cuda_Reset_Atoms( system, control );
+
+    Reset_Simulation_Data( data );
+
+    if ( control->virial ) /* pressures are reset only when virial is enabled */
+    {
+        Reset_Pressures( data );
+    }
+
+    Cuda_Reset_Workspace( system, workspace );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d: reset done\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+}
+
+
+}
diff --git a/PG-PuReMD/src/cuda/cuda_reset_tools.h b/PG-PuReMD/src/cuda/cuda_reset_tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e90b8eb5d3e758d815de554181d86472560acd3
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_reset_tools.h
@@ -0,0 +1,27 @@
+
+#ifndef __CUDA_RESET_TOOLS_H__
+#define __CUDA_RESET_TOOLS_H__
+
+#include "../reax_types.h"
+
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
+void Cuda_Reset_Workspace( reax_system *, storage * );
+/* recomputes system->numH and system->Hcap from the device atom data */
+void Cuda_Reset_Atoms( reax_system *, control_params * );
+
+int  Cuda_Reset_Neighbor_Lists( reax_system *, control_params *,
+        storage *, reax_list ** );
+/* runs Cuda_Reset_Atoms, the simulation-data/pressure resets and Cuda_Reset_Workspace */
+void Cuda_Reset( reax_system*, control_params*, simulation_data*,
+        storage*, reax_list** );
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda_shuffle.h b/PG-PuReMD/src/cuda/cuda_shuffle.h
similarity index 94%
rename from PG-PuReMD/src/cuda_shuffle.h
rename to PG-PuReMD/src/cuda/cuda_shuffle.h
index 9d55b71689b6bccb6a77e9ee5a8383bc5462ea23..0d6872713b61f650523cfa71f97dc9ef17562931 100644
--- a/PG-PuReMD/src/cuda_shuffle.h
+++ b/PG-PuReMD/src/cuda/cuda_shuffle.h
@@ -22,8 +22,7 @@
 #ifndef __CUDA_SHUFFLE_H_
 #define __CUDA_SHUFFLE_H_
 
-#include "reax_types.h"
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 #ifdef __cplusplus
@@ -36,7 +35,7 @@ extern "C"  {
  * And the other is taken from the download in the PGPuReMD folder on CUPID
  * http://wenda.baba.io/questions/4481817/overloading-the-cuda-shuffle-function-makes-the-original-ones-invisible.html
  */
-CUDA_DEVICE inline real shfl(real x, int lane)
+CUDA_DEVICE static inline real shfl(real x, int lane)
 {
     // Split the double number into 2 32b registers.
     int lo, hi;
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f67bdaf327eb92860b7e28d49035bdc85ddd12bb
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_system_props.cu
@@ -0,0 +1,1188 @@
+
+#include "cuda_system_props.h"
+
+#include "cuda_copy.h"
+#include "cuda_utils.h"
+#include "cuda_random.h"
+#include "cuda_reduction.h"
+#include "cuda_shuffle.h"
+#include "cuda_vector.h"
+
+#include "../tool_box.h"
+#include "../vector.h"
+
+
+/* Per-block reduction of the mass-scaled x, v and (x cross v) of atoms [0, n).
+ * The three extern __shared__ arrays alias one dynamic buffer, hence the
+ * blockDim.x-sized offsets; blockDim.x is expected to be a power of two. */
+CUDA_GLOBAL void center_of_mass_blocks( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n )
+{
+    extern __shared__ rvec xcm[];
+    extern __shared__ rvec vcm[];
+    extern __shared__ rvec amcm[];
+
+    const unsigned int tid = threadIdx.x;
+    const unsigned int gid = blockIdx.x * blockDim.x + tid;
+    const unsigned int vcm_base = blockDim.x;
+    const unsigned int amcm_base = 2 * blockDim.x;
+    unsigned int stride, peer;
+    rvec x_cross_v;
+    real mass;
+
+    rvec_MakeZero( xcm[tid] );
+    rvec_MakeZero( vcm[vcm_base + tid] );
+    rvec_MakeZero( amcm[amcm_base + tid] );
+    rvec_MakeZero( x_cross_v );
+    if ( gid < n )
+    {
+        mass = sbp[ atoms[gid].type ].mass;
+        rvec_ScaledAdd( xcm[tid], mass, atoms[gid].x );
+        rvec_ScaledAdd( vcm[vcm_base + tid], mass, atoms[gid].v );
+        rvec_Cross( x_cross_v, atoms[gid].x, atoms[gid].v );
+        rvec_ScaledAdd( amcm[amcm_base + tid], mass, x_cross_v );
+    }
+    __syncthreads( );
+    /* pairwise tree reduction in shared memory */
+    for ( stride = blockDim.x / 2; stride > 0; stride /= 2 )
+    {
+        if ( tid < stride )
+        {
+            peer = tid + stride;
+            rvec_Add( xcm[tid], xcm[peer] );
+            rvec_Add( vcm[vcm_base + tid], vcm[vcm_base + peer] );
+            rvec_Add( amcm[amcm_base + tid], amcm[amcm_base + peer] );
+        }
+        __syncthreads( );
+    }
+
+    if ( tid == 0 )
+    {
+        rvec_Copy( res_xcm[blockIdx.x], xcm[0] );
+        rvec_Copy( res_vcm[blockIdx.x], vcm[vcm_base] );
+        rvec_Copy( res_amcm[blockIdx.x], amcm[amcm_base] );
+    }
+}
+
+
+#if defined( __SM_35__)
+/* Per-block reduction of mass-scaled atom positions: values are first combined
+ * with shfl(), then one shared-memory slot per 32 threads is reduced by halving. */
+CUDA_GLOBAL void center_of_mass_blocks_xcm( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_xcm, size_t n )
+{
+    extern __shared__ rvec my_xcm[];
+    const unsigned int tid = threadIdx.x;
+    const unsigned int gid = blockIdx.x * blockDim.x + tid;
+    rvec xcm;
+    real mass;
+    int step, half, d;
+
+    rvec_MakeZero( xcm );
+    if ( gid < n )
+    {
+        mass = sbp[ atoms[gid].type ].mass;
+        rvec_ScaledAdd( xcm, mass, atoms[gid].x );
+    }
+    __syncthreads( );
+
+    for ( step = 16; step >= 1; step /= 2 )
+    {
+        for ( d = 0; d < 3; ++d )
+            xcm[d] += shfl( xcm[d], step );
+    }
+    __syncthreads( );
+
+    /* thread 0 of every group of 32 stores its value in slot tid / 32 */
+    if ( (tid & 31) == 0 )
+        rvec_Copy( my_xcm[tid >> 5], xcm );
+    __syncthreads( );
+    for ( half = blockDim.x >> 6; half > 0; half >>= 1 )
+    {
+        if ( tid < half )
+            rvec_Add( my_xcm[tid], my_xcm[tid + half] );
+        __syncthreads( );
+    }
+
+    if ( tid == 0 )
+        rvec_Copy( res_xcm[blockIdx.x], my_xcm[0] );
+}
+
+
+/* Per-block reduction of mass-scaled atom velocities: values are first combined
+ * with shfl(), then one shared-memory slot per 32 threads is reduced by halving. */
+CUDA_GLOBAL void center_of_mass_blocks_vcm( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_vcm, size_t n )
+{
+    extern __shared__ rvec my_vcm[];
+    const unsigned int tid = threadIdx.x;
+    const unsigned int gid = blockIdx.x * blockDim.x + tid;
+    rvec vcm;
+    real mass;
+    int step, half, d;
+
+    rvec_MakeZero( vcm );
+    if ( gid < n )
+    {
+        mass = sbp[ atoms[gid].type ].mass;
+        rvec_ScaledAdd( vcm, mass, atoms[gid].v );
+    }
+    __syncthreads( );
+
+    for ( step = 16; step >= 1; step /= 2 )
+    {
+        for ( d = 0; d < 3; ++d )
+            vcm[d] += shfl( vcm[d], step );
+    }
+    __syncthreads( );
+    /* thread 0 of every group of 32 stores its value in slot tid / 32 */
+    if ( (tid & 31) == 0 )
+        rvec_Copy( my_vcm[tid >> 5], vcm );
+    __syncthreads( );
+    for ( half = blockDim.x >> 6; half > 0; half >>= 1 )
+    {
+        if ( tid < half )
+            rvec_Add( my_vcm[tid], my_vcm[tid + half] );
+        __syncthreads( );
+    }
+
+    if ( tid == 0 )
+        rvec_Copy( res_vcm[blockIdx.x], my_vcm[0] );
+}
+
+
+/* Per-block reduction of the mass-scaled (x cross v) of the atoms: values are first
+ * combined with shfl(), then one shared-memory slot per 32 threads is reduced by halving. */
+CUDA_GLOBAL void center_of_mass_blocks_amcm( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_amcm, size_t n )
+{
+    extern __shared__ rvec my_amcm[];
+    const unsigned int tid = threadIdx.x;
+    const unsigned int gid = blockIdx.x * blockDim.x + tid;
+    rvec amcm, x_cross_v;
+    real mass;
+    int step, half, d;
+
+    rvec_MakeZero( amcm );
+    rvec_MakeZero( x_cross_v );
+
+    if ( gid < n )
+    {
+        mass = sbp[ atoms[gid].type ].mass;
+        rvec_Cross( x_cross_v, atoms[gid].x, atoms[gid].v );
+        rvec_ScaledAdd( amcm, mass, x_cross_v );
+    }
+    __syncthreads( );
+
+    for ( step = 16; step >= 1; step /= 2 )
+    {
+        for ( d = 0; d < 3; ++d )
+            amcm[d] += shfl( amcm[d], step );
+    }
+    __syncthreads( );
+
+    /* thread 0 of every group of 32 stores its value in slot tid / 32 */
+    if ( (tid & 31) == 0 )
+        rvec_Copy( my_amcm[tid >> 5], amcm );
+    __syncthreads( );
+
+    for ( half = blockDim.x >> 6; half > 0; half >>= 1 )
+    {
+        if ( tid < half )
+            rvec_Add( my_amcm[tid], my_amcm[tid + half] );
+        __syncthreads( );
+    }
+
+    if ( tid == 0 )
+        rvec_Copy( res_amcm[blockIdx.x], my_amcm[0] );
+}
+#endif
+
+
+/* Second reduction stage: sums up to n per-block partials of xcm, vcm and amcm
+ * into one rvec each. Inputs are indexed by threadIdx.x, so a single-block
+ * launch is assumed; the shared buffer is split at blockDim.x offsets. */
+CUDA_GLOBAL void center_of_mass( rvec *xcm, rvec *vcm, rvec *amcm,
+        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n )
+{
+    extern __shared__ rvec sh_xcm[];
+    extern __shared__ rvec sh_vcm[];
+    extern __shared__ rvec sh_amcm[];
+
+    const unsigned int tid = threadIdx.x;
+    const unsigned int gid = blockIdx.x * blockDim.x + tid;
+    const unsigned int vcm_base = blockDim.x;
+    const unsigned int amcm_base = 2 * blockDim.x;
+    unsigned int stride, peer;
+    /* threads past the end of the input contribute zeros */
+    if ( gid < n )
+    {
+        rvec_Copy( sh_xcm[tid], xcm[tid] );
+        rvec_Copy( sh_vcm[vcm_base + tid], vcm[tid] );
+        rvec_Copy( sh_amcm[amcm_base + tid], amcm[tid] );
+    }
+    else
+    {
+        rvec_MakeZero( sh_xcm[tid] );
+        rvec_MakeZero( sh_vcm[vcm_base + tid] );
+        rvec_MakeZero( sh_amcm[amcm_base + tid] );
+    }
+    __syncthreads( );
+
+    for ( stride = blockDim.x / 2; stride > 0; stride /= 2 )
+    {
+        if ( tid < stride )
+        {
+            peer = tid + stride;
+            rvec_Add( sh_xcm[tid], sh_xcm[peer] );
+            rvec_Add( sh_vcm[vcm_base + tid], sh_vcm[vcm_base + peer] );
+            rvec_Add( sh_amcm[amcm_base + tid], sh_amcm[amcm_base + peer] );
+        }
+        __syncthreads( );
+    }
+
+    if ( tid == 0 )
+    {
+        rvec_Copy( res_xcm[blockIdx.x], sh_xcm[0] );
+        rvec_Copy( res_vcm[blockIdx.x], sh_vcm[vcm_base] );
+        rvec_Copy( res_amcm[blockIdx.x], sh_amcm[amcm_base] );
+    }
+}
+
+
+/* Per-block sums of m * d_a * d_b with d = x - xcm over atoms [0, n); block b
+ * stores xx, xy, xz, yy, yz, zz in results[6*b .. 6*b+5]. The six extern
+ * __shared__ arrays alias one buffer split into blockDim.x-sized segments. */
+CUDA_GLOBAL void compute_center_mass( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real xx[];
+    extern __shared__ real xy[];
+    extern __shared__ real xz[];
+    extern __shared__ real yy[];
+    extern __shared__ real yz[];
+    extern __shared__ real zz[];
+
+    const unsigned int tid = threadIdx.x;
+    const unsigned int gid = blockIdx.x * blockDim.x + tid;
+    const unsigned int xy_i = blockDim.x;
+    const unsigned int xz_i = 2 * blockDim.x;
+    const unsigned int yy_i = 3 * blockDim.x;
+    const unsigned int yz_i = 4 * blockDim.x;
+    const unsigned int zz_i = 5 * blockDim.x;
+    unsigned int stride, peer;
+    rvec diff;
+    rvec xcm = { xcm0, xcm1, xcm2 };
+    real m = 0;
+    rvec_MakeZero( diff );
+    if ( gid < n )
+    {
+        m = sbp[ atoms[gid].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[gid].x, -1., xcm );
+    }
+    /* threads past the last atom keep m = 0 and diff = 0 and so store zeros */
+    xx[tid] = diff[0] * diff[0] * m;
+    xy[xy_i + tid] = diff[0] * diff[1] * m;
+    xz[xz_i + tid] = diff[0] * diff[2] * m;
+    yy[yy_i + tid] = diff[1] * diff[1] * m;
+    yz[yz_i + tid] = diff[1] * diff[2] * m;
+    zz[zz_i + tid] = diff[2] * diff[2] * m;
+    __syncthreads( );
+
+    for ( stride = blockDim.x / 2; stride > 0; stride /= 2 )
+    {
+        if ( tid < stride )
+        {
+            peer = tid + stride;
+            xx[tid] += xx[peer];
+            xy[xy_i + tid] += xy[xy_i + peer];
+            xz[xz_i + tid] += xz[xz_i + peer];
+            yy[yy_i + tid] += yy[yy_i + peer];
+            yz[yz_i + tid] += yz[yz_i + peer];
+            zz[zz_i + tid] += zz[zz_i + peer];
+        }
+        __syncthreads( );
+    }
+
+    if ( tid == 0 )
+    {
+        results[blockIdx.x * 6] = xx[0];
+        results[blockIdx.x * 6 + 1] = xy[xy_i];
+        results[blockIdx.x * 6 + 2] = xz[xz_i];
+        results[blockIdx.x * 6 + 3] = yy[yy_i];
+        results[blockIdx.x * 6 + 4] = yz[yz_i];
+        results[blockIdx.x * 6 + 5] = zz[zz_i];
+    }
+}
+
+
+/* Final stage of the inertia-tensor reduction: sums up to n six-value records
+ * of input (layout xx, xy, xz, yy, yz, zz) into output[0..5]. Records are read
+ * by threadIdx.x and every block writes the same output, so launch one block. */
+CUDA_GLOBAL void compute_center_mass( real *input, real *output, size_t n )
+{
+    extern __shared__ real xx[];
+    extern __shared__ real xy[];
+    extern __shared__ real xz[];
+    extern __shared__ real yy[];
+    extern __shared__ real yz[];
+    extern __shared__ real zz[];
+
+    const unsigned int tid = threadIdx.x;
+    const unsigned int gid = blockIdx.x * blockDim.x + tid;
+    const unsigned int xy_i = blockDim.x;
+    const unsigned int xz_i = 2 * blockDim.x;
+    const unsigned int yy_i = 3 * blockDim.x;
+    const unsigned int yz_i = 4 * blockDim.x;
+    const unsigned int zz_i = 5 * blockDim.x;
+    unsigned int stride, peer;
+    xx[tid] = xy[xy_i + tid] = xz[xz_i + tid] =
+        yy[yy_i + tid] = yz[yz_i + tid] = zz[zz_i + tid] = 0;
+    if ( gid < n )
+    {
+        xx[tid] = input[tid * 6 + 0];
+        xy[xy_i + tid] = input[tid * 6 + 1];
+        xz[xz_i + tid] = input[tid * 6 + 2];
+        yy[yy_i + tid] = input[tid * 6 + 3];
+        yz[yz_i + tid] = input[tid * 6 + 4];
+        zz[zz_i + tid] = input[tid * 6 + 5];
+    }
+    __syncthreads( );
+
+    for ( stride = blockDim.x / 2; stride > 0; stride /= 2 )
+    {
+        if ( tid < stride )
+        {
+            peer = tid + stride;
+            xx[tid] += xx[peer];
+            xy[xy_i + tid] += xy[xy_i + peer];
+            xz[xz_i + tid] += xz[xz_i + peer];
+            yy[yy_i + tid] += yy[yy_i + peer];
+            yz[yz_i + tid] += yz[yz_i + peer];
+            zz[zz_i + tid] += zz[zz_i + peer];
+        }
+        __syncthreads( );
+    }
+    /* each total is read through the array that names its own segment */
+    if ( tid == 0 )
+    {
+        output[0] = xx[0];
+        output[1] = xy[xy_i];
+        output[2] = xz[xz_i];
+        output[3] = yy[yy_i];
+        output[4] = yz[yz_i];
+        output[5] = zz[zz_i];
+    }
+}
+
+
+#if defined( __SM_35__)
+/* Per-block sums of the xx and xy inertia terms, m * d0 * d0 and m * d0 * d1
+ * with d = x - xcm, written to results[6*b] and results[6*b + 1] for block b.
+ * Both extern __shared__ arrays alias one buffer, so xy lives at offset xy_i. */
+CUDA_GLOBAL void compute_center_mass_xx_xy( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real my_results_xx[];
+    extern __shared__ real my_results_xy[];
+
+    unsigned int xy_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real xx = 0;
+    real xy = 0;
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xx = diff[0] * diff[0] * m;
+        xy = diff[0] * diff[1] * m;
+    }
+    __syncthreads ();
+
+    /* shuffle distances 16, 8, 4, 2, 1, the scheme used by k_compute_total_mass */
+    for (int z = 16; z >= 1; z /= 2){
+        xx += shfl( xx, z);
+        xy += shfl( xy, z);
+    }
+    __syncthreads ();
+
+    /* xy slots start at xy_i; at offset 0 they would overwrite the xx slots */
+    if (threadIdx.x % 32 == 0){
+        my_results_xx[threadIdx.x >> 5] = xx;
+        my_results_xy[xy_i + (threadIdx.x >> 5)] = xy;
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            my_results_xx[ threadIdx.x ] += my_results_xx[ threadIdx.x + offset ];
+            my_results_xy[ xy_i + threadIdx.x ] += my_results_xy[ xy_i + threadIdx.x + offset ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 ] = my_results_xx [ 0 ];
+        results [ blockIdx.x*6 + 1 ] = my_results_xy [ xy_i + 0 ];
+    }
+}
+
+
+/* Per-block sums of the xz and yy inertia terms (d = x - xcm), written to results[6*b + 2]
+ * and results[6*b + 3]. Both shared arrays alias one buffer, so yy lives at offset yy_i. */
+CUDA_GLOBAL void compute_center_mass_xz_yy( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real my_results_xz[];
+    extern __shared__ real my_results_yy[];
+
+    unsigned int yy_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real xz = 0;
+    real yy = 0;
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xz = diff[0] * diff[2] * m;
+        yy = diff[1] * diff[1] * m;
+    }
+    __syncthreads ();
+    /* shuffle distances 16, 8, 4, 2, 1, the scheme used by k_compute_total_mass */
+    for (int z = 16; z >= 1; z /= 2){
+        xz += shfl( xz, z);
+        yy += shfl( yy, z);
+    }
+    __syncthreads ();
+
+    /* yy slots start at yy_i; at offset 0 they would overwrite the xz slots */
+    if (threadIdx.x % 32 == 0){
+        my_results_xz[threadIdx.x >> 5] = xz;
+        my_results_yy[yy_i + (threadIdx.x >> 5)] = yy;
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            my_results_xz[ threadIdx.x ] += my_results_xz [ threadIdx.x + offset ];
+            my_results_yy[ yy_i + threadIdx.x ] += my_results_yy [ yy_i + threadIdx.x + offset ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 + 2 ] = my_results_xz [ 0 ];
+        results [ blockIdx.x*6 + 3 ] = my_results_yy [ yy_i + 0 ];
+    }
+}
+
+
+/* Per-block sums of the yz and zz inertia terms (d = x - xcm), written to results[6*b + 4]
+ * and results[6*b + 5]. Both shared arrays alias one buffer, so zz lives at offset zz_i. */
+CUDA_GLOBAL void compute_center_mass_yz_zz( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real my_results_yz[];
+    extern __shared__ real my_results_zz[];
+
+    unsigned int zz_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real yz = 0;
+    real zz = 0;
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+    if (i < n)
+    {
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        yz = diff[1] * diff[2] * m;
+        zz = diff[2] * diff[2] * m;
+    }
+    __syncthreads ();
+    /* shuffle distances 16, 8, 4, 2, 1, the scheme used by k_compute_total_mass */
+    for (int z = 16; z >= 1; z /= 2){
+        yz += shfl( yz, z);
+        zz += shfl( zz, z);
+    }
+    __syncthreads ();
+
+    /* zz slots start at zz_i; at offset 0 they would overwrite the yz slots */
+    if (threadIdx.x % 32 == 0){
+        my_results_yz[threadIdx.x >> 5] = yz;
+        my_results_zz[zz_i + (threadIdx.x >> 5)] = zz;
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            my_results_yz[ threadIdx.x ] += my_results_yz [ threadIdx.x + offset ];
+            my_results_zz[ zz_i + threadIdx.x ] += my_results_zz [ zz_i + threadIdx.x + offset ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 + 4 ] = my_results_yz [ 0 ];
+        results [ blockIdx.x*6 + 5 ] = my_results_zz [ zz_i + 0 ];
+    }
+}
+#endif
+
+
+/* Per-block sum of the masses of atoms [0, n): block b writes its total to
+ * block_results[b]. The halving loops expect blockDim.x to be a power of two;
+ * the __SM_35__ build combines values with shfl() before using shared memory. */
+CUDA_GLOBAL void k_compute_total_mass( single_body_parameters *sbp, reax_atom *my_atoms,
+        real *block_results, int n )
+{
+    const unsigned int tid = threadIdx.x;
+    const unsigned int gid = blockIdx.x * blockDim.x + tid;
+    int half;
+
+#if defined(__SM_35__)
+    extern __shared__ real my_sbp[];
+    int step;
+
+    /* threads past the last atom contribute zero */
+    real sdata = (gid < n) ? sbp[ my_atoms[gid].type ].mass : 0;
+
+    __syncthreads( );
+
+    for ( step = 16; step >= 1; step /= 2 )
+    {
+        sdata += shfl( sdata, step );
+    }
+
+    /* thread 0 of every group of 32 stores its value in slot tid / 32 */
+    if ( (tid & 31) == 0 )
+    {
+        my_sbp[tid >> 5] = sdata;
+    }
+
+    __syncthreads( );
+
+    for ( half = blockDim.x >> 6; half > 0; half >>= 1 )
+    {
+        if ( tid < half )
+        {
+            my_sbp[tid] += my_sbp[tid + half];
+        }
+        __syncthreads( );
+    }
+
+    if ( tid == 0 )
+    {
+        block_results[blockIdx.x] = my_sbp[0];
+    }
+
+#else
+    extern __shared__ real sdata[];
+
+    /* threads past the last atom contribute zero */
+    sdata[tid] = (gid < n) ? sbp[ my_atoms[gid].type ].mass : 0;
+    __syncthreads( );
+
+    /* pairwise tree reduction in shared memory */
+    for ( half = blockDim.x / 2; half > 0; half >>= 1 )
+    {
+        if ( tid < half )
+        {
+            sdata[tid] += sdata[tid + half];
+        }
+
+        __syncthreads( );
+    }
+
+    if ( tid == 0 )
+    {
+        block_results[blockIdx.x] = sdata[0];
+    }
+
+#endif
+}
+
+
+/* Sum the masses of the system->n local atoms on the device; result in *local_val. */
+extern "C" void dev_compute_total_mass( reax_system *system, real *local_val )
+{
+    real *partials = (real *) scratch;         /* per-block sums */
+    real *total = partials + BLOCKS_POW_2;     /* slot that receives the final sum */
+    cuda_memset( partials, 0, sizeof(real) * (1 + BLOCKS_POW_2), "total_mass:tmp" );
+    k_compute_total_mass <<< BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, partials, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    k_reduction <<< 1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
+        ( partials, total, BLOCKS_POW_2 );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    copy_host_device( local_val, total, sizeof(real),
+            cudaMemcpyDeviceToHost, "total_mass:tmp" );
+}
+
+
+/* Per-block sum of 0.5 * m * (v . v) over atoms [0, n): block b writes its
+ * total to block_results[b]. The halving loops expect blockDim.x to be a power
+ * of two; the __SM_35__ build combines values with shfl() first. */
+CUDA_GLOBAL void k_compute_kinetic_energy( single_body_parameters *sbp, reax_atom *my_atoms,
+        real *block_results, int n )
+{
+    const unsigned int tid = threadIdx.x;
+    const unsigned int gid = blockIdx.x * blockDim.x + tid;
+    real e_kin = 0;
+    rvec p;
+    int half;
+
+    /* this thread's contribution; stays zero past the last atom */
+    if ( gid < n )
+    {
+        rvec_Scale( p, sbp[ my_atoms[gid].type ].mass, my_atoms[gid].v );
+        e_kin = 0.5 * rvec_Dot( p, my_atoms[gid].v );
+    }
+
+#if defined(__SM_35__)
+    extern __shared__ real my_sbpdot[];
+    int step;
+
+    __syncthreads( );
+
+    for ( step = 16; step >= 1; step /= 2 )
+    {
+        e_kin += shfl( e_kin, step );
+    }
+
+    /* thread 0 of every group of 32 stores its value in slot tid / 32 */
+    if ( (tid & 31) == 0 )
+    {
+        my_sbpdot[tid >> 5] = e_kin;
+    }
+
+    __syncthreads( );
+
+    for ( half = blockDim.x >> 6; half > 0; half >>= 1 )
+    {
+        if ( tid < half )
+        {
+            my_sbpdot[tid] += my_sbpdot[tid + half];
+        }
+
+        __syncthreads( );
+    }
+
+    if ( tid == 0 )
+    {
+        block_results[blockIdx.x] = my_sbpdot[0];
+    }
+
+#else
+    extern __shared__ real sdata[];
+
+    sdata[tid] = e_kin;
+    __syncthreads( );
+
+    /* pairwise tree reduction in shared memory */
+    for ( half = blockDim.x / 2; half > 0; half >>= 1 )
+    {
+        if ( tid < half )
+        {
+            sdata[tid] += sdata[tid + half];
+        }
+
+        __syncthreads( );
+    }
+
+    if ( tid == 0 )
+    {
+        block_results[blockIdx.x] = sdata[0];
+    }
+
+#endif
+}
+
+
+/* Sum the kinetic energies of the system->n local atoms on the device and
+ * return the total in *local_val. data is not used. */
+extern "C" void dev_compute_kinetic_energy( reax_system *system,
+        simulation_data *data, real *local_val )
+{
+    real *partials = (real *) scratch;         /* per-block sums */
+    real *total = partials + BLOCKS_POW_2;     /* slot that receives the final sum */
+
+    cuda_memset( partials, 0, sizeof(real) * (BLOCKS_POW_2 + 1), "kinetic_energy:tmp" );
+    k_compute_kinetic_energy <<< BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, partials, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    k_reduction <<< 1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
+        ( partials, total, BLOCKS_POW_2 );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( local_val, total,
+            sizeof(real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp" );
+}
+
+/* Device computation of this process's xcm, vcm and amcm sums: per-block kernels, a one-block second pass, then a copy into the host arguments. */
+extern "C" void dev_compute_momentum( reax_system *system, rvec xcm, 
+        rvec vcm, rvec amcm )
+{
+    rvec *l_xcm, *l_vcm, *l_amcm;
+    rvec *r_scratch = (rvec *)scratch; /* scratch buffer viewed as an array of rvec slots */
+/* NOTE(review): both paths launch BLOCKS_POW_2 blocks, while dev_compute_total_mass launches BLOCKS -- confirm this is intended */
+#if defined( __SM_35__)
+    // xcm: per-block partials in slots [0, BLOCKS_POW_2), total in slot BLOCKS_POW_2
+    cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
+    l_xcm = r_scratch;
+    
+    center_of_mass_blocks_xcm <<< BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, l_xcm, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    
+    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
+            (l_xcm, l_xcm + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    copy_host_device( xcm, l_xcm + BLOCKS_POW_2,
+            sizeof(rvec), cudaMemcpyDeviceToHost, "momentum:xcm" );
+    
+    // vcm: same scratch slots as xcm, zeroed and reused after xcm was copied out
+    cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
+    l_vcm = r_scratch;
+    
+    center_of_mass_blocks_vcm <<< BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, l_vcm, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    
+    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
+        (l_vcm, l_vcm + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof(rvec),
+        cudaMemcpyDeviceToHost, "momentum:vcm" );
+    
+    // amcm: same scratch slots again, zeroed and reused after vcm was copied out
+    cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
+    l_amcm = r_scratch;
+    
+    center_of_mass_blocks_amcm <<< BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, l_amcm, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    
+    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
+        (l_amcm, l_amcm + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof(rvec),
+        cudaMemcpyDeviceToHost, "momemtum:amcm" ); /* NOTE(review): tag is spelled "momemtum" */
+
+#else
+    cuda_memset( scratch, 0, 3 * sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
+    
+    l_xcm = r_scratch; /* each of the three arrays: BLOCKS_POW_2 partials + 1 total */
+    l_vcm = r_scratch + (BLOCKS_POW_2 + 1); 
+    l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); 
+    
+    center_of_mass_blocks <<< BLOCKS_POW_2, BLOCK_SIZE, 3 * (sizeof (rvec) * BLOCK_SIZE) >>> 
+        ( system->reax_param.d_sbp, system->d_my_atoms, l_xcm, l_vcm, l_amcm, system->n );
+    cudaThreadSynchronize( ); 
+    cudaCheckError( ); 
+    
+    center_of_mass <<< 1, BLOCKS_POW_2, 3 * (sizeof (rvec) * BLOCKS_POW_2) >>> 
+        ( l_xcm, l_vcm, l_amcm, l_xcm + BLOCKS_POW_2, l_vcm + BLOCKS_POW_2,
+          l_amcm + BLOCKS_POW_2, BLOCKS_POW_2 );
+    cudaThreadSynchronize( ); 
+    cudaCheckError( );
+    
+    copy_host_device( xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:xcm" ); /* NOTE(review): tag is spelled "momemtum" */
+    copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm" );
+    copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost,"momentum:amcm" );
+#endif
+}
+
+/* Device computation of six per-process sums (xx, xy, xz, yy, yz, zz) relative to my_xcm; the six totals are copied to local_results[0..5]. */
+extern "C" void dev_compute_inertial_tensor( reax_system *system, real *local_results, rvec my_xcm )
+{
+#if defined(__SM_35__)
+    real *partial_results = (real *) scratch; /* six reals per block, plus six for the totals */
+    cuda_memset( partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp" );
+/* each of the next three kernels fills two of the six per-block entries */
+    compute_center_mass_xx_xy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    compute_center_mass_xz_yy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    compute_center_mass_yz_zz <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+/* second pass: the (real *, real *, size_t) overload sums the per-block records in one block */
+    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
+        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( local_results, partial_results + 6 * BLOCKS_POW_2,
+        sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results" );
+
+#else
+    real *partial_results = (real *) scratch; /* six reals per block, plus six for the totals */
+    //real *local_results;
+
+    cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp");
+    //local_results = (real *) malloc (sizeof (real) * 6 *(BLOCKS_POW_2+ 1));
+/* first pass: the atom overload writes all six entries of every block */
+    compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+/* second pass: the (real *, real *, size_t) overload sums the per-block records in one block */
+    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
+        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, 
+            sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results");
+#endif
+}
+
+/* Forwards to Output_Sync_Simulation_Data with the host struct and the pointer kept in data->d_simulation_data. */
+extern "C" void dev_sync_simulation_data( simulation_data *data )
+{
+    Output_Sync_Simulation_Data( data, (simulation_data *)data->d_simulation_data );
+}
+
+
+/* One thread per atom: zero the velocity when T <= 0.1, otherwise fill it with
+ * cuda_rvec_Random and rescale it so that m * |v|^2 equals 3 * K_B * T.
+ * Threads with index >= n do nothing. */
+CUDA_GLOBAL void k_generate_initial_velocities( single_body_parameters *sbp, reax_atom *my_atoms,
+        real T, int n )
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real m, scale, norm;
+
+    if ( i >= n )
+        return;
+
+    if ( !(T <= 0.1) )
+    {
+        cuda_rvec_Random( my_atoms[i].v );
+
+        norm = rvec_Norm_Sqr( my_atoms[i].v );
+        m = sbp[ my_atoms[i].type ].mass;
+        scale = SQRT( m * norm / (3.0 * K_B * T) );
+
+        /* divide by scale, written as a multiplication by its reciprocal */
+        rvec_Scale( my_atoms[i].v, 1. / scale, my_atoms[i].v );
+    }
+    else
+    {
+        rvec_MakeZero( my_atoms[i].v );
+    }
+}
+
+
+/* Assign initial velocities for temperature T to the system->n local atoms, one
+ * thread per atom; Cuda_Randomize( ) is called first only when T > 0.1. */
+void Cuda_Generate_Initial_Velocities( reax_system *system, real T )
+{
+    const int blocks = (system->n + DEF_BLOCK_SIZE - 1) / DEF_BLOCK_SIZE;
+
+    if ( T > 0.1 )
+        Cuda_Randomize( );
+
+    /* wait for the kernel and check for errors, as after the other launches in this file */
+    k_generate_initial_velocities <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, T, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+/* Device-side kinetic energy of the local atoms, summed over comm into
+ * data->sys_en.e_kin, followed by the temperature 2 * E_kin / (N_f * K_B). */
+void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
+        MPI_Comm comm )
+{
+    real *local_ekin = &data->my_en.e_kin;
+
+    *local_ekin = 0.0;
+    dev_compute_kinetic_energy( system, data, local_ekin );
+
+    MPI_Allreduce( local_ekin, &data->sys_en.e_kin, 1, MPI_DOUBLE, MPI_SUM, comm );
+
+    data->therm.T = (2.0 * data->sys_en.e_kin) / (data->N_f * K_B);
+
+    /* keep T away from exact zero, which might cause a floating-point exception */
+    if ( FABS(data->therm.T) < ALMOST_ZERO )
+        data->therm.T = ALMOST_ZERO;
+}
+
+
+/* Total mass over all ranks in comm (data->M) and its reciprocal (data->inv_M). */
+void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data,
+        MPI_Comm comm  )
+{
+    real local_mass;
+
+    /* this rank's share, computed by dev_compute_total_mass */
+    dev_compute_total_mass( system, &local_mass );
+    MPI_Allreduce( &local_mass, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm );
+
+    data->inv_M = 1. / data->M;
+}
+
+
+/* Compute centre-of-mass properties of the whole system and store them
+ * in data: position (xcm), velocity (vcm), angular momentum (amcm),
+ * angular velocity (avcm), translational energy (etran_cm) and
+ * rotational energy (erot_cm).
+ *
+ * system: passed to the dev_* helpers; my_rank selects the process that
+ *         inverts the inertial tensor
+ * data: reads M and inv_M (so the total mass must already be computed),
+ *       receives the quantities listed above
+ * mpi_data: not referenced in this function
+ * comm: communicator used for all reductions and the broadcast */
+void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
+        mpi_datatypes *mpi_data, MPI_Comm comm )
+{
+    int i;
+    real det; //xx, xy, xz, yy, yz, zz;
+    /* packed symmetric tensor; the assignments to mat[][] below read it
+     * as [0]=xx, [1]=xy, [2]=xz, [3]=yy, [4]=yz, [5]=zz */
+    real tmp_mat[6], tot_mat[6];
+    rvec my_xcm, my_vcm, my_amcm, my_avcm;
+    rvec tvec;
+    rtensor mat, inv;
+
+    rvec_MakeZero( my_xcm );  // position of CoM
+    rvec_MakeZero( my_vcm );  // velocity of CoM
+    rvec_MakeZero( my_amcm ); // angular momentum of CoM
+    rvec_MakeZero( my_avcm ); // angular velocity of CoM (zeroed but otherwise unused here)
+
+    /* Compute the position, vel. and ang. momentum about the centre of mass */
+    dev_compute_momentum ( system, my_xcm, my_vcm, my_amcm );
+
+    /* sum the per-process results; every process receives the totals */
+    MPI_Allreduce( my_xcm, data->xcm, 3, MPI_DOUBLE, MPI_SUM, comm );
+    MPI_Allreduce( my_vcm, data->vcm, 3, MPI_DOUBLE, MPI_SUM, comm );
+    MPI_Allreduce( my_amcm, data->amcm, 3, MPI_DOUBLE, MPI_SUM, comm );
+
+    /* scale the summed position and velocity by 1 / M, then subtract
+     * M * (xcm x vcm) from the summed angular momentum */
+    rvec_Scale( data->xcm, data->inv_M, data->xcm );
+    rvec_Scale( data->vcm, data->inv_M, data->vcm );
+    rvec_Cross( tvec, data->xcm, data->vcm );
+    rvec_ScaledAdd( data->amcm, -data->M, tvec );
+    data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
+
+    /* Calculate and then invert the inertial tensor */
+    for ( i = 0; i < 6; ++i )
+    {
+        tmp_mat[i] = 0;
+    }
+
+    /* NOTE(review): my_xcm is still this process' local output of
+     * dev_compute_momentum; the reduced and scaled centre of mass lives in
+     * data->xcm. Confirm that dev_compute_inertial_tensor really expects
+     * the local value rather than data->xcm. */
+    dev_compute_inertial_tensor( system, tmp_mat, my_xcm );
+
+    /* only MASTER_NODE receives the summed tensor in tot_mat */
+    MPI_Reduce( tmp_mat, tot_mat, 6, MPI_DOUBLE, MPI_SUM, MASTER_NODE, comm );
+
+    if ( system->my_rank == MASTER_NODE )
+    {
+        /* expand the packed sums into the symmetric 3x3 inertial tensor */
+        mat[0][0] = tot_mat[3] + tot_mat[5];  // yy + zz;
+        mat[0][1] = mat[1][0] = -tot_mat[1];  // -xy;
+        mat[0][2] = mat[2][0] = -tot_mat[2];  // -xz;
+        mat[1][1] = tot_mat[0] + tot_mat[5];  // xx + zz;
+        mat[2][1] = mat[1][2] = -tot_mat[4];  // -yz;
+        mat[2][2] = tot_mat[0] + tot_mat[3];  // xx + yy;
+
+        /* invert the inertial tensor */
+        /* determinant by the rule of Sarrus */
+        det = ( mat[0][0] * mat[1][1] * mat[2][2] +
+                mat[0][1] * mat[1][2] * mat[2][0] +
+                mat[0][2] * mat[1][0] * mat[2][1] ) -
+              ( mat[0][0] * mat[1][2] * mat[2][1] +
+                mat[0][1] * mat[1][0] * mat[2][2] +
+                mat[0][2] * mat[1][1] * mat[2][0] );
+
+        /* unscaled inverse (entries of the adjugate); divided by det below */
+        inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
+        inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
+        inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
+        inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
+        inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
+        inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
+        inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
+        inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
+        inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
+
+        /* a determinant that is not safely positive is treated as
+         * singular: the inverse (and hence avcm) becomes zero */
+        if ( det > ALMOST_ZERO )
+        {
+            rtensor_Scale( inv, 1. / det, inv );
+        }
+        else
+        {
+            rtensor_MakeZero( inv );
+        }
+
+        /* Compute the angular velocity about the centre of mass */
+        rtensor_MatVec( data->avcm, inv, data->amcm );
+    }
+
+    /* avcm was computed on MASTER_NODE only; distribute it to all processes */
+    MPI_Bcast( data->avcm, 3, MPI_DOUBLE, MASTER_NODE, comm );
+
+    /* Compute the rotational energy */
+    data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
+
+#if defined(DEBUG)
+    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",
+             data->xcm[0], data->xcm[1], data->xcm[2] );
+    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n",
+             data->vcm[0], data->vcm[1], data->vcm[2] );
+    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n",
+             data->amcm[0], data->amcm[1], data->amcm[2] );
+    /* fprintf( stderr, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
+       mat[0][0], mat[0][1], mat[0][2],
+       mat[1][0], mat[1][1], mat[1][2],
+       mat[2][0], mat[2][1], mat[2][2] );
+       fprintf( stderr, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
+       inv[0][0], inv[0][1], inv[0][2],
+       inv[1][0], inv[1][1], inv[1][2],
+       inv[2][0], inv[2][1], inv[2][2] ); */
+    fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n",
+             data->avcm[0], data->avcm[1], data->avcm[2] );
+#endif
+}
+
+
+/* Kernel: per-atom contribution to the internal pressure.
+ *
+ * my_atoms: atom array; entry i supplies position x and force f
+ * big_box: box passed to Transform_to_UnitBox
+ * int_press: output, one rvec per atom (entries [0, n) are written)
+ * n: number of atoms to process; threads with index >= n do nothing */
+CUDA_GLOBAL void k_compute_pressure( reax_atom *my_atoms, simulation_box *big_box,
+        rvec *int_press, int n )
+{
+    rvec x_unit;
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( tid < n )
+    {
+        rvec_MakeZero( int_press[tid] );
+
+        /* position expressed in unit box coordinates */
+        Transform_to_UnitBox( my_atoms[tid].x, big_box, 1, x_unit );
+
+        /* combine this atom's force with its unit box position */
+        rvec_Multiply( int_press[tid], my_atoms[tid].f, x_unit );
+    }
+}
+
+
+/* Compute the internal, external, kinetic and total pressure of the system
+ * and store them in data (int_press, ext_press, kin_press, tot_press and
+ * iso_bar.P).
+ *
+ * system: supplies the local atom count n, the device atom/box handles
+ *         and big_box (volume and box norms)
+ * control: press_mode decides whether the internal contribution is
+ *          computed on the device (modes 0 and 2) or skipped (mode 1)
+ * data: source of sys_en.e_kin and my_ext_press; receives the results
+ * mpi_data: supplies comm_mesh3D, the communicator used for the reductions
+ *
+ * IMPORTANT: This function assumes that the current kinetic energy of
+ *  the system is already computed
+ *
+ * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs
+ *  to be added when there are long-range interactions or long-range
+ *  corrections to short-range interactions present.
+ *  We may want to add that for more accuracy.
+ */
+void Cuda_Compute_Pressure( reax_system* system, control_params *control,
+        simulation_data* data, mpi_datatypes *mpi_data )
+{
+    int blocks, block_size, blocks_n, blocks_pow_2_n;
+    rvec *rvec_spad;
+    rvec int_press;
+    simulation_box *big_box;
+
+    rvec_spad = (rvec *) scratch;
+    big_box = &(system->big_box);
+
+    /* must start from zero: in ext-only mode (press_mode == 1) the device
+     * computation below is skipped, yet int_press is still printed and
+     * summed across processes */
+    rvec_MakeZero( int_press );
+
+    /* 0: both int and ext, 1: ext only, 2: int only */
+    if ( control->press_mode == 0 || control->press_mode == 2 )
+    {
+        blocks = system->n / DEF_BLOCK_SIZE +
+            ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+        compute_blocks( &blocks_n, &block_size, system->n );
+        compute_nearest_pow_2( blocks_n, &blocks_pow_2_n );
+
+        /* per-atom contributions go to rvec_spad[0, n) */
+        k_compute_pressure <<< blocks, DEF_BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->d_big_box, rvec_spad,
+              system->n );
+
+        /* first reduction pass: n inputs, output starts at rvec_spad + n */
+        k_reduction_rvec <<< blocks_n, block_size, sizeof(rvec) * block_size >>>
+            ( rvec_spad, rvec_spad + system->n,  system->n );
+        cudaDeviceSynchronize( );
+        cudaCheckError( );
+
+        /* second reduction pass: blocks_n inputs, single thread block,
+         * output at rvec_spad + n + blocks_n */
+        k_reduction_rvec <<< 1, blocks_pow_2_n, sizeof(rvec) * blocks_pow_2_n >>>
+            ( rvec_spad + system->n, rvec_spad + system->n + blocks_n, blocks_n );
+        cudaDeviceSynchronize( );
+        cudaCheckError( );
+
+        copy_host_device( &int_press, rvec_spad + system->n + blocks_n, sizeof(rvec),
+                cudaMemcpyDeviceToHost, "Cuda_Compute_Pressure::d_int_press" );
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d:p_int(%10.5f %10.5f %10.5f)p_ext(%10.5f %10.5f %10.5f)\n",
+            system->my_rank, int_press[0], int_press[1], int_press[2],
+            data->my_ext_press[0], data->my_ext_press[1], data->my_ext_press[2] );
+#endif
+
+    /* sum up internal and external pressure */
+    MPI_Allreduce( int_press, data->int_press,
+            3, MPI_DOUBLE, MPI_SUM, mpi_data->comm_mesh3D );
+    MPI_Allreduce( data->my_ext_press, data->ext_press,
+            3, MPI_DOUBLE, MPI_SUM, mpi_data->comm_mesh3D );
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: %10.5f %10.5f %10.5f\n",
+             system->my_rank,
+             data->int_press[0], data->int_press[1], data->int_press[2] );
+    fprintf( stderr, "p%d: %10.5f %10.5f %10.5f\n",
+             system->my_rank,
+             data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+#endif
+
+    /* kinetic contribution */
+    data->kin_press = 2.0 * (E_CONV * data->sys_en.e_kin)
+        / (3.0 * big_box->V * P_CONV);
+
+    /* Calculate total pressure in each direction; each component is
+     * divided by the product of the other two box norms */
+    data->tot_press[0] = data->kin_press -
+        (( data->int_press[0] + data->ext_press[0] ) /
+         ( big_box->box_norms[1] * big_box->box_norms[2] * P_CONV ));
+
+    data->tot_press[1] = data->kin_press -
+        (( data->int_press[1] + data->ext_press[1] ) /
+         ( big_box->box_norms[0] * big_box->box_norms[2] * P_CONV ));
+
+    data->tot_press[2] = data->kin_press -
+        (( data->int_press[2] + data->ext_press[2] ) /
+         ( big_box->box_norms[0] * big_box->box_norms[1] * P_CONV ));
+
+    /* Average pressure for the whole box */
+    data->iso_bar.P =
+        ( data->tot_press[0] + data->tot_press[1] + data->tot_press[2] ) / 3.0;
+}
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.h b/PG-PuReMD/src/cuda/cuda_system_props.h
new file mode 100644
index 0000000000000000000000000000000000000000..9877f6dd4b8b5f26beb5aa00b804cf2aeefaa609
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_system_props.h
@@ -0,0 +1,41 @@
+
+#ifndef __CUDA_SYSTEM_PROPS_H__
+#define __CUDA_SYSTEM_PROPS_H__
+
+#include "../reax_types.h"
+
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
+void dev_compute_total_mass( reax_system *, real * );
+
+void dev_compute_kinetic_energy( reax_system *, simulation_data *, real * );
+
+void dev_compute_momentum( reax_system *, rvec, rvec, rvec );
+
+void dev_compute_inertial_tensor( reax_system *, real *, rvec my_xcm );
+
+void dev_sync_simulation_data( simulation_data * );
+
+//void dev_compute_kinetic_energy( reax_system *, simulation_data *, real * );
+
+void Cuda_Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm );
+
+void Cuda_Generate_Initial_Velocities( reax_system *, real );
+
+void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm );
+
+void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*,
+        mpi_datatypes*, MPI_Comm );
+
+void Cuda_Compute_Pressure( reax_system *, control_params *,
+        simulation_data *, mpi_datatypes * );
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
similarity index 89%
rename from PG-PuReMD/src/cuda_torsion_angles.cu
rename to PG-PuReMD/src/cuda/cuda_torsion_angles.cu
index e9a9b1f01123e38d97b7d2774e5dce16a55ede1c..21cb33d69f63635deda1519f0f320ced49730439 100644
--- a/PG-PuReMD/src/cuda_torsion_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
@@ -19,18 +19,19 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "reax_types.h"
-#include "index_utils.h"
 #include "cuda_torsion_angles.h"
-#include "vector.h"
-#include "dev_list.h"
+
+#include "cuda_list.h"
 #include "cuda_helpers.h"
 
+#include "../index_utils.h"
+#include "../vector.h"
+
 #define MIN_SINE 1e-10
 
+
 CUDA_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij,
-        rvec dvec_jk, real r_jk,
-        rvec dvec_kl, real r_kl,
+        rvec dvec_jk, real r_jk, rvec dvec_kl, real r_kl,
         rvec dvec_li, real r_li,
         three_body_interaction_data *p_ijk, 
         three_body_interaction_data *p_jkl, 
@@ -56,8 +57,7 @@ CUDA_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij,
     rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl );
     unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl );
 
-    omega = atan2( unnorm_sin_omega, unnorm_cos_omega ); 
-
+    omega = ATAN2( unnorm_sin_omega, unnorm_cos_omega ); 
 
     /* derivatives */
     /* coef for adjusments to cos_theta's */
@@ -76,16 +76,24 @@ CUDA_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij,
 
 
     poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl;
-    if( poem < 1e-20 ) poem = 1e-20;
+    if ( poem < 1e-20 )
+    {
+        poem = 1e-20;
+    }
 
     tel  = SQR( r_ij ) + SQR( r_jk ) + SQR( r_kl ) - SQR( r_li ) - 
         2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl + 
                 r_jk * r_kl * cos_jkl );
 
     arg  = tel / poem;
-    if( arg >  1.0 ) arg =  1.0;
-    if( arg < -1.0 ) arg = -1.0;
-
+    if ( arg >  1.0 )
+    {
+        arg =  1.0;
+    }
+    if ( arg < -1.0 )
+    {
+        arg = -1.0;
+    }
 
     /* fprintf( out_control->etor, 
        "%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n",
@@ -107,10 +115,22 @@ CUDA_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij,
        -p_jkl->dcos_dk[0]/sin_jkl, -p_jkl->dcos_dk[1]/sin_jkl, 
        -p_jkl->dcos_dk[2]/sin_jkl );*/
 
-    if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) sin_ijk = MIN_SINE;
-    else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) sin_ijk = -MIN_SINE;
-    if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) sin_jkl = MIN_SINE;
-    else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) sin_jkl = -MIN_SINE;
+    if ( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
+    {
+        sin_ijk = MIN_SINE;
+    }
+    else if ( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
+    {
+        sin_ijk = -MIN_SINE;
+    }
+    if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
+    {
+        sin_jkl = MIN_SINE;
+    }
+    else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
+    {
+        sin_jkl = -MIN_SINE;
+    }
 
     // dcos_omega_di
     rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li );
@@ -140,20 +160,14 @@ CUDA_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij,
 }
 
 
-
-CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, 
-        global_parameters gp, 
-        four_body_header *d_fbp, 
-        control_params *control, 
-        reax_list p_bonds, reax_list p_thb_intrs, 
-        storage p_workspace, 
-        int n, int num_atom_types, 
-        real *data_e_tor, real *data_e_con, 
-        rvec *data_ext_press )
+CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp, 
+        four_body_header *d_fbp, control_params *control, reax_list p_bonds,
+        reax_list p_thb_intrs, storage p_workspace, int n, int num_atom_types, 
+        real *data_e_tor, real *data_e_con, rvec *data_ext_press )
 {
-    int i, j, k, l, pi, pj, pk, pl, pij, plk, natoms;
+    int i, j, k, l, pi, pj, pk, pl, pij, plk;
     int type_i, type_j, type_k, type_l;
-    int start_j, end_j, start_k, end_k;
+    int start_j, end_j;
     int start_pj, end_pj, start_pk, end_pk;
     int num_frb_intrs = 0;
 
@@ -203,15 +217,13 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms,
     // sprintf( fname, "tor%d.out", system->my_rank );
     // ftor = fopen( fname, "w" );
 
-    //natoms = system->n;
-
-    //for( j = 0; j < natoms; ++j ) {
     type_j = my_atoms[j].type;
     Delta_j = workspace->Delta_boc[j];
     start_j = Dev_Start_Index(j, bonds);
     end_j = Dev_End_Index(j, bonds);
 
-    for( pk = start_j; pk < end_j; ++pk ) {
+    for ( pk = start_j; pk < end_j; ++pk )
+    {
         pbond_jk = &( bonds->select.bond_list[pk] );
         k = pbond_jk->nbr;
         bo_jk = &( pbond_jk->bo_data );
@@ -220,16 +232,16 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms,
         /* see if there are any 3-body interactions involving j&k
            where j is the central atom. Otherwise there is no point in
            trying to form a 4-body interaction out of this neighborhood */
-        if( my_atoms[j].orig_id < my_atoms[k].orig_id && 
-                bo_jk->BO > control->thb_cut/*0*/ && Dev_Num_Entries(pk, thb_intrs) ) {
-            start_k = Dev_Start_Index(k, bonds);
-            end_k = Dev_End_Index(k, bonds);               
+        if ( my_atoms[j].orig_id < my_atoms[k].orig_id && 
+                bo_jk->BO > control->thb_cut && Dev_Num_Entries(pk, thb_intrs) )
+        {
             pj = pbond_jk->sym_index; // pj points to j on k's list
 
             /* do the same check as above: 
                are there any 3-body interactions involving k&j 
                where k is the central atom */
-            if( Dev_Num_Entries(pj, thb_intrs) ) {
+            if ( Dev_Num_Entries(pj, thb_intrs) )
+            {
                 type_k = my_atoms[k].type;
                 Delta_k = workspace->Delta_boc[k];
                 r_jk = pbond_jk->d;
@@ -246,16 +258,17 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms,
                 exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk);
                 f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv;
 
-
                 /* pick i up from j-k interaction where j is the central atom */
-                for( pi = start_pk; pi < end_pk; ++pi ) {
+                for ( pi = start_pk; pi < end_pk; ++pi )
+                {
                     p_ijk = &( thb_intrs->select.three_body_list[pi] );
                     pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
                     pbond_ij = &( bonds->select.bond_list[pij] );
                     bo_ij = &( pbond_ij->bo_data );
 
 
-                    if( bo_ij->BO > control->thb_cut/*0*/ ) {
+                    if ( bo_ij->BO > control->thb_cut )
+                    {
                         i = p_ijk->thb;
                         type_i = my_atoms[i].type;
                         r_ij = pbond_ij->d;
@@ -266,17 +279,24 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms,
                         cos_ijk = COS( theta_ijk );
                         //tan_ijk_i = 1. / TAN( theta_ijk );
                         if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) 
+                        {
                             tan_ijk_i = cos_ijk / MIN_SINE;
+                        }
                         else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) 
+                        {
                             tan_ijk_i = cos_ijk / -MIN_SINE;
-                        else tan_ijk_i = cos_ijk / sin_ijk;
+                        }
+                        else
+                        {
+                            tan_ijk_i = cos_ijk / sin_ijk;
+                        }
 
                         exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
                         exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) );
 
 
                         /* pick l up from j-k interaction where k is the central atom */
-                        for( pl = start_pj; pl < end_pj; ++pl ) {
+                        for ( pl = start_pj; pl < end_pj; ++pl ) {
                             p_jkl = &( thb_intrs->select.three_body_list[pl] );
                             l = p_jkl->thb;
                             plk = p_jkl->pthb; //pointer to l on k's bond_list!
@@ -286,10 +306,10 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms,
                             fbh = &(d_fbp[index_fbp (type_i,type_j,type_k,type_l,num_atom_types)]);
                             fbp = &(d_fbp[index_fbp (type_i,type_j,type_k,type_l,num_atom_types)].prm[0]);
 
-
-                            if( i != l && fbh->cnt && 
-                                    bo_kl->BO > control->thb_cut/*0*/ &&
-                                    bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){
+                            if ( i != l && fbh->cnt && 
+                                    bo_kl->BO > control->thb_cut &&
+                                    bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut )
+                            {
                                 ++num_frb_intrs;
                                 r_kl = pbond_kl->d;
                                 BOA_kl = bo_kl->BO - control->thb_cut;
@@ -298,26 +318,29 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms,
                                 sin_jkl = SIN( theta_jkl );
                                 cos_jkl = COS( theta_jkl );
                                 //tan_jkl_i = 1. / TAN( theta_jkl );
-                                if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) 
+                                if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) 
+                                {
                                     tan_jkl_i = cos_jkl / MIN_SINE;
-                                else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) 
+                                }
+                                else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) 
+                                {
                                     tan_jkl_i = cos_jkl / -MIN_SINE;
-                                else tan_jkl_i = cos_jkl /sin_jkl;
+                                }
+                                else
+                                {
+                                    tan_jkl_i = cos_jkl /sin_jkl;
+                                }
 
                                 rvec_ScaledSum( dvec_li, 1., my_atoms[i].x, 
                                         -1., my_atoms[l].x );
                                 r_li = rvec_Norm( dvec_li );                 
 
-
                                 /* omega and its derivative */
-                                omega = Calculate_Omega( pbond_ij->dvec, r_ij, 
-                                        pbond_jk->dvec, r_jk, 
-                                        pbond_kl->dvec, r_kl,
-                                        dvec_li, r_li,
-                                        p_ijk, p_jkl,
+                                omega = Calculate_Omega( pbond_ij->dvec, r_ij,
+                                        pbond_jk->dvec, r_jk, pbond_kl->dvec, r_kl,
+                                        dvec_li, r_li, p_ijk, p_jkl,
                                         dcos_omega_di, dcos_omega_dj,
-                                        dcos_omega_dk, dcos_omega_dl,
-                                        NULL);
+                                        dcos_omega_dk, dcos_omega_dl, NULL );
 
                                 cos_omega = COS( omega );
                                 cos2omega = COS( 2. * omega );
@@ -336,7 +359,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms,
                                         fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
                                         fbp->V3 * (1.0 + cos3omega) );
 
-                                data_e_tor [j] += e_tor = fn10 * sin_ijk * sin_jkl * CV;
+                                e_tor = fn10 * sin_ijk * sin_jkl * CV;
+                                data_e_tor[j] += e_tor;
 
                                 dfn11 = (-p_tor3 * exp_tor3_DjDk +
                                         (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
@@ -369,9 +393,9 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms,
 
                                 /* 4-body conjugation energy */
                                 fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
-                                data_e_con [j] += e_con =
-                                    fbp->p_cot1 * fn12 * 
+                                e_con = fbp->p_cot1 * fn12 * 
                                     (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
+                                data_e_con[j] += e_con;
 
                                 Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * 
                                     (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
@@ -402,9 +426,10 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms,
                                 pbond_jk->ta_CdDelta += CEtors3;
                                 bo_ij->Cdbo += (CEtors4 + CEconj1);
                                 bo_jk->Cdbo += (CEtors5 + CEconj2);
-                                atomicAdd ( &pbond_kl->ta_Cdbo, (CEtors6 + CEconj3));
+                                myatomicAdd( &pbond_kl->ta_Cdbo, CEtors6 + CEconj3 );
 
-                                if( control->virial == 0 ) {
+                                if ( control->virial == 0 )
+                                {
                                     /* dcos_theta_ijk */
                                     //rvec_ScaledAdd( workspace->f[i], 
                                     atomic_rvecScaledAdd( pbond_ij->ta_f, 
@@ -438,7 +463,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms,
                                     atomic_rvecScaledAdd( pbond_kl->ta_f,
                                             CEtors9 + CEconj6, dcos_omega_dl );
                                 }
-                                else {
+                                else
+                                {
                                     ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
 
                                     /* dcos_theta_ijk */
@@ -605,33 +631,37 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms,
 }
 
 CUDA_GLOBAL void Cuda_Torsion_Angles_PostProcess ( reax_atom *my_atoms, 
-        storage p_workspace, 
-        reax_list p_bonds, int N )
+        storage p_workspace, reax_list p_bonds, int N )
 {
     int i, pj;
-
     bond_data *pbond;
     bond_data *sym_index_bond;
     bond_order_data *bo_data;
-
-    reax_list *bonds = &p_bonds;
-    storage *workspace = &p_workspace;
+    reax_list *bonds;
+    storage *workspace;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= N) return;
 
-    for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){
+    if ( i >= N )
+    {
+        return;
+    }
+
+    bonds = &p_bonds;
+    workspace = &p_workspace;
 
+    for ( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj )
+    {
         pbond = &(bonds->select.bond_list[pj]);
         bo_data = &pbond->bo_data;
         sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); 
 
-        workspace->CdDelta [i] += sym_index_bond->ta_CdDelta;
+        workspace->CdDelta[i] += sym_index_bond->ta_CdDelta;
 
         bo_data->Cdbo += pbond->ta_Cdbo;
 
-        //update f vector
-        //rvec_Add (my_atoms [i].f, sym_index_bond->ta_f ); 
+        /* update f vector */
+//        rvec_Add( my_atoms [i].f, sym_index_bond->ta_f ); 
         rvec_Add (workspace->f[i], sym_index_bond->ta_f ); 
     }
 }
diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.h b/PG-PuReMD/src/cuda/cuda_torsion_angles.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7d9c3cb3f21203e02d7bdc52a1c6bb8c6887134
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.h
@@ -0,0 +1,36 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_TORSION_ANGLES_H_
+#define __CUDA_TORSION_ANGLES_H_
+
+#include "../reax_types.h"
+
+
+CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *, global_parameters,
+        four_body_header *, control_params *, reax_list, reax_list,
+        storage, int, int, real *, real *, rvec * );
+
+CUDA_GLOBAL void Cuda_Torsion_Angles_PostProcess( reax_atom *,
+        storage, reax_list, int );
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_utils.cu b/PG-PuReMD/src/cuda/cuda_utils.cu
new file mode 100644
index 0000000000000000000000000000000000000000..22ac8de66c574534d2a170270d3050fda8e24f7c
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_utils.cu
@@ -0,0 +1,168 @@
+#include "cuda_utils.h"
+
+
+/* Allocate size bytes of device memory and optionally zero it.
+ *
+ * ptr: receives the device address
+ * size: number of bytes to allocate
+ * mem_set: if TRUE, the new allocation is filled with zero bytes
+ * msg: name of the resource, used in diagnostics only
+ *
+ * Terminates the program with INSUFFICIENT_MEMORY if the allocation or
+ * the memset fails. */
+extern "C" void cuda_malloc( void **ptr, size_t size, int mem_set, const char *msg )
+{
+    cudaError_t retVal;
+
+    retVal = cudaMalloc( ptr, size );
+
+    if ( retVal != cudaSuccess )
+    {
+        fprintf( stderr, "[ERROR] failed to allocate memory on device for resource %s\n", msg );
+        /* size_t must be printed with %zu; %lu is wrong wherever
+         * unsigned long and size_t differ in width */
+        fprintf( stderr, "    [INFO] CUDA API error code: %d, requested memory size (in bytes): %zu\n",
+                (int) retVal, size );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    if ( mem_set == TRUE )
+    {
+        retVal = cudaMemset( *ptr, 0, size );
+
+        if ( retVal != cudaSuccess )
+        {
+            fprintf( stderr, "[ERROR] failed to memset memory on device for resource %s\n", msg );
+            fprintf( stderr, "    [INFO] CUDA API error code: %d, requested memory size (in bytes): %zu\n",
+                    (int) retVal, size );
+            exit( INSUFFICIENT_MEMORY );
+        }
+    }
+}
+
+
+/* Release device memory obtained from cuda_malloc.
+ *
+ * ptr: device address to release; a NULL pointer is ignored
+ * msg: name of the resource, used in diagnostics only
+ *
+ * A failure is reported as a warning on stderr; the program continues. */
+extern "C" void cuda_free( void *ptr, const char *msg )
+{
+    cudaError_t retVal;
+
+    if ( ptr == NULL )
+    {
+        return;
+    }
+
+    retVal = cudaFree( ptr );
+
+    if ( retVal != cudaSuccess )
+    {
+        fprintf( stderr, "[WARNING] failed to release memory on device for resource %s\n",
+                msg );
+        /* print the address with %p; squeezing a pointer through
+         * long int truncates it where long is narrower than a pointer */
+        fprintf( stderr, "    [INFO] CUDA API error code: %d, memory address: %p\n",
+                (int) retVal, ptr );
+    }
+}
+
+
+/* Fill count bytes of device memory at ptr with the byte value data.
+ * msg names the resource in diagnostics. Terminates the program with
+ * RUNTIME_ERROR if the memset fails. */
+extern "C" void cuda_memset( void *ptr, int data, size_t count, const char *msg )
+{
+    const cudaError_t status = cudaMemset( ptr, data, count );
+
+    if ( status == cudaSuccess )
+    {
+        return;
+    }
+
+    fprintf( stderr, "[ERROR] failed to memset memory on device for resource %s\n", msg );
+    fprintf( stderr, "    [INFO] CUDA API error code: %d\n", status );
+    exit( RUNTIME_ERROR );
+}
+
+
+/* Copy size bytes between host and device memory.
+ *
+ * host: host buffer
+ * dev: device buffer
+ * size: number of bytes to copy
+ * dir: cudaMemcpyHostToDevice copies host -> dev; any other value
+ *      copies dev -> host
+ * msg: name of the resource, used in diagnostics only
+ *
+ * Terminates the program with INSUFFICIENT_MEMORY if the copy fails. */
+extern "C" void copy_host_device( void *host, void *dev, size_t size,
+        cudaMemcpyKind dir, const char *msg )
+{
+    cudaError_t retVal;
+    const char *direction;
+
+    if ( dir == cudaMemcpyHostToDevice )
+    {
+        direction = "from host to device";
+        retVal = cudaMemcpy( dev, host, size, cudaMemcpyHostToDevice );
+    }
+    else
+    {
+        direction = "from device to host";
+        retVal = cudaMemcpy( host, dev, size, cudaMemcpyDeviceToHost );
+    }
+
+    if ( retVal != cudaSuccess )
+    {
+        /* report the direction actually attempted and terminate the
+         * line properly (the message used to end in a literal " n") */
+        fprintf( stderr,
+                "[ERROR] could not copy resource %s %s\n    [INFO] CUDA API error code: %d\n",
+                msg, direction, (int) retVal );
+        exit( INSUFFICIENT_MEMORY );
+    }
+}
+
+
+/* Copy size bytes from device buffer src to device buffer dest.
+ * msg names the resource in diagnostics. Terminates the program with
+ * INSUFFICIENT_MEMORY if the copy fails. */
+extern "C" void copy_device( void *dest, void *src, size_t size, const char *msg )
+{
+    const cudaError_t status = cudaMemcpy( dest, src, size, cudaMemcpyDeviceToDevice );
+
+    if ( status == cudaSuccess )
+    {
+        return;
+    }
+
+    fprintf( stderr,
+            "[ERROR] could not copy resource %s from device to device\n    [INFO] CUDA API error code: %d\n",
+            msg, status );
+    exit( INSUFFICIENT_MEMORY );
+}
+
+
+/* Launch configuration for count work items: *block_size is set to
+ * CUDA_BLOCK_SIZE and *blocks to count / CUDA_BLOCK_SIZE rounded up. */
+extern "C" void compute_blocks( int *blocks, int *block_size, int count )
+{
+    const int num_blocks = (int) CEIL((double) count / CUDA_BLOCK_SIZE);
+
+    *block_size = CUDA_BLOCK_SIZE;
+    *blocks = num_blocks;
+}
+
+
+/* Number of thread blocks of MATVEC_BLOCK_SIZE threads needed when each of
+ * count rows is given MATVEC_KER_THREADS_PER_ROW threads (rounded up). */
+extern "C" void compute_matvec_blocks( int *blocks, int count )
+{
+    const int num_blocks =
+        (int) CEIL((double) count * MATVEC_KER_THREADS_PER_ROW / MATVEC_BLOCK_SIZE);
+
+    *blocks = num_blocks;
+}
+
+
+/* Store in *result the smallest power of two that is >= blocks.
+ *
+ * blocks: value to round up
+ * result: receives the power of two
+ *
+ * Uses exact integer arithmetic instead of log2/exp2. For blocks <= 1 the
+ * result is 1, so the value is always usable as a launch dimension (the
+ * floating point formulation produced 0 for blocks == 0 and an undefined
+ * conversion for negative input). The result saturates at 2^30, the
+ * largest power of two representable in a 32-bit int. */
+extern "C" void compute_nearest_pow_2( int blocks, int *result )
+{
+    int pow_2 = 1;
+
+    while ( pow_2 < blocks && pow_2 < (1 << 30) )
+    {
+        pow_2 <<= 1;
+    }
+
+    *result = pow_2;
+}
+
+
+/* Print the total and free memory of the current device to stderr, in
+ * bytes and in MB. If the query fails, a warning is printed instead. */
+extern "C" void print_device_mem_usage( )
+{
+    /* named free_mem so the libc free() is not shadowed */
+    size_t total_mem, free_mem;
+    cudaError_t retVal;
+
+    retVal = cudaMemGetInfo( &free_mem, &total_mem );
+
+    if ( retVal != cudaSuccess )
+    {
+        fprintf( stderr,
+                "[WARNING] could not get memory usage info from device\n    [INFO] CUDA API error code: %d\n",
+                (int) retVal );
+        return;
+    }
+
+    fprintf( stderr, "Total: %zu bytes (%7.2f MB)\nFree %zu bytes (%7.2f MB)\n",
+            total_mem, (double) total_mem / (1024.0 * 1024.0),
+            free_mem, (double) free_mem / (1024.0 * 1024.0) );
+}
+
+
+/* Initialise the global launch-configuration variables from the atom
+ * counts in system: BLOCKS / BLOCKS_POW_2 from system->n, BLOCKS_N /
+ * BLOCKS_POW_2_N and MATVEC_BLOCKS from system->N, plus BLOCK_SIZE. */
+extern "C" void init_blocks( reax_system *system )
+{
+    /* block counts for n and N work items */
+    compute_blocks( &BLOCKS, &BLOCK_SIZE, system->n );
+    compute_blocks( &BLOCKS_N, &BLOCK_SIZE, system->N );
+
+    /* the same counts rounded up to powers of two */
+    compute_nearest_pow_2( BLOCKS, &BLOCKS_POW_2 );
+    compute_nearest_pow_2( BLOCKS_N, &BLOCKS_POW_2_N );
+
+    /* block count for the matrix-vector kernels */
+    compute_matvec_blocks( &MATVEC_BLOCKS, system->N );
+
+#if defined(__CUDA_DEBUG_LOG__)
+    fprintf( stderr, " MATVEC_BLOCKS: %d BLOCKSIZE: %d  - N:%d \n",
+            MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, system->N );
+#endif
+}
diff --git a/PG-PuReMD/src/cuda/cuda_utils.h b/PG-PuReMD/src/cuda/cuda_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4e49a10b65429f0910f5e374a2f15790ff0c89b
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_utils.h
@@ -0,0 +1,64 @@
+#ifndef __CUDA_UTILS_H_
+#define __CUDA_UTILS_H_
+
+#include "../reax_types.h"
+
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
+void cuda_malloc( void **, size_t, int, const char * );
+
+void cuda_free( void *, const char * );
+
+void cuda_memset( void *, int , size_t , const char * );
+
+void copy_host_device( void *, void *, size_t, enum cudaMemcpyKind, const char * );
+
+void copy_device( void *, void *, size_t, const char * );
+
+void compute_blocks( int *, int *, int );
+
+void compute_matvec_blocks( int *, int );
+
+void compute_nearest_pow_2( int, int * );
+
+void init_blocks( reax_system * );
+
+void print_device_mem_usage( );
+
+
+#ifdef __cplusplus
+/* cudaCheckError( ): check for a pending CUDA runtime error at the call
+ * site; on error, report file, line and error description on stderr and
+ * terminate the program with RUNTIME_ERROR. */
+#define cudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )
+static inline void __cudaCheckError( const char *file, const int line )
+{
+    cudaError err;
+
+    err = cudaGetLastError();
+    if ( cudaSuccess != err )
+    {
+        fprintf( stderr, "[ERROR] runtime error encountered: %s:%d\n", file, line );
+        fprintf( stderr, "    [INFO] CUDA API error code: %d (%s)\n",
+                (int) err, cudaGetErrorString( err ) );
+        exit( RUNTIME_ERROR );
+    }
+
+#if defined(DEBUG_FOCUS)
+    /* More careful checking. However, this will affect performance. */
+    err = cudaDeviceSynchronize( );
+    if ( cudaSuccess != err )
+    {
+        /* report the failure instead of exiting silently */
+        fprintf( stderr, "[ERROR] runtime error encountered during device synchronization: %s:%d\n",
+                file, line );
+        fprintf( stderr, "    [INFO] CUDA API error code: %d (%s)\n",
+                (int) err, cudaGetErrorString( err ) );
+        exit( RUNTIME_ERROR );
+    }
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
new file mode 100644
index 0000000000000000000000000000000000000000..35e07fe011f482e0a6b60a944e4836e50ecf0d88
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
@@ -0,0 +1,619 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "cuda_valence_angles.h"
+
+#include "cuda_list.h"
+
+#include "../index_utils.h"
+#include "../vector.h"
+
+
+/* Compute 3-body (valence angle) interactions, in which the main role is played by atom j,
+   which sits in the middle of the other two atoms i and k; one thread handles one atom j. */
+CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
+        global_parameters gp, single_body_parameters *sbp, three_body_header *d_thbh,
+        control_params *control, storage p_workspace, reax_list p_bonds,
+        reax_list p_thb_intrs, int n, int N, int num_atom_types,
+        real *data_e_ang, real *data_e_pen, real *data_e_coa, rvec *my_ext_press )
+{
+    int i, j, pi, k, pk, t;
+    int type_i, type_j, type_k;
+    int start_j, end_j;
+//    int start_pk, end_pk;
+    int cnt, num_thb_intrs;
+    real temp, temp_bo_jt, pBOjt7;
+    real p_val1, p_val2, p_val3, p_val4, p_val5;
+    real p_val6, p_val7, p_val8, p_val9, p_val10;
+    real p_pen1, p_pen2, p_pen3, p_pen4;
+    real p_coa1, p_coa2, p_coa3, p_coa4;
+    real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
+    real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
+    real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO, vlpadj;
+    real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
+    real CEpen1, CEpen2, CEpen3;
+    real e_ang, e_coa, e_pen;
+    real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
+    real Cf7ij, Cf7jk, Cf8j, Cf9j;
+    real f7_ij, f7_jk, f8_Dj, f9_Dj;
+    real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
+    real BOA_ij, BOA_jk;
+    rvec force, ext_press;
+    three_body_header *thbh;
+    three_body_parameters *thbp;
+    three_body_interaction_data *p_ijk;
+//    three_body_interaction_data *p_kji;
+    bond_data *pbond_ij, *pbond_jk, *pbond_jt;
+    bond_order_data *bo_ij, *bo_jk, *bo_jt;
+    reax_list *bonds;
+    reax_list *thb_intrs;
+    storage *workspace;
+
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+    /* threads whose index lies past the last of the N atoms have nothing to do */
+    if ( j >= N )
+    {
+        return;
+    }
+
+    bonds = &( p_bonds );
+    thb_intrs =  &( p_thb_intrs );
+    workspace = &( p_workspace );
+    /* global parameters used in these calculations */
+    p_val6 = gp.l[14];
+    p_val8 = gp.l[33];
+    p_val9 = gp.l[16];
+    p_val10 = gp.l[17];
+    //num_thb_intrs = j * THREE_BODY_OFFSET;
+    type_j = my_atoms[j].type;
+    start_j = Dev_Start_Index( j, bonds );
+    end_j = Dev_End_Index( j, bonds );
+    p_val3 = sbp[ type_j ].p_val3;
+    p_val5 = sbp[ type_j ].p_val5;
+    SBOp = 0.0;
+    prod_SBO = 1.0;
+    /* sum BO_pi + BO_pi2 and build the product of EXP(-BO^8) over all bonds of atom j */
+    for( t = start_j; t < end_j; ++t )
+    {
+        bo_jt = &(bonds->select.bond_list[t].bo_data);
+        SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
+        temp = SQR( bo_jt->BO );
+        temp *= temp;
+        temp *= temp;
+        prod_SBO *= EXP( -temp );
+    }
+
+    /* modifications to match Adri's code - 09/01/09 */
+    if( workspace->vlpex[j] >= 0 )
+    {
+        vlpadj = 0;
+        dSBO2 = prod_SBO - 1;
+    }
+    else
+    {
+        vlpadj = workspace->nlp[j];
+        dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
+    }
+
+    SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
+    dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
+
+    if( SBO <= 0 )
+    {
+        SBO2 = 0;
+        CSBO2 = 0;
+    }
+    else if( SBO > 0 && SBO <= 1 )
+    {
+        SBO2 = POW( SBO, p_val9 );
+        CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
+    }
+    else if( SBO > 1 && SBO < 2 )
+    {
+        SBO2 = 2 - POW( 2-SBO, p_val9 );
+        CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
+    }
+    else
+    {
+        SBO2 = 2;
+        CSBO2 = 0;
+    }
+
+    expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
+    /* visit every ordered bond pair (pi, pk) of atom j; energies and forces are applied only for pk < pi */
+    for( pi = start_j; pi < end_j; ++pi )
+    {
+        num_thb_intrs = Dev_Start_Index( pi, thb_intrs );
+
+        pbond_ij = &(bonds->select.bond_list[pi]);
+        bo_ij = &(pbond_ij->bo_data);
+        BOA_ij = bo_ij->BO - control->thb_cut;
+
+        if ( BOA_ij > 0.0 &&
+                ( j < n || pbond_ij->nbr < n ) )
+        {
+            i = pbond_ij->nbr;
+            type_i = my_atoms[i].type;
+
+            /* first copy 3-body intrs from previously computed ones where i > k;
+               in the second for-loop below, compute only new 3-body intrs where i < k */
+
+            // The copy loop commented out because strange asynchronous issues started to surface
+            // Each kernel now manually generates everything
+//            for( pk = start_j; pk < pi; ++pk )
+//            {
+//                start_pk = Dev_Start_Index( pk, thb_intrs );
+//                end_pk = Dev_End_Index( pk, thb_intrs );
+//
+//                for( t = start_pk; t < end_pk; ++t )
+//                {
+//                    if( thb_intrs->select.three_body_list[t].thb == i )
+//                    {
+//                        p_ijk = &(thb_intrs->select.three_body_list[num_thb_intrs] );
+//                        p_kji = &(thb_intrs->select.three_body_list[t]);
+//
+//                        p_ijk->thb = bonds->select.bond_list[pk].nbr;
+//                        p_ijk->pthb  = pk;
+//                        p_ijk->theta = p_kji->theta;
+//                        rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
+//                        rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
+//                        rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
+//
+//                        ++num_thb_intrs;
+//                        break;
+//                    }
+//                }
+//            }
+
+            /* and this is the second for loop mentioned above */
+            //for( pk = pi+1; pk < end_j; ++pk ) {
+            // Except that now the loop goes all the way from start_j to end_j
+            for( pk = start_j; pk < end_j; ++pk )
+            {
+                if ( pk == pi )
+                {
+                    continue;
+                }
+
+                pbond_jk = &(bonds->select.bond_list[pk]);
+                bo_jk = &(pbond_jk->bo_data);
+                BOA_jk = bo_jk->BO - control->thb_cut;
+                k = pbond_jk->nbr;
+                type_k = my_atoms[k].type;
+                p_ijk = &( thb_intrs->select.three_body_list[num_thb_intrs] );
+
+                //CHANGE ORIGINAL
+                //if ((BOA_jk <= 0) || ((j >= n) && (k >= n))) continue;
+                if ( BOA_jk <= 0.0 )
+                {
+                    continue;
+                }
+                //CHANGE ORIGINAL
+
+                Calculate_Theta( pbond_ij->dvec, pbond_ij->d,
+                        pbond_jk->dvec, pbond_jk->d,
+                        &theta, &cos_theta );
+
+                Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d,
+                        pbond_jk->dvec, pbond_jk->d,
+                        &(p_ijk->dcos_di), &(p_ijk->dcos_dj),
+                        &(p_ijk->dcos_dk) );
+
+                p_ijk->thb = k;
+                p_ijk->pthb = pk;
+                p_ijk->theta = theta;
+                sin_theta = SIN( theta );
+
+                if ( sin_theta < 1.0e-5 )
+                {
+                    sin_theta = 1.0e-5;
+                }
+
+                ++num_thb_intrs;
+
+                if ( j < n && BOA_jk > 0.0 &&
+                        bo_ij->BO * bo_jk->BO > SQR(control->thb_cut) )
+                {
+                    thbh = &( d_thbh[ index_thbp(type_i, type_j, type_k, num_atom_types) ] );
+
+                    for ( cnt = 0; cnt < thbh->cnt; ++cnt )
+                    {
+                        if ( FABS(thbh->prm[cnt].p_val1) > 0.001 )
+                        {
+                            thbp = &( thbh->prm[cnt] );
+
+                            /* ANGLE ENERGY */
+                            p_val1 = thbp->p_val1;
+                            p_val2 = thbp->p_val2;
+                            p_val4 = thbp->p_val4;
+                            p_val7 = thbp->p_val7;
+                            theta_00 = thbp->theta_00;
+
+                            exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
+                            f7_ij = 1.0 - exp3ij;
+                            Cf7ij = p_val3 * p_val4 * POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
+
+                            exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
+                            f7_jk = 1.0 - exp3jk;
+                            Cf7jk = p_val3 * p_val4 * POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
+
+                            expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
+                            trm8 = 1.0 + expval6 + expval7;
+                            f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
+                            Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
+                                ( p_val6 * expval6 * trm8 -
+                                  (2.0 + expval6) * ( p_val6*expval6 - p_val7*expval7 ) );
+
+                            theta_0 = 180.0 - theta_00 * (1.0 -
+                                    EXP(-p_val10 * (2.0 - SBO2)));
+                            theta_0 = DEG2RAD( theta_0 );
+
+                            expval2theta  = EXP( -p_val2 * SQR(theta_0 - theta) );
+                            if ( p_val1 >= 0 )
+                            {
+                                expval12theta = p_val1 * (1.0 - expval2theta);
+                            }
+                            /* to avoid linear Me-H-Me angles (6/6/06) */
+                            else
+                            {
+                                expval12theta = p_val1 * -expval2theta;
+                            }
+
+                            CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
+                            CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
+                            CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
+                            CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj *
+                                expval2theta * (theta_0 - theta);
+
+                            Ctheta_0 = p_val10 * DEG2RAD(theta_00) *
+                                EXP( -p_val10 * (2.0 - SBO2) );
+
+                            CEval5 = -CEval4 * Ctheta_0 * CSBO2;
+                            CEval6 = CEval5 * dSBO1;
+                            CEval7 = CEval5 * dSBO2;
+                            CEval8 = -CEval4 / sin_theta;
+
+                            if ( pk < pi )
+                            {
+                                e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
+                                data_e_ang[j] += e_ang;
+
+                            }
+                            /* END ANGLE ENERGY*/
+
+                            /* PENALTY ENERGY */
+                            p_pen1 = thbp->p_pen1;
+                            p_pen2 = gp.l[19];
+                            p_pen3 = gp.l[20];
+                            p_pen4 = gp.l[21];
+
+                            exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
+                            exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
+                            exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
+                            exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
+                            trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
+                            f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
+                            Cf9j = ( -p_pen3 * exp_pen3 * trm_pen34 - (2.0 + exp_pen3)
+                                    * ( -p_pen3 * exp_pen3 + p_pen4 * exp_pen4 ) )
+                                / SQR( trm_pen34 );
+
+                            /* e_pen is always evaluated because the CEpen terms below read it; it is
+                               accumulated only for pk < pi so that each angle's energy is counted once */
+                            e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
+                            if ( pk < pi )
+                            {
+                                data_e_pen[j] += e_pen;
+                            }
+
+                            CEpen1 = e_pen * Cf9j / f9_Dj;
+                            temp = -2.0 * p_pen2 * e_pen;
+                            CEpen2 = temp * (BOA_ij - 2.0);
+                            CEpen3 = temp * (BOA_jk - 2.0);
+                            /* END PENALTY ENERGY */
+
+                            /* COALITION ENERGY */
+                            p_coa1 = thbp->p_coa1;
+                            p_coa2 = gp.l[2];
+                            p_coa3 = gp.l[38];
+                            p_coa4 = gp.l[30];
+
+                            exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
+
+                            /* e_coa starts at zero so the CEcoa terms below never read an unset value when pk >= pi */
+                            e_coa = 0.0;
+                            if ( pk < pi )
+                            {
+                                e_coa =
+                                    p_coa1 / (1. + exp_coa2) *
+                                    EXP( -p_coa3 * SQR(workspace->total_bond_order[i] - BOA_ij) ) *
+                                    EXP( -p_coa3 * SQR(workspace->total_bond_order[k] - BOA_jk) ) *
+                                    EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) *
+                                    EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
+                                data_e_coa[j] += e_coa;
+                            }
+                            CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
+                            CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
+                            CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2);
+                            CEcoa4 = -2 * p_coa3 *
+                                (workspace->total_bond_order[i] - BOA_ij) * e_coa;
+                            CEcoa5 = -2 * p_coa3 *
+                                (workspace->total_bond_order[k] - BOA_jk) * e_coa;
+                            /* END COALITION ENERGY */
+
+                            /* FORCES */
+                            // we must again check for pk<pi for entire forces part
+                            if ( pk < pi )
+                            {
+                                bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
+                                bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
+                                workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3);
+//                                workspace->CdDelta[i] += CEcoa4;
+//                                workspace->CdDelta[k] += CEcoa5;
+                                pbond_ij->va_CdDelta += CEcoa4;
+                                pbond_jk->va_CdDelta += CEcoa5;
+
+                                for ( t = start_j; t < end_j; ++t )
+                                {
+                                    pbond_jt = &( bonds->select.bond_list[t] );
+                                    bo_jt = &(pbond_jt->bo_data);
+                                    temp_bo_jt = bo_jt->BO;
+                                    temp = CUBE( temp_bo_jt );
+                                    pBOjt7 = temp * temp * temp_bo_jt;
+
+                                    bo_jt->Cdbo += (CEval6 * pBOjt7);
+                                    bo_jt->Cdbopi += CEval5;
+                                    bo_jt->Cdbopi2 += CEval5;
+                                }
+
+                                if ( control->virial == 0 )
+                                {
+//                                    rvec_ScaledAdd( workspace->f[i], CEval8, p_ijk->dcos_di );
+                                    rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di );
+                                    rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
+//                                    rvec_ScaledAdd( workspace->f[k], CEval8, p_ijk->dcos_dk );
+                                    rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk );
+                                }
+                                else
+                                {
+                                    /* terms not related to bond order derivatives are
+                                       added directly into forces and pressure vector/tensor */
+                                    rvec_Scale( force, CEval8, p_ijk->dcos_di );
+//                                    rvec_Add( workspace->f[i], force );
+                                    rvec_Add( pbond_ij->va_f, force );
+                                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+//                                    rvec_Add( data->my_ext_press, ext_press );
+                                    rvec_Add( my_ext_press[j], ext_press );
+
+                                    rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
+
+                                    rvec_Scale( force, CEval8, p_ijk->dcos_dk );
+//                                    rvec_Add( workspace->f[k], force );
+                                    rvec_Add( pbond_jk->va_f, force );
+                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                    rvec_Add( my_ext_press[j], ext_press );
+                                }
+                            }
+                            /* NOTE(review): the TEST_ENERGY / TEST_FORCES sections below use out_control, system, data and lists, none of which is a parameter or local of this kernel */
+#ifdef TEST_ENERGY
+                            /*fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n",
+                              p_val3, p_val4, BOA_ij, BOA_jk );
+                              fprintf(out_control->eval, "%13.8f%13.8f%13.8f%13.8f%13.8f\n",
+                              workspace->Delta_e[j], workspace->vlpex[j],
+                              dSBO1, dSBO2, vlpadj );
+                              fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n",
+                              f7_ij, f7_jk, f8_Dj, expval12theta );
+                              fprintf( out_control->eval,
+                              "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                              CEval1, CEval2, CEval3, CEval4,
+                              CEval5, CEval6, CEval7, CEval8 );
+
+                              fprintf( out_control->eval,
+                              "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                              p_ijk->dcos_di[0]/sin_theta, p_ijk->dcos_di[1]/sin_theta,
+                              p_ijk->dcos_di[2]/sin_theta,
+                              p_ijk->dcos_dj[0]/sin_theta, p_ijk->dcos_dj[1]/sin_theta,
+                              p_ijk->dcos_dj[2]/sin_theta,
+                              p_ijk->dcos_dk[0]/sin_theta, p_ijk->dcos_dk[1]/sin_theta,
+                              p_ijk->dcos_dk[2]/sin_theta);
+
+                              fprintf( out_control->eval,
+                              "%6d%6d%6d%15.8f%15.8f\n",
+                              system->my_atoms[i].orig_id,
+                              system->my_atoms[j].orig_id,
+                              system->my_atoms[k].orig_id,
+                              RAD2DEG(theta), e_ang );*/
+
+                            fprintf( out_control->eval,
+                                    //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                                    "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n",
+                                    system->my_atoms[i].orig_id,
+                                    system->my_atoms[j].orig_id,
+                                    system->my_atoms[k].orig_id,
+                                    RAD2DEG(theta), theta_0, BOA_ij, BOA_jk,
+                                    e_ang, data->my_en.e_ang );
+
+                            fprintf( out_control->epen,
+                                    //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                                    "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
+                                    system->my_atoms[i].orig_id,
+                                    system->my_atoms[j].orig_id,
+                                    system->my_atoms[k].orig_id,
+                                    RAD2DEG(theta), BOA_ij, BOA_jk, e_pen,
+                                    data->my_en.e_pen );
+
+                            fprintf( out_control->ecoa,
+                                    //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                                    "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
+                                    system->my_atoms[i].orig_id,
+                                    system->my_atoms[j].orig_id,
+                                    system->my_atoms[k].orig_id,
+                                    RAD2DEG(theta), BOA_ij, BOA_jk,
+                                    e_coa, data->my_en.e_coa );
+#endif
+
+#ifdef TEST_FORCES
+                            /* angle forces */
+                            Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
+                            Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
+                            Add_dDelta( system, lists, j,
+                                    CEval3 + CEval7, workspace->f_ang );
+
+                            for( t = start_j; t < end_j; ++t )
+                            {
+                                pbond_jt = &( bonds->select.bond_list[t] );
+                                bo_jt = &(pbond_jt->bo_data);
+                                temp_bo_jt = bo_jt->BO;
+                                temp = CUBE( temp_bo_jt );
+                                pBOjt7 = temp * temp * temp_bo_jt;
+
+                                Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
+                                        workspace->f_ang );
+                                Add_dBOpinpi2( system, lists, j, t, CEval5, CEval5,
+                                        workspace->f_ang, workspace->f_ang );
+                            }
+
+                            rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
+                            rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
+                            rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
+                            /* end angle forces */
+
+                            /* penalty forces */
+                            Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
+                            Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
+                            Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
+                            /* end penalty forces */
+
+                            /* coalition forces */
+                            Add_dBO( system, lists, j, pi, CEcoa1 - CEcoa4,
+                                    workspace->f_coa );
+                            Add_dBO( system, lists, j, pk, CEcoa2 - CEcoa5,
+                                    workspace->f_coa );
+                            Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
+                            Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
+                            Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
+                            /* end coalition forces */
+#endif
+                        }
+                    }
+                }
+            }
+        }
+
+        Dev_Set_End_Index( pi, num_thb_intrs, thb_intrs );
+    }
+}
+
+
+/* For atom i, add to workspace->CdDelta[i] and workspace->f[i] the va_CdDelta / va_f values
+   held by the sym_index counterpart of each of its bonds (accumulated in Cuda_Valence_Angles) */
+CUDA_GLOBAL void Cuda_Valence_Angles_PostProcess( reax_atom *atoms,
+        control_params *control, storage p_workspace,
+        reax_list p_bonds, int N )
+{
+    int i, b, b_end;
+    bond_data *mirror;
+    reax_list *bonds;
+    storage *workspace;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    /* only threads that map to one of the N atoms have work to do */
+    if ( i < N )
+    {
+        bonds = &p_bonds;
+        workspace = &p_workspace;
+        b_end = Dev_End_Index( i, bonds );
+
+        for ( b = Dev_Start_Index( i, bonds ); b < b_end; ++b )
+        {
+            /* NOTE(review): sym_index presumably locates this bond in the neighbor's list -- confirm */
+            mirror = &( bonds->select.bond_list[ bonds->select.bond_list[b].sym_index ] );
+
+            workspace->CdDelta[i] += mirror->va_CdDelta;
+
+            rvec_Add( workspace->f[i], mirror->va_f );
+        }
+    }
+}
+
+
+/* Estimate the num. of three-body interactions
+ *
+ * Each thread takes one atom j and, for every bond pi in j's bond list, stores in
+ * count[pi] the number of other bonds pk of j whose bond order exceeds control->thb_cut.
+ * count[pi] is 0 when bond pi itself is not above the cutoff, or when neither j nor
+ * the bonded neighbor has an index below n.
+ *
+ * my_atoms: not referenced by this kernel
+ * control: supplies thb_cut
+ * p_bonds: bond list, passed by value
+ * n: index bound used in the j / neighbor test described above
+ * N: number of atoms to process; threads with j >= N do nothing
+ * count: output array indexed by bond list position
+ */
+CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms,
+        control_params *control, reax_list p_bonds, int n, int N, int *count )
+{
+    int j;
+    int pi, pk;
+    int first, last;
+    int tally;
+    real BOA_ij, BOA_jk;
+    bond_data *bond_ij;
+    reax_list *bonds;
+
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+
+    /* threads that do not map to one of the N atoms have nothing to count */
+    if ( j < N )
+    {
+        bonds = &( p_bonds );
+        first = Dev_Start_Index( j, bonds );
+        last = Dev_End_Index( j, bonds );
+
+        for ( pi = first; pi < last; ++pi )
+        {
+            tally = 0;
+            bond_ij = &( bonds->select.bond_list[pi] );
+            BOA_ij = bond_ij->bo_data.BO - control->thb_cut;
+
+            /* bond i-j must be above the cutoff, and either j or
+               its bonded neighbor must have an index below n */
+            if ( BOA_ij > 0.0
+                    && ( j < n || bond_ij->nbr < n ) )
+            {
+                for ( pk = first; pk < last; ++pk )
+                {
+                    BOA_jk = bonds->select.bond_list[pk].bo_data.BO
+                        - control->thb_cut;
+
+                    /* every other bond of j that is not at or below the cutoff counts once */
+                    if ( pk != pi && !( BOA_jk <= 0.0 ) )
+                    {
+                        ++tally;
+                    }
+                }
+            }
+
+            count[ pi ] = tally;
+        }
+    }
+}
diff --git a/PG-PuReMD/src/cuda_valence_angles.h b/PG-PuReMD/src/cuda/cuda_valence_angles.h
similarity index 55%
rename from PG-PuReMD/src/cuda_valence_angles.h
rename to PG-PuReMD/src/cuda/cuda_valence_angles.h
index 13603db4ee1e117294b1c1c882ee54c19c074390..d8abac25aa5b3198faf59784bff5626a8797a32d 100644
--- a/PG-PuReMD/src/cuda_valence_angles.h
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.h
@@ -19,24 +19,18 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __VALENCE_ANGLES_H_
-#define __VALENCE_ANGLES_H_
-
-#include "reax_types.h"
-#include "reax_types.h"
-#include "vector.h"
-
-CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *,
-                                      global_parameters ,
-                                      single_body_parameters *,
-                                      three_body_header *,
-                                      control_params *,
-                                      storage ,
-                                      reax_list , reax_list ,
-                                      int, int , int ,
-                                      real *, real *, real *,
-                                      rvec *
-                                    );
+#ifndef __CUDA_VALENCE_ANGLES_H_
+#define __CUDA_VALENCE_ANGLES_H_
+
+#include "../reax_types.h"
+
+#include "../vector.h"
+
+
+CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *, global_parameters,
+        single_body_parameters *, three_body_header *, control_params *,
+        storage, reax_list, reax_list, int, int, int, real *,
+        real *, real *, rvec *);
 
 CUDA_GLOBAL void Cuda_Valence_Angles_PostProcess ( reax_atom *, control_params *,
         storage , reax_list, int );
@@ -45,27 +39,33 @@ CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *, control_params *,
         reax_list , int , int, int *);
 
 
-/* calculates the theta angle between i-j-k */
-CUDA_DEVICE inline void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
-        real *theta, real *cos_theta )
+/* calculates theta = ACOS(cos_theta) for atom triplet i-j-k (vertex at j); cos_theta is clamped to [-1, 1] first */
+CUDA_DEVICE static inline void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk,
+        real d_jk, real *theta, real *cos_theta )
 {
     (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk );
-    if ( *cos_theta > 1. ) *cos_theta  = 1.0;
-    if ( *cos_theta < -1. ) *cos_theta  = -1.0;
+
+    if ( *cos_theta > 1. )
+    {
+        *cos_theta  = 1.0;
+    }
+    if ( *cos_theta < -1. )
+    {
+        *cos_theta  = -1.0;
+    }
 
     (*theta) = ACOS( *cos_theta );
 }
 
 
-/* calculates the derivative of the cosine of the angle between i-j-k */
-CUDA_DEVICE inline void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
-        rvec* dcos_theta_di,
-        rvec* dcos_theta_dj,
+/* calculates the derivative of the cosine of the angle between atom triplet i-j-k */
+CUDA_DEVICE static inline void Calculate_dCos_Theta( rvec dvec_ji, real d_ji,
+        rvec dvec_jk, real d_jk, rvec* dcos_theta_di, rvec* dcos_theta_dj,
         rvec* dcos_theta_dk )
 {
     int t;
-    real sqr_d_ji = SQR(d_ji);
-    real sqr_d_jk = SQR(d_jk);
+    real sqr_d_ji = SQR( d_ji );
+    real sqr_d_jk = SQR( d_jk );
     real inv_dists = 1.0 / (d_ji * d_jk);
     real inv_dists3 = POW( inv_dists, 3 );
     real dot_dvecs = Dot( dvec_ji, dvec_jk, 3 );
@@ -74,12 +74,13 @@ CUDA_DEVICE inline void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec
     for ( t = 0; t < 3; ++t )
     {
         (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists -
-                              Cdot_inv3 * sqr_d_jk * dvec_ji[t];
+            Cdot_inv3 * sqr_d_jk * dvec_ji[t];
         (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists +
-                              Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] );
+            Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] );
         (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists -
-                              Cdot_inv3 * sqr_d_ji * dvec_jk[t];
+            Cdot_inv3 * sqr_d_ji * dvec_jk[t];
     }
 }
 
+
 #endif
diff --git a/PG-PuReMD/src/validation.cu b/PG-PuReMD/src/cuda/cuda_validation.cu
similarity index 89%
rename from PG-PuReMD/src/validation.cu
rename to PG-PuReMD/src/cuda/cuda_validation.cu
index b614fc890b1280c9687fc8edd0f60d3fdaca2a05..34ebf6e5e3c04aa037fa8f9919b1b5e867fcf783 100644
--- a/PG-PuReMD/src/validation.cu
+++ b/PG-PuReMD/src/cuda/cuda_validation.cu
@@ -1,52 +1,72 @@
-#include "validation.h"
+
+#include "cuda_validation.h"
+
 #include "cuda_utils.h"
-#include "list.h"
-#include "reax_types.h"
 
-#include "index_utils.h"
-#include "vector.h"
+#include "../index_utils.h"
+#include "../list.h"
+#include "../tool_box.h"
+#include "../vector.h"
 
 
-bool check_zero (real p1, real p2)
+bool check_zero( real p1, real p2 ) /* true = MISMATCH: FABS(p1 - p2) >= GPU_TOLERANCE */
 {
-    if (abs (p1 - p2) >= GPU_TOLERANCE)
+    if ( FABS(p1 - p2) >= GPU_TOLERANCE )
+    {
         return true;
+    }
     else 
+    {   /* within tolerance; callers test !check_zero(...) to accept a match */
         return false;
+    }
 }
 
 
-bool check_zero (rvec p1, rvec p2)
+bool check_zero( rvec p1, rvec p2 ) /* true = MISMATCH in at least one of the 3 components */
 {
 
-    if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) ||
-            ((abs (p1[1] - p2[1])) >= GPU_TOLERANCE) ||
-            ((abs (p1[2] - p2[2])) >= GPU_TOLERANCE ))
+    if ( ((FABS(p1[0] - p2[0])) >= GPU_TOLERANCE) ||
+            ((FABS(p1[1] - p2[1])) >= GPU_TOLERANCE) ||
+            ((FABS(p1[2] - p2[2])) >= GPU_TOLERANCE) )
+    {
         return true;
-    else return false;
+    }
+    else
+    {   /* every component is within GPU_TOLERANCE */
+        return false;
+    }
 }
 
 
-bool check_zero_rvec2 (rvec2 p1, rvec2 p2)
+bool check_zero_rvec2( rvec2 p1, rvec2 p2 ) /* true = MISMATCH in component 0 or component 1 */
 {
 
-    if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) ||
-            ((abs (p1[1] - p2[1])) >= GPU_TOLERANCE ))
+    if ( ((FABS(p1[0] - p2[0])) >= GPU_TOLERANCE) ||
+            ((FABS(p1[1] - p2[1])) >= GPU_TOLERANCE) )
+    {
         return true;
-    else return false;
+    }
+    else
+    {   /* both components are within GPU_TOLERANCE */
+        return false;
+    }
 }
 
 
-bool check_same (ivec p1, ivec p2)
+bool check_same( ivec p1, ivec p2 ) /* true only when all three components of p1 and p2 are equal */
 {
-    if ( (p1[0] == p2[0]) || (p1[1] == p2[1]) || (p1[2] == p2[2]) )
+    if ( (p1[0] == p2[0]) && (p1[1] == p2[1]) && (p1[2] == p2[2]) ) /* was ||: one equal component sufficed */
+    {
         return true;
+    }
     else 
+    {
         return false;
+    }
 }
 
 
-void print_bond_data (bond_order_data *s)
+void print_bond_data( bond_order_data *s )
 {
     /*   
          fprintf (stderr, "Bond_Order_Data BO (%f ) BO_s (%f ) BO_pi (%f ) BO_pi2 (%f ) ", 
@@ -55,13 +75,13 @@ void print_bond_data (bond_order_data *s)
          s->BO_pi,
          s->BO_pi2 );
      */
-    fprintf (stderr, " Cdbo (%e) ", s->Cdbo );
-    fprintf (stderr, " Cdbopi (%e) ", s->Cdbopi );
-    fprintf (stderr, " Cdbopi2 (%e) ", s->Cdbopi2 );
+    fprintf( stderr, " Cdbo (%e) ", s->Cdbo );
+    fprintf( stderr, " Cdbopi (%e) ", s->Cdbopi );
+    fprintf( stderr, " Cdbopi2 (%e) ", s->Cdbopi2 );
 }
 
 
-int validate_neighbors (reax_system *system, reax_list **lists)
+int validate_neighbors( reax_system *system, reax_list **lists )
 {
     reax_list *far_nbrs = *lists + FAR_NBRS;
     reax_list *d_nbrs = *dev_lists + FAR_NBRS;
@@ -408,26 +428,34 @@ int validate_sparse_matrix( reax_system *system, storage *workspace )
         }
     }
 
-    fprintf (stderr, "Sparse Matrix mismatch total: %d, miscount %d  \n", total, count);
-    free (test.start);
-    free (test.end);
-    free (test.entries);
+    fprintf( stderr, "Sparse Matrix mismatch total: %d, miscount %d  \n",
+            total, count );
+    free( test.start );
+    free( test.end );
+    free( test.entries );
     return SUCCESS;
 }
 
-bool print_hbonds (int *d_start, int *d_end, int i, hbond_data *data)
+/* prints start/end/count of the range [d_start[i], d_end[i]) and the nbr and scl of each entry in it */
+void print_hbonds( int *d_start, int *d_end, int i, hbond_data *data )
 {
+    int j;
     hbond_data src, tgt; 
 
-    fprintf (stderr, " start %d end %d count ---> %d \n", d_start[i], d_end[i], d_end[i] - d_start[i]);   
+    fprintf( stderr, " start %d end %d count ---> %d \n",
+            d_start[i], d_end[i], d_end[i] - d_start[i] );
 
-    for (int j = d_start[i]; j < d_end[i]; j++) 
-        fprintf (stderr, "Atom : %d , Hbond Info . nbr: %d scl: %d index:%d\n", i, data[j].nbr, data[j].scl);
-    fprintf (stderr, " ========================================= \n");
+    for ( j = d_start[i]; j < d_end[i]; j++ )
+    {
+        fprintf( stderr, "Atom : %d , Hbond Info . nbr: %d scl: %d\n", /* 3 conversions for 3 arguments; the old 4th %d had none */
+                i, data[j].nbr, data[j].scl );
+    }
+    fprintf( stderr, " ========================================= \n" );
 }
 
 
-int validate_hbonds (reax_system *system, storage *workspace, reax_list **lists)
+int validate_hbonds( reax_system *system, storage *workspace,
+        reax_list **lists )
 {
     int count, nbr, sym_count, dev_count;
     int *d_start, *d_end, index, d_index;
@@ -452,7 +480,7 @@ int validate_hbonds (reax_system *system, storage *workspace, reax_list **lists)
     sym_count = 0;
     for (int i = 0; i < system->n; i++) {
 
-        if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond == 1 )
+        if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond == H_ATOM )
         {
             count += End_Index (i, hbonds) - Start_Index (i, hbonds);
             dev_count += d_end [i] - d_start[i];
@@ -463,8 +491,8 @@ int validate_hbonds (reax_system *system, storage *workspace, reax_list **lists)
                         d_start[i], d_end[ i],
                         Start_Index (i, hbonds),
                         End_Index (i, hbonds) );
-                print_hbonds (d_start, d_end, i, data);
-                print_hbonds (hbonds->index, hbonds->end_index, i, hbonds->select.hbond_list);
+                print_hbonds( d_start, d_end, i, data );
+                print_hbonds( hbonds->index, hbonds->end_index, i, hbonds->select.hbond_list );
                 exit (-1);
             }
         }
@@ -476,13 +504,13 @@ int validate_hbonds (reax_system *system, storage *workspace, reax_list **lists)
     sym_count = 0;
 
     for (int i = system->n; i < system->N; i++) {
-        //if (system->reax_param.sbp[ system->my_atoms[i].type].p_hbond == 2)
+        //if (system->reax_param.sbp[ system->my_atoms[i].type].p_hbond == H_BONDING_ATOM )
         {
             sym_count += d_end[i] - d_start[i];
         }
     }
     fprintf (stderr, "Sym count outside 'n' : %d \n", sym_count );
-    //print_hbonds (d_start, d_end, 0, data);
+    //print_hbonds( d_start, d_end, 0, data );
 
 
     count = 0;
@@ -495,7 +523,7 @@ int validate_hbonds (reax_system *system, storage *workspace, reax_list **lists)
            d_end[d_index] - d_start[d_index]);
          */
 
-        if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond != 1 )
+        if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond != H_ATOM )
         {
             /*
                int x;
@@ -575,11 +603,11 @@ int validate_hbonds (reax_system *system, storage *workspace, reax_list **lists)
             if ( k >= (End_Index (i, hbonds) )){
                 fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, j);
                 fprintf (stderr, " ==========Host============ \n");
-                print_hbonds (hbonds->index, hbonds->end_index, 
-                        i, hbonds->select.hbond_list);
+                print_hbonds( hbonds->index, hbonds->end_index,
+                        i, hbonds->select.hbond_list );
                 fprintf (stderr, " ==========Device============ \n");
-                print_hbonds (d_start, d_end, 
-                        i, data);
+                print_hbonds( d_start, d_end,
+                        i, data );
                 exit (-1);
             }
         }
@@ -763,7 +791,7 @@ int validate_bonds (reax_system *system, storage *workspace, reax_list **lists)
     return SUCCESS;
 }
 
-int validate_workspace (reax_system *system, storage *workspace)
+int validate_workspace( reax_system *system, storage *workspace )
 {
     int miscount;
     int count, tcount;
@@ -838,13 +866,15 @@ int validate_workspace (reax_system *system, storage *workspace)
             count ++;
         }    
     }
-    free (deltap_boc);
+    free( deltap_boc );
     fprintf (stderr, "Deltap_boc mismatch count %d\n", count);
 
 
     rvec *dDeltap_self;
-    dDeltap_self = (rvec *) calloc (system->N, sizeof (rvec) );
-    copy_host_device (dDeltap_self, dev_workspace->dDeltap_self, system->N * sizeof (rvec), cudaMemcpyDeviceToHost, "ddeltap_self");
+    dDeltap_self = (rvec *) scalloc( system->N, sizeof (rvec),
+            "validate_workspace::dDeltap_self" );
+    copy_host_device( dDeltap_self, dev_workspace->dDeltap_self,
+            system->N * sizeof (rvec), cudaMemcpyDeviceToHost, "ddeltap_self" );
 
     count = 0; 
     for (int i = 0; i < system->N; i++ )
@@ -1086,18 +1116,18 @@ int validate_workspace (reax_system *system, storage *workspace)
     /////////////////////////////////////////////////////
     //QEq part
     /////////////////////////////////////////////////////
-    compare_rvec2 (workspace->d2, dev_workspace->d2, system->N, "d2");
+    compare_rvec2( workspace->d2, dev_workspace->d2, system->N, "d2" );
 
-    compare_rvec2 (workspace->q2, dev_workspace->q2, system->N, "q2");
+    compare_rvec2( workspace->q2, dev_workspace->q2, system->N, "q2" );
 
-    compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
+    compare_rvec2( workspace->x, dev_workspace->x, system->N, "x" );
 
-    compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
+    compare_rvec2( workspace->b, dev_workspace->b, system->N, "b" );
 
     return SUCCESS;
 }
 
-void compare_rvec2( rvec2 *host, rvec2 *device, int N, char *msg)
+void compare_rvec2( rvec2 *host, rvec2 *device, int N, const char *msg)
 {
     int count = 0;
     int miscount = 0;
@@ -1117,7 +1147,7 @@ void compare_rvec2( rvec2 *host, rvec2 *device, int N, char *msg)
     fprintf (stderr, "%s match between host and device (%d - %d) \n", msg, count, miscount);
 }
 
-void compare_array( real *host, real *device, int N, char *msg)
+void compare_array( real *host, real *device, int N, const char *msg )
 {
     int count = 0;
     int miscount = 0;
@@ -1462,84 +1492,84 @@ int validate_three_bodies (reax_system *system, storage *workspace, reax_list **
            print_bond_data (src);
            fprintf (stderr, "\n");
 
-//fprintf (stderr, "--- Device bo is %f \n", test[j]);
-fprintf (stderr, "Device %d %d bonds (%d %d) - Host %d %d bonds (%d %d) \n", start[j], end[j],b_start[i], b_end[i],
-Start_Index (j, three), End_Index (j, three), Start_Index (i, bonds), End_Index (i, bonds));
-fprintf (stderr, "Host %d Device %d -- atom %d index %d \n", hcount, dcount, i, j);
-fprintf (stderr, "------\n");
-}
-fprintf (stderr, " Three Bodies count does not match between host and device \n");
-exit (-1);
-}
+        //fprintf (stderr, "--- Device bo is %f \n", test[j]);
+        fprintf (stderr, "Device %d %d bonds (%d %d) - Host %d %d bonds (%d %d) \n", start[j], end[j],b_start[i], b_end[i],
+        Start_Index (j, three), End_Index (j, three), Start_Index (i, bonds), End_Index (i, bonds));
+        fprintf (stderr, "Host %d Device %d -- atom %d index %d \n", hcount, dcount, i, j);
+        fprintf (stderr, "------\n");
+        }
+        fprintf (stderr, " Three Bodies count does not match between host and device \n");
+        exit (-1);
+        }
          */
-}
-fprintf (stderr, "Three body count on DEVICE %d  HOST %d -- miscount: %d\n", dcount, hcount, count);
+    }
+    fprintf (stderr, "Three body count on DEVICE %d  HOST %d -- miscount: %d\n", dcount, hcount, count);
 
-count = 0;
-for (int i = 0; i < system->N; i++)
-{
-    int x, y, z;
-    for (x = b_start[i]; x < b_end[i]; x++)
+    count = 0;
+    for (int i = 0; i < system->N; i++)
     {
-        int t_start = start[x];
-        int t_end = end[x];
-
-        bond_data *dev_bond = &d_bond_data [x];
-        bond_data *host_bond;
-        for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++)
+        int x, y, z;
+        for (x = b_start[i]; x < b_end[i]; x++)
         {
-            host_bond = &bonds->select.bond_list [z];
-            if ((dev_bond->nbr == host_bond->nbr) &&
-                    check_same (dev_bond->rel_box, host_bond->rel_box) &&
-                    !check_zero (dev_bond->dvec, host_bond->dvec) &&
-                    !check_zero (dev_bond->d, host_bond->d) )
+            int t_start = start[x];
+            int t_end = end[x];
+
+            bond_data *dev_bond = &d_bond_data [x];
+            bond_data *host_bond;
+            for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++)
             {
-                break;
+                host_bond = &bonds->select.bond_list [z];
+                if ((dev_bond->nbr == host_bond->nbr) &&
+                        check_same (dev_bond->rel_box, host_bond->rel_box) &&
+                        !check_zero (dev_bond->dvec, host_bond->dvec) &&
+                        !check_zero (dev_bond->d, host_bond->d) )
+                {
+                    break;
+                }
+            }
+            if (z >= End_Index (i, bonds)){
+                fprintf (stderr, "Could not find the matching bond on host and device \n");
+                exit (-1);
             }
-        }
-        if (z >= End_Index (i, bonds)){
-            fprintf (stderr, "Could not find the matching bond on host and device \n");
-            exit (-1);
-        }
 
-        //find this three-body in the bonds on the host side.
-        for (y = t_start; y < t_end; y++)
-        {
-            three_body_interaction_data *device = data + y;
-            three_body_interaction_data *host;
+            //find this three-body in the bonds on the host side.
+            for (y = t_start; y < t_end; y++)
+            {
+                three_body_interaction_data *device = data + y;
+                three_body_interaction_data *host;
 
-            //fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb);
+                //fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb);
 
-            int xx;
-            for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++)
-            {
-                host = &three->select.three_body_list [xx];
-                //fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb);
-                //if ((host->thb == device->thb) && (host->pthb == device->pthb))
-                if ((host->thb == device->thb) && !check_zero (host->theta, device->theta))
+                int xx;
+                for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++)
                 {
-                    count ++;
-                    break;
+                    host = &three->select.three_body_list [xx];
+                    //fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb);
+                    //if ((host->thb == device->thb) && (host->pthb == device->pthb))
+                    if ((host->thb == device->thb) && !check_zero (host->theta, device->theta))
+                    {
+                        count ++;
+                        break;
+                    }
                 }
-            }
 
-            if ( xx >= End_Index (z, three) ) {
-                fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z,
-                        Start_Index (z, three), End_Index (z, three), start[x], end[x] );
-                exit (-1);
-            }// else fprintf (stderr, "----------------- \n");
+                if ( xx >= End_Index (z, three) ) {
+                    fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z,
+                            Start_Index (z, three), End_Index (z, three), start[x], end[x] );
+                    exit (-1);
+                }// else fprintf (stderr, "----------------- \n");
+            }
         }
     }
-}
-free (data);
-free (start);
-free (end);
-free (b_start);
-free (b_end);
-free (d_bond_data);
-
-fprintf (stderr, "Three Body Interaction Data MATCH on device and HOST --> %d \n", count);
-return SUCCESS;
+    free (data);
+    free (start);
+    free (end);
+    free (b_start);
+    free (b_end);
+    free (d_bond_data);
+
+    fprintf (stderr, "Three Body Interaction Data MATCH on device and HOST --> %d \n", count);
+    return SUCCESS;
 }
 
 
@@ -1613,16 +1643,16 @@ int print_sparse_matrix (sparse_matrix *H)
     sparse_matrix test;
     int index, count;
 
-    test.start = (int *) malloc (sizeof (int) * (H->cap)); 
-    test.end = (int *) malloc (sizeof (int) * (H->cap)); 
+    test.start = (int *) malloc (sizeof (int) * (H->n)); 
+    test.end = (int *) malloc (sizeof (int) * (H->n)); 
 
     test.entries = (sparse_matrix_entry *) malloc (sizeof (sparse_matrix_entry) * (H->m));
     memset (test.entries, 0xFF, sizeof (sparse_matrix_entry) * H->m);
 
     copy_host_device ( test.entries, dev_workspace->H.entries, 
             sizeof (sparse_matrix_entry) * H->m, cudaMemcpyDeviceToHost, "H:m");
-    copy_host_device ( test.start, dev_workspace->H.start, sizeof (int)* (H->cap), cudaMemcpyDeviceToHost, "H:start");
-    copy_host_device ( test.end , dev_workspace->H.end, sizeof (int) * (H->cap), cudaMemcpyDeviceToHost, "H:end");
+    copy_host_device ( test.start, dev_workspace->H.start, sizeof (int)* (H->n), cudaMemcpyDeviceToHost, "H:start");
+    copy_host_device ( test.end , dev_workspace->H.end, sizeof (int) * (H->n), cudaMemcpyDeviceToHost, "H:end");
 
     count = 0; 
     for (int i = 0; i < 1; i++) {
@@ -1674,51 +1704,65 @@ int print_device_rvec2 (rvec2 *b, int n)
 }
 
 
-int print_host_array (real *a, int n)
+int print_host_array( real *a, int n ) /* prints a[0..n-1] to stderr, one entry per line; returns SUCCESS */
 {
+    int i;
 
-    for (int i = 0; i < n; i++)
+    for ( i = 0; i < n; i++ )
     {
-        fprintf (stderr," a[%d] = %f \n", i, a[i]);
+        fprintf( stderr," a[%d] = %f \n", i, a[i] );
     }
-    fprintf(stderr, " ----------------------------------\n");
+    fprintf( stderr, " ----------------------------------\n" ); /* separator after the last entry */
 
     return SUCCESS;
 }
 
 
-int print_device_array (real *a, int n)
+int print_device_array( real *a, int n ) /* copies n reals from device pointer a to the host and prints them */
 {
     real *b = (real *) host_scratch;
-    copy_host_device (b, a, sizeof (real) * n, cudaMemcpyDeviceToHost, "real");
-    print_host_array (b, n);
+    copy_host_device( b, a, sizeof(real) * n, /* assumes host_scratch holds >= n reals -- TODO confirm */
+            cudaMemcpyDeviceToHost, "real");
+    print_host_array( b, n );
 
     return SUCCESS;
 }
 
 
-int check_zeros_host (rvec2 *host, int n, char *msg)
+int check_zeros_host( rvec2 *host, int n, const char *msg ) /* msg is the label printed with the counts */
 {
-    int count, count1;
-    count = count1 = 0;
-    for (int i = 0; i < n; i++){
-        if (host[i][0] == 0) count ++;
-        if (host[i][1] == 0) count1 ++;
+    int i, count, count1;
+
+    count = 0;
+    count1 = 0;
+    /* count entries that are exactly 0, separately for component 0 and component 1 */
+    for ( i = 0; i < n; i++ )
+    {
+        if (host[i][0] == 0)
+        {
+            count++;
+        }
+        if (host[i][1] == 0)
+        {
+            count1++;
+        }
     }
 
-    fprintf (stderr, "%s has %d, %d zero elements \n", msg, count, count1 );
+    fprintf( stderr, "%s has %d, %d zero elements \n",
+            msg, count, count1 );
 
-    return 1;
+    return SUCCESS;
 }
 
 
-int check_zeros_device (rvec2 *device, int n, char *msg)
+int check_zeros_device( rvec2 *device, int n, const char *msg ) /* device-side variant of check_zeros_host */
 {
     rvec2 *a = (rvec2 *) host_scratch;    
 
-    copy_host_device (a, device, sizeof (rvec2) * n, cudaMemcpyDeviceToHost, msg);
+    copy_host_device( a, device, sizeof(rvec2) * n, /* assumes host_scratch holds >= n rvec2 -- TODO confirm */
+            cudaMemcpyDeviceToHost, msg );
 
-    check_zeros_host (a, n, msg);
+    check_zeros_host( a, n, msg ); /* count and report zeros in the host copy */
 
-    return 1;
+    return SUCCESS;
 }
diff --git a/PG-PuReMD/src/cuda/cuda_validation.h b/PG-PuReMD/src/cuda/cuda_validation.h
new file mode 100644
index 0000000000000000000000000000000000000000..7faa773ba5567bdc9c8d6bc24caee06ff2f9be44
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_validation.h
@@ -0,0 +1,62 @@
+/* Prototypes for the validate_*, print_*, compare_* and check_zeros_* routines.
+ * They get C linkage when this header is included from C++ (extern "C" below). */
+#ifndef __CUDA_VALIDATION_H__
+#define __CUDA_VALIDATION_H__
+
+#include "../reax_types.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int validate_neighbors( reax_system *, reax_list **lists );
+
+int validate_sym_dbond_indices( reax_system *system,
+        storage *workspace, reax_list **lists );
+
+int validate_bonds( reax_system *, storage *, reax_list ** );
+
+int validate_hbonds( reax_system *, storage *, reax_list ** );
+
+int validate_sparse_matrix( reax_system *, storage * );
+
+int validate_grid( reax_system * );
+
+int validate_workspace( reax_system *, storage * );
+
+int validate_data( reax_system *, simulation_data * );
+
+int validate_three_bodies( reax_system *, storage *,
+        reax_list ** );
+
+int validate_atoms( reax_system *, reax_list ** );
+
+int print_sparse_matrix( sparse_matrix *H );
+
+int print_sparse_matrix_host( sparse_matrix *H );
+
+int print_host_rvec2( rvec2 *, int );
+
+int print_device_rvec2( rvec2 *, int );
+
+int print_host_array( real *, int );
+
+int print_device_array( real *, int );
+
+void compare_rvec2( rvec2 *host, rvec2 *device, int N,
+        const char *msg );
+
+void compare_array( real *host, real *device, int N,
+        const char *msg );
+
+int check_zeros_host( rvec2 *host, int n, const char * );
+
+int check_zeros_device( rvec2 *device, int n, const char * );
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* __CUDA_VALIDATION_H__ */
diff --git a/PG-PuReMD/src/cuda_qEq.h b/PG-PuReMD/src/cuda/cuda_vector.h
similarity index 73%
rename from PG-PuReMD/src/cuda_qEq.h
rename to PG-PuReMD/src/cuda/cuda_vector.h
index 287302abecaeb761d7615e25a9d3b624ae21dcd7..ef526938835945733143e35bc9ce8866959ca580 100644
--- a/PG-PuReMD/src/cuda_qEq.h
+++ b/PG-PuReMD/src/cuda/cuda_vector.h
@@ -19,23 +19,23 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __QEq_H_
-#define __QEq_H_
+#ifndef __CUDA_VECTOR_H_
+#define __CUDA_VECTOR_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void Cuda_Init_MatVec (reax_system *, storage *);
+#include "cuda_random.h"
 
-void cuda_charges_x (reax_system *, rvec2 );
-void cuda_charges_st (reax_system *, storage *, real *, real );
-void cuda_charges_updateq (reax_system *, real *);
 
-#ifdef __cplusplus
+CUDA_DEVICE static inline void cuda_rvec_Random( rvec v )
+{
+//    v[0] = Cuda_Random( 2.0 ) - 1.0;
+//    v[1] = Cuda_Random( 2.0 ) - 1.0;
+//    v[2] = Cuda_Random( 2.0 ) - 1.0;
+    v[0] = 0.0; /* NOTE(review): the Cuda_Random draws above are commented out, so v is zeroed, not randomized */
+    v[1] = 0.0;
+    v[2] = 0.0;
 }
-#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/cuda_bond_orders.cu b/PG-PuReMD/src/cuda_bond_orders.cu
deleted file mode 100644
index 05257c9409894b28399aaaff0f789758da592426..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_bond_orders.cu
+++ /dev/null
@@ -1,820 +0,0 @@
-
-#include "cuda_bond_orders.h"
-
-#include "dev_list.h"
-#include "index_utils.h"
-#include "bond_orders.h"
-#include "cuda_utils.h"
-#include "reduction.h"
-
-CUDA_GLOBAL void Cuda_Calculate_BO_init (  reax_atom *my_atoms, 
-        single_body_parameters *sbp, 
-        storage p_workspace, 
-        int N )
-{
-    int i, type_i;
-    single_body_parameters *sbp_i;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-
-    storage *workspace = & (p_workspace);
-
-    /* Calculate Deltaprime, Deltaprime_boc values */
-    type_i = my_atoms[i].type;
-    sbp_i = &(sbp[type_i]);
-    workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency;
-    workspace->Deltap_boc[i] = 
-        workspace->total_bond_order[i] - sbp_i->valency_val;
-    workspace->total_bond_order[i] = 0; 
-}
-
-CUDA_GLOBAL void Cuda_Calculate_BO (  reax_atom *my_atoms, global_parameters gp, 
-        single_body_parameters *sbp, two_body_parameters *tbp, 
-        storage p_workspace, reax_list p_bonds, 
-        int num_atom_types, int N )
-{
-    int i, j, pj, type_i, type_j;
-    int start_i, end_i, sym_index, num_bonds;
-    real val_i, Deltap_i, Deltap_boc_i;
-    real val_j, Deltap_j, Deltap_boc_j;
-    real f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
-    real exp_p1i,   exp_p2i, exp_p1j, exp_p2j;
-    real temp, u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
-    real Cf45_ij, Cf45_ji, p_lp1; //u_ij, u_ji
-    real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
-    real explp1, p_boc1, p_boc2;
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    bond_order_data *bo_ij, *bo_ji;
-
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-
-    storage *workspace = & (p_workspace);
-    reax_list *bonds = &(p_bonds);
-
-    num_bonds = 0; 
-    p_boc1 = gp.l[0];
-    p_boc2 = gp.l[1];
-
-    /* Calculate Deltaprime, Deltaprime_boc values */
-    /*
-    //for( i = 0; i < system->N; ++i ) {
-    type_i = my_atoms[i].type;
-    sbp_i = &(sbp[type_i]);
-    workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency;
-    workspace->Deltap_boc[i] = 
-    workspace->total_bond_order[i] - sbp_i->valency_val;
-
-    //fprintf( stdout, "%d(%d) %24.15f\n", 
-    //     i, workspace->bond_mark[i], workspace->total_bond_order[i] );
-    workspace->total_bond_order[i] = 0; 
-    //}
-     */
-
-    /* Corrected Bond Order calculations */
-    //for( i = 0; i < system->N; ++i ) {
-    type_i = my_atoms[i].type;
-    sbp_i = &(sbp[type_i]);
-    val_i = sbp_i->valency;
-    Deltap_i = workspace->Deltap[i];
-    Deltap_boc_i = workspace->Deltap_boc[i];
-    start_i = Dev_Start_Index(i, bonds);
-    end_i = Dev_End_Index(i, bonds);
-    // fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n",
-    //       i+1, Deltap_i, Deltap_boc_i, start_i, end_i );
-    for( pj = start_i; pj < end_i; ++pj ) {
-        j = bonds->select.bond_list[pj].nbr;
-        type_j = my_atoms[j].type;
-        bo_ij = &( bonds->select.bond_list[pj].bo_data );
-        // fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
-
-        //TODO
-        //TODO
-        //TODO
-        //TODO
-        //TODO
-        //if( i < j || workspace->bond_mark[j] > 3 ) {
-        if( i < j ) {
-            twbp = &( tbp[ index_tbp (type_i, type_j, num_atom_types)] );
-
-#ifdef TEST_FORCES
-            Set_Start_Index( pj, top_dbo, dBOs );
-            /* fprintf( stderr, "%6d%6d%12.6f%12.6f%12.6f\n", 
-               workspace->reverse_map[i], workspace->reverse_map[j],
-               twbp->ovc, twbp->v13cor, bo_ij->BO ); */
-#endif
-
-            if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) {
-                /* There is no correction to bond orders nor to derivatives
-                   of bond order prime! So we leave bond orders unchanged and
-                   set derivative of bond order coefficients such that 
-                   dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */
-                bo_ij->C1dbo = 1.000000;
-                bo_ij->C2dbo = 0.000000;
-                bo_ij->C3dbo = 0.000000;
-
-                bo_ij->C1dbopi = bo_ij->BO_pi;
-                bo_ij->C2dbopi = 0.000000;
-                bo_ij->C3dbopi = 0.000000;
-                bo_ij->C4dbopi = 0.000000;
-
-                bo_ij->C1dbopi2 = bo_ij->BO_pi2;
-                bo_ij->C2dbopi2 = 0.000000;
-                bo_ij->C3dbopi2 = 0.000000;
-                bo_ij->C4dbopi2 = 0.000000;
-
-#ifdef TEST_FORCES
-                pdbo = &(dBOs->select.dbo_list[ top_dbo ]);
-
-                // compute dBO_ij/dr_i
-                pdbo->wrt = i;
-                rvec_Copy( pdbo->dBO, bo_ij->dBOp );
-                rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi );
-                rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2);
-
-                // compute dBO_ij/dr_j
-                pdbo++;
-                pdbo->wrt = j;
-                rvec_Scale( pdbo->dBO, -1.0, bo_ij->dBOp );
-                rvec_Scale( pdbo->dBOpi, -bo_ij->BO_pi, bo_ij->dln_BOp_pi );
-                rvec_Scale(pdbo->dBOpi2, -bo_ij->BO_pi2, bo_ij->dln_BOp_pi2);
-
-                top_dbo += 2;
-#endif
-            }
-            else {
-                val_j = sbp[type_j].valency;
-                Deltap_j = workspace->Deltap[j];
-                Deltap_boc_j = workspace->Deltap_boc[j];
-
-                /* on page 1 */
-                if( twbp->ovc >= 0.001 ) {
-                    /* Correction for overcoordination */
-                    exp_p1i = EXP( -p_boc1 * Deltap_i );
-                    exp_p2i = EXP( -p_boc2 * Deltap_i );
-                    exp_p1j = EXP( -p_boc1 * Deltap_j );
-                    exp_p2j = EXP( -p_boc2 * Deltap_j );
-
-                    f2 = exp_p1i + exp_p1j;
-                    f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i  + exp_p2j ) );
-                    f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) +
-                            ( val_j + f2 )/( val_j + f2 + f3 ) );
-
-
-                    /*fprintf( stderr,"%d %d\t%g %g   j:%g %g  p_boc:%g %g\n"
-                      "\tf:%g  %g  %g, exp:%g %g %g %g\n", 
-                      i+1, j+1, 
-                      val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2,
-                      f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/
-
-                    /* Now come the derivates */
-                    /* Bond Order pages 5-7, derivative of f1 */
-                    temp = f2 + f3;
-                    u1_ij = val_i + temp;
-                    u1_ji = val_j + temp;
-                    Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) +
-                            1.0 / SQR( u1_ji ));
-                    Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) +
-                            ( u1_ji - f3 ) / SQR( u1_ji ));
-
-                    //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + 
-                    //          Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j );
-                    Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij -
-                            ((val_i+f2) / SQR(u1_ij)) *
-                            ( -p_boc1 * exp_p1i +
-                              exp_p2i / ( exp_p2i + exp_p2j ) ) +
-                            -p_boc1 * exp_p1i / u1_ji -
-                            ((val_j+f2) / SQR(u1_ji)) *
-                            ( -p_boc1 * exp_p1i +
-                              exp_p2i / ( exp_p2i + exp_p2j ) ));
-
-
-                    Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j +
-                        Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j );
-
-                    //fprintf( stderr, "\tCf1:%g  %g\n", Cf1_ij, Cf1_ji );
-                }
-                else {
-                    /* No overcoordination correction! */
-                    f1 = 1.0;
-                    Cf1_ij = Cf1_ji = 0.0;
-                }
-
-                if( twbp->v13cor >= 0.001 ) {
-                    /* Correction for 1-3 bond orders */
-                    exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
-                                Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
-                    exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
-                                Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
-
-                    f4 = 1. / (1. + exp_f4);
-                    f5 = 1. / (1. + exp_f5);
-                    f4f5 = f4 * f5;
-
-                    /* Bond Order pages 8-9, derivative of f4 and f5 */
-                    /*temp = twbp->p_boc5 - 
-                      twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO );
-                      u_ij = temp + twbp->p_boc3 * Deltap_boc_i;
-                      u_ji = temp + twbp->p_boc3 * Deltap_boc_j;
-                      Cf45_ij = Cf45( u_ij, u_ji ) / f4f5;
-                      Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/
-                    Cf45_ij = -f4 * exp_f4;
-                    Cf45_ji = -f5 * exp_f5;
-                }
-                else {
-                    f4 = f5 = f4f5 = 1.0;
-                    Cf45_ij = Cf45_ji = 0.0;
-                }
-
-                /* Bond Order page 10, derivative of total bond order */
-                A0_ij = f1 * f4f5;
-                A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO *
-                    (Cf45_ij + Cf45_ji);
-                A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
-                A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
-                A3_ij = A2_ij + Cf1_ij / f1;
-                A3_ji = A2_ji + Cf1_ji / f1;
-
-                /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f" 
-                  "A2_ij: %f A2_ji: %f, A3_ij: %f, A3_ji: %f\n",
-                  bo_ij->BO, 
-                  A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/
-
-
-                /* find corrected bond orders and their derivative coef */
-                bo_ij->BO    = bo_ij->BO    * A0_ij;
-                bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1;
-                bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1;
-                bo_ij->BO_s  = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 );
-
-                bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
-                bo_ij->C2dbo = bo_ij->BO * A2_ij;
-                bo_ij->C3dbo = bo_ij->BO * A2_ji;
-
-                bo_ij->C1dbopi = f1*f1*f4*f5;
-                bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
-                bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
-                bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
-
-                bo_ij->C1dbopi2 = f1*f1*f4*f5;
-                bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
-                bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
-                bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji;
-
-                //CHANGE ORIGINAL
-            }
-            //CHANGE ORIGINAL
-
-            /* neglect bonds that are < 1e-10 */
-            if( bo_ij->BO < 1e-10 )
-                bo_ij->BO = 0.0;
-            if( bo_ij->BO_s < 1e-10 )
-                bo_ij->BO_s = 0.0;
-            if( bo_ij->BO_pi < 1e-10 )
-                bo_ij->BO_pi = 0.0;
-            if( bo_ij->BO_pi2 < 1e-10 )
-                bo_ij->BO_pi2 = 0.0;
-
-            workspace->total_bond_order[i] += bo_ij->BO; //now keeps total_BO
-
-
-            /* fprintf( stderr, "%d %d\t%g %g %g %g\n"
-               "Cdbo:\t%g %g %g\n"
-               "Cdbopi:\t%g %g %g %g\n"
-               "Cdbopi2:%g %g %g %g\n\n", 
-               i+1, j+1, 
-               bonds->select.bond_list[ pj ].d, 
-               bo_ij->BO,bo_ij->BO_pi, bo_ij->BO_pi2, 
-               bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo,
-               bo_ij->C1dbopi, bo_ij->C2dbopi, 
-               bo_ij->C3dbopi, bo_ij->C4dbopi,
-               bo_ij->C1dbopi2,bo_ij->C2dbopi2, 
-               bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */
-
-            /* fprintf( stderr, "%d %d  BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n",
-               i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 );*/
-
-#ifdef TEST_FORCES
-            Set_End_Index( pj, top_dbo, dBOs );
-            Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta );
-#endif
-            //CHANGE ORIGINAL
-            //}
-            //CHANGE ORIGINAL
-            /*
-               else {
-            // We only need to update bond orders from bo_ji
-            //   everything else is set in uncorrected_bo calculations
-            sym_index = bonds->select.bond_list[pj].sym_index;
-            bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data);
-            bo_ij->BO = bo_ji->BO;
-            bo_ij->BO_s = bo_ji->BO_s;
-            bo_ij->BO_pi = bo_ji->BO_pi;
-            bo_ij->BO_pi2 = bo_ji->BO_pi2;
-
-            workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO
-#ifdef TEST_FORCES
-Add_dBO( system, lists, j, sym_index, 1.0, workspace->dDelta );
-#endif
-}
-             */
-            }
-}
-//} COMMENTED FOR CUDA KERNEL
-}
-
-CUDA_GLOBAL void Cuda_Update_Uncorrected_BO (  storage p_workspace, reax_list p_bonds, int N )
-{
-    int i, j, pj;
-    int start_i, end_i;
-    int sym_index;
-    storage *workspace = &( p_workspace );
-    reax_list *bonds = &( p_bonds );
-
-    bond_order_data *bo_ij, *bo_ji;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-
-    start_i = Dev_Start_Index(i, bonds);
-    end_i = Dev_End_Index(i, bonds);
-
-    for( pj = start_i; pj < end_i; ++pj ) {
-
-        j = bonds->select.bond_list[pj].nbr;
-        bo_ij = &( bonds->select.bond_list[pj].bo_data );
-
-        //if( (i >= j)  || (workspace->bond_mark [i] <= 3)) {
-        if( (i >= j) ) {
-
-            /* We only need to update bond orders from bo_ji
-               everything else is set in uncorrected_bo calculations */
-            sym_index = bonds->select.bond_list[pj].sym_index;
-            bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data);
-            bo_ij->BO = bo_ji->BO;
-            bo_ij->BO_s = bo_ji->BO_s;
-            bo_ij->BO_pi = bo_ji->BO_pi;
-            bo_ij->BO_pi2 = bo_ji->BO_pi2;
-
-            workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO
-        }
-    }
-    }
-
-    CUDA_GLOBAL void Cuda_Update_Workspace_After_BO ( reax_atom *my_atoms, global_parameters gp, 
-            single_body_parameters *sbp, storage p_workspace, 
-            int N)
-    {
-        int j, type_j;
-        real explp1;
-        real p_lp1;
-        single_body_parameters *sbp_i, *sbp_j;
-        storage *workspace = &( p_workspace );
-
-        j = blockIdx.x * blockDim.x + threadIdx.x;
-        if (j >= N) return;
-
-        p_lp1 = gp.l[15];
-        /* Calculate some helper variables that are  used at many places
-           throughout force calculations */
-        //for( j = 0; j < system->N; ++j ){
-        type_j = my_atoms[j].type;
-        sbp_j = &(sbp[ type_j ]);
-
-        workspace->Delta[j] = workspace->total_bond_order[j] - sbp_j->valency;
-        workspace->Delta_e[j] = workspace->total_bond_order[j] - sbp_j->valency_e;
-        workspace->Delta_boc[j] = workspace->total_bond_order[j] -
-            sbp_j->valency_boc;
-
-        workspace->vlpex[j] = workspace->Delta_e[j] -
-            2.0 * (int)(workspace->Delta_e[j]/2.0);
-        explp1 = EXP(-p_lp1 * SQR(2.0 + workspace->vlpex[j]));
-        workspace->nlp[j] = explp1 - (int)(workspace->Delta_e[j] / 2.0);
-        workspace->Delta_lp[j] = sbp_j->nlp_opt - workspace->nlp[j];
-        workspace->Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace->vlpex[j]);
-        /* Adri uses different dDelta_lp values than the ones in notes... */
-        workspace->dDelta_lp[j] = workspace->Clp[j];
-        //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
-        //((fabs(workspace->Delta_e[j]/2.0 -
-        //       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
-
-        if( sbp_j->mass > 21.0 ) {
-            workspace->nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
-            workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
-            workspace->dDelta_lp_temp[j] = 0.;
-        }
-        else {
-            workspace->nlp_temp[j] = workspace->nlp[j];
-            workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
-            workspace->dDelta_lp_temp[j] = workspace->Clp[j];
-        }
-        //} Commented for Cuda
-    }
-
-
-    CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, simulation_data *data,
-            storage *workspace, reax_list *bonds, rvec data_ext_press)
-    {
-        bond_data *nbr_j, *nbr_k;
-        bond_order_data *bo_ij, *bo_ji;
-        dbond_coefficients coef;
-        rvec temp, ext_press;
-        ivec rel_box;
-        int pk, k, j;
-        rvec tf_f;
-
-        /* Initializations */
-        nbr_j = &(bonds->select.bond_list[pj]);
-        j = nbr_j->nbr;
-
-        //bo_ij = &(nbr_j->bo_data);
-        //bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-        if (i < j) {
-            bo_ij = &(nbr_j->bo_data);
-            bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-        } else {
-            bo_ji = &(nbr_j->bo_data);
-            bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-        }
-
-        coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-        coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-        coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-
-        coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-        coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-        coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-        coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-
-        coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-        coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-        coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-        coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-
-        coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-        coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-        coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-
-
-        /************************************
-         * forces related to atom i          *
-         * first neighbors of atom i         *
-         ************************************/
-        if (i < j) {
-            for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) {
-                nbr_k = &(bonds->select.bond_list[pk]);
-                k = nbr_k->nbr;
-
-                rvec_MakeZero (nbr_k->tf_f);
-
-                rvec_Scale(temp, -coef.C2dbo, nbr_k->bo_data.dBOp);       /*2nd, dBO*/
-                rvec_ScaledAdd(temp, -coef.C2dDelta, nbr_k->bo_data.dBOp);/*dDelta*/
-                rvec_ScaledAdd(temp, -coef.C3dbopi, nbr_k->bo_data.dBOp); /*3rd, dBOpi*/
-                rvec_ScaledAdd(temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp);/*3rd, dBOpi2*/
-
-                /* force */
-                rvec_Add( nbr_k->tf_f, temp );
-                /* pressure */
-                rvec_iMultiply( ext_press, nbr_k->rel_box, temp );
-                rvec_Add( data_ext_press, ext_press );
-
-                /* if( !ivec_isZero( nbr_k->rel_box ) )
-                   fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]"
-                   "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
-                   i+1, system->my_atoms[i].x[0], 
-                   system->my_atoms[i].x[1], system->my_atoms[i].x[2], 
-                   j+1, k+1, system->my_atoms[k].x[0], 
-                   system->my_atoms[k].x[1], system->my_atoms[k].x[2],
-                   nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2],
-                   nbr_k->rel_box[0], nbr_k->rel_box[1], nbr_k->rel_box[2],
-                   temp[0], temp[1], temp[2] ); */
-            }
-
-            /* then atom i itself  */
-            rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp );                      /*1st,dBO*/
-            rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] );   /*2nd,dBO*/
-            rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp );               /*1st,dBO*/
-            rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd,dBO*/
-            rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi );        /*1st,dBOpi*/
-            rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp );              /*2nd,dBOpi*/
-            rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i]);/*3rd,dBOpi*/
-
-            rvec_ScaledAdd( temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );  /*1st,dBO_pi2*/
-            rvec_ScaledAdd( temp, coef.C2dbopi2, bo_ij->dBOp );         /*2nd,dBO_pi2*/
-            rvec_ScaledAdd( temp, coef.C3dbopi2, workspace->dDeltap_self[i] );/*3rd*/
-
-            /* force */
-            rvec_Add( workspace->f[i], temp );
-            /* ext pressure due to i is dropped, counting force on j will be enough */
-        }
-        else {
-
-            /******************************************************
-             * forces and pressure related to atom j               * 
-             * first neighbors of atom j                           *
-             ******************************************************/
-            for( pk = Dev_Start_Index(j, bonds); pk < Dev_End_Index(j, bonds); ++pk ) {
-                nbr_k = &(bonds->select.bond_list[pk]);
-                k = nbr_k->nbr;
-
-                rvec_MakeZero (nbr_k->tf_f);
-
-                rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp );      /*3rd,dBO*/
-                rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp);/*dDelta*/
-                rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp); /*4th,dBOpi*/
-                rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp);/*4th,dBOpi2*/
-
-                /* force */
-                rvec_Add( nbr_k->tf_f, temp );
-                /* pressure */
-                if( k != i ) {
-                    ivec_Sum( rel_box, nbr_k->rel_box, nbr_j->rel_box ); //rel_box(k, i)
-                    rvec_iMultiply( ext_press, rel_box, temp );
-                    rvec_Add( data_ext_press, ext_press );
-
-                    /* if( !ivec_isZero( rel_box ) )
-                       fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]"
-                       "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
-                       i+1, j+1, system->my_atoms[j].x[0], 
-                       system->my_atoms[j].x[1], system->my_atoms[j].x[2], 
-                       k+1, system->my_atoms[k].x[0], 
-                       system->my_atoms[k].x[1], system->my_atoms[k].x[2],
-                       nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2],
-                       rel_box[0], rel_box[1], rel_box[2],
-                       temp[0], temp[1], temp[2] ); */
-                }
-            }
-
-            /* then atom j itself */
-            rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp );                    /*1st, dBO*/
-            rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] );  /*2nd, dBO*/
-            rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp );             /*1st, dBO*/
-            rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j]);/*2nd, dBO*/
-
-            rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi );       /*1st,dBOpi*/
-            rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp );             /*2nd,dBOpi*/
-            rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j]);/*3rd,dBOpi*/
-
-            rvec_ScaledAdd( temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );    /*1st,dBOpi2*/
-            rvec_ScaledAdd( temp, -coef.C2dbopi2, bo_ij->dBOp );           /*2nd,dBOpi2*/
-            rvec_ScaledAdd( temp,coef.C4dbopi2,workspace->dDeltap_self[j]);/*3rd,dBOpi2*/
-
-            /* force */
-            rvec_Add( workspace->f[j], temp );
-            /* pressure */
-            rvec_iMultiply( ext_press, nbr_j->rel_box, temp );
-            rvec_Add( data->my_ext_press, ext_press );
-
-            /* if( !ivec_isZero( nbr_j->rel_box ) )
-               fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]" 
-               "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
-               i+1, system->my_atoms[i].x[0], system->my_atoms[i].x[1], 
-               system->my_atoms[i].x[2], 
-               j+1,system->my_atoms[j].x[0], system->my_atoms[j].x[1], 
-               system->my_atoms[j].x[2],
-               j+1, nbr_j->dvec[0], nbr_j->dvec[1], nbr_j->dvec[2],
-               nbr_j->rel_box[0], nbr_j->rel_box[1], nbr_j->rel_box[2],
-               temp[0], temp[1], temp[2] ); */
-        }
-    }
-
-    CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj,
-            storage *workspace, reax_list *bonds )
-    {
-        bond_data *nbr_j, *nbr_k;
-        bond_order_data *bo_ij, *bo_ji;
-        dbond_coefficients coef;
-        int pk, k, j;
-
-        rvec tf_f;
-        rvec_MakeZero (tf_f);
-
-        /* Initializations */
-        nbr_j = &(bonds->select.bond_list[pj]);
-        j = nbr_j->nbr;
-        //bo_ij = &(nbr_j->bo_data);
-        //bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-
-        if (i < j) {
-            bo_ij = &(nbr_j->bo_data);
-            bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-        } else {
-            bo_ji = &(nbr_j->bo_data);
-            bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-        }
-
-        coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-        coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-        coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-
-        coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-        coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-        coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-        coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-
-        coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-        coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-        coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-        coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-
-        coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-        coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-        coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-
-        if (i < j) {
-            for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) {
-                nbr_k = &(bonds->select.bond_list[pk]);
-                k = nbr_k->nbr;
-                rvec_MakeZero (tf_f);
-
-                /*2nd,dBO*/
-                rvec_ScaledAdd( tf_f, -coef.C2dbo, nbr_k->bo_data.dBOp );
-                /*dDelta*/
-                rvec_ScaledAdd( tf_f, -coef.C2dDelta, nbr_k->bo_data.dBOp );
-                /*3rd, dBOpi*/
-                rvec_ScaledAdd( tf_f, -coef.C3dbopi, nbr_k->bo_data.dBOp );
-                /*3rd, dBOpi2*/
-                rvec_ScaledAdd( tf_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
-
-                //Temp storage
-                rvec_Add (nbr_k->tf_f, tf_f);
-            }
-            /*1st, dBO*/
-            rvec_ScaledAdd( workspace->f[i], coef.C1dbo, bo_ij->dBOp );
-            /*2nd, dBO*/
-            rvec_ScaledAdd( workspace->f[i], coef.C2dbo, workspace->dDeltap_self[i] );
-
-            /*1st, dBO*/
-            rvec_ScaledAdd( workspace->f[i], coef.C1dDelta, bo_ij->dBOp );
-            /*2nd, dBO*/
-            rvec_ScaledAdd( workspace->f[i], coef.C2dDelta, workspace->dDeltap_self[i] );
-
-            /*1st, dBOpi*/
-            rvec_ScaledAdd( workspace->f[i], coef.C1dbopi, bo_ij->dln_BOp_pi );
-            /*2nd, dBOpi*/
-            rvec_ScaledAdd( workspace->f[i], coef.C2dbopi, bo_ij->dBOp );
-            /*3rd, dBOpi*/
-            rvec_ScaledAdd( workspace->f[i], coef.C3dbopi, workspace->dDeltap_self[i] );
-
-            /*1st, dBO_pi2*/
-            rvec_ScaledAdd( workspace->f[i], coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
-            /*2nd, dBO_pi2*/
-            rvec_ScaledAdd( workspace->f[i], coef.C2dbopi2, bo_ij->dBOp );
-            /*3rd, dBO_pi2*/
-            rvec_ScaledAdd( workspace->f[i], coef.C3dbopi2, workspace->dDeltap_self[i] );
-
-        } else {
-
-            for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) {
-                nbr_k = &(bonds->select.bond_list[pk]);
-                k = nbr_k->nbr;
-                rvec_MakeZero (tf_f);
-
-                /*3rd, dBO*/
-                rvec_ScaledAdd( tf_f, -coef.C3dbo, nbr_k->bo_data.dBOp );
-                /*dDelta*/
-                rvec_ScaledAdd( tf_f, -coef.C3dDelta, nbr_k->bo_data.dBOp );
-                /*4th, dBOpi*/
-                rvec_ScaledAdd( tf_f, -coef.C4dbopi, nbr_k->bo_data.dBOp );
-                /*4th, dBOpi2*/
-                rvec_ScaledAdd( tf_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
-
-                //Temp Storage
-                rvec_Add (nbr_k->tf_f, tf_f);
-            }
-
-            /*1st,dBO*/
-            rvec_ScaledAdd( workspace->f[i], -coef.C1dbo, bo_ij->dBOp );
-            /*2nd,dBO*/
-            rvec_ScaledAdd( workspace->f[i], coef.C3dbo, workspace->dDeltap_self[i] );
-
-            /*1st, dBO*/
-            rvec_ScaledAdd( workspace->f[i], -coef.C1dDelta, bo_ij->dBOp );
-            /*2nd, dBO*/
-            rvec_ScaledAdd( workspace->f[i], coef.C3dDelta, workspace->dDeltap_self[i] );
-
-            /*1st, dBOpi*/
-            rvec_ScaledAdd( workspace->f[i], -coef.C1dbopi, bo_ij->dln_BOp_pi );
-            /*2nd, dBOpi*/
-            rvec_ScaledAdd( workspace->f[i], -coef.C2dbopi, bo_ij->dBOp );
-            /*3rd, dBOpi*/
-            rvec_ScaledAdd( workspace->f[i], coef.C4dbopi, workspace->dDeltap_self[i] );
-
-            /*1st, dBOpi2*/
-            rvec_ScaledAdd( workspace->f[i], -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
-            /*2nd, dBOpi2*/
-            rvec_ScaledAdd( workspace->f[i], -coef.C2dbopi2, bo_ij->dBOp );
-            /*3rd, dBOpi2*/
-            rvec_ScaledAdd( workspace->f[i], coef.C4dbopi2, workspace->dDeltap_self[i] );
-        }
-    }
-
-    CUDA_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, reax_list *bonds, storage *workspace)
-    {
-        int pk;
-        bond_data *nbr_k, *nbr_k_sym;
-
-        for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) {
-            nbr_k = &(bonds->select.bond_list[pk]);
-            nbr_k_sym = &( bonds->select.bond_list [nbr_k->sym_index] );
-
-            //rvec_Add (atoms[i].f, nbr_k_sym->tf_f);
-            rvec_Add (workspace->f[i], nbr_k_sym->tf_f);
-        }
-    }
-
-    CUDA_GLOBAL void ker_total_forces_postprocess (reax_atom *my_atoms, reax_list p_bonds, storage p_workspace,  int N)
-    {
-        int i = blockIdx.x * blockDim.x + threadIdx.x;
-        if (i >= N) return;
-
-        reax_list *bonds = &( p_bonds );
-        storage *workspace = &( p_workspace );
-        Cuda_dbond_to_Forces_postprocess (i, my_atoms, bonds, workspace );
-    }
-
-    CUDA_GLOBAL void ker_total_forces (storage p_workspace, reax_list p_bonds, 
-            control_params *control,
-            simulation_data *data, 
-            rvec *data_ext_press,
-            int N )
-    {
-        int i = blockIdx.x * blockDim.x + threadIdx.x;
-        if (i >= N) return;
-
-        int pj;
-        reax_list *bonds = &( p_bonds );
-        storage *workspace = &( p_workspace );
-
-        for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj )
-            //if( i < bonds->select.bond_list[pj].nbr ) {
-            if( control->virial == 0 )
-                Cuda_Add_dBond_to_Forces( i, pj, workspace, bonds);
-            else 
-                Cuda_Add_dBond_to_Forces_NPT( i, pj, data, workspace, bonds, data_ext_press [i]);
-        //}  
-    }
-
-    void Cuda_Total_Forces (reax_system *system, control_params *control, 
-            simulation_data *data, storage *workspace)
-    {
-        int blocks;
-        rvec *spad_rvec = (rvec *) scratch;
-        cuda_memset (spad_rvec, 0, system->N * 2 * sizeof (rvec), "total_forces:ext_press");
-
-        blocks = system->N / DEF_BLOCK_SIZE + 
-            ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-        ker_total_forces <<< blocks, DEF_BLOCK_SIZE >>>
-            ( *dev_workspace, *(*dev_lists + BONDS), 
-              (control_params *) control->d_control_params, 
-              (simulation_data *)data->d_simulation_data, 
-              spad_rvec, system->N );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-
-        if (control->virial != 0) 
-        {
-            //do the reduction here for ext press
-            k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec) * DEF_BLOCK_SIZE >>> 
-                ( spad_rvec, spad_rvec + system->N, system->N);
-            cudaThreadSynchronize (); 
-            cudaCheckError (); 
-
-            k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N>>>
-                ( spad_rvec + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, blocks);
-            cudaThreadSynchronize (); 
-            cudaCheckError (); 
-        }
-
-        //do the post processing for the atomic forces here
-        ker_total_forces_postprocess  <<< blocks, DEF_BLOCK_SIZE >>>
-            (system->d_my_atoms, *(*dev_lists + BONDS), *dev_workspace, system->N);
-        cudaThreadSynchronize (); 
-        cudaCheckError (); 
-    }
-
-    CUDA_GLOBAL void ker_total_forces_pure (reax_atom *my_atoms, int n, 
-            storage p_workspace)
-    {
-        int i = blockIdx.x * blockDim.x + threadIdx.x;
-        if (i >= n) return;
-
-        storage *workspace = &( p_workspace );
-
-        rvec_Copy (my_atoms[i].f, workspace->f[i]);
-    }
-
-    void Cuda_Total_Forces_PURE (reax_system *system, storage *workspace)
-    {
-        int blocks;
-
-        blocks = system->n / DEF_BLOCK_SIZE + 
-            ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-        ker_total_forces_pure <<< blocks, DEF_BLOCK_SIZE >>>
-            ( system->d_my_atoms, system->n, *dev_workspace);
-        cudaThreadSynchronize (); 
-        cudaCheckError (); 
-    }
diff --git a/PG-PuReMD/src/cuda_copy.cu b/PG-PuReMD/src/cuda_copy.cu
deleted file mode 100644
index 3636497da9c1abb6f68d2d911c469fb18f9e97ea..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_copy.cu
+++ /dev/null
@@ -1,222 +0,0 @@
-#include "cuda_copy.h"
-
-//#include "list.h"
-#include "cuda_utils.h"
-#include "vector.h"
-
-//#ifdef __cplusplus
-//extern "C"  {  
-//#endif
-
-extern "C" int  Make_List( int, int, int, reax_list*);
-extern "C" void Delete_List( reax_list*);
-
-void Sync_Grid (grid *host, grid *device)
-{
-    int total;
-    grid_cell local_cell;
-    total = host->ncells[0] * host->ncells[1] * host->ncells[2];
-
-    ivec_Copy (device->ncells, host->ncells);
-    rvec_Copy (device->cell_len, host->cell_len);
-    rvec_Copy (device->inv_len, host->inv_len);
-
-    ivec_Copy (device->bond_span, host->bond_span );
-    ivec_Copy (device->nonb_span, host->nonb_span );
-    ivec_Copy (device->vlist_span, host->vlist_span );
-
-    ivec_Copy (device->native_cells, host->native_cells );
-    ivec_Copy (device->native_str, host->native_str );
-    ivec_Copy (device->native_end, host->native_end );
-
-    device->ghost_cut = host->ghost_cut;
-    ivec_Copy (device->ghost_span, host->ghost_span );
-    ivec_Copy (device->ghost_nonb_span, host->ghost_nonb_span );
-    ivec_Copy (device->ghost_hbond_span, host->ghost_hbond_span );
-    ivec_Copy (device->ghost_bond_span, host->ghost_bond_span );
-
-    copy_host_device (host->str, device->str, sizeof (int) * total, cudaMemcpyHostToDevice, "grid:str");
-    copy_host_device (host->end, device->end, sizeof (int) * total, cudaMemcpyHostToDevice, "grid:end");
-    copy_host_device (host->cutoff, device->cutoff, sizeof (real) * total, cudaMemcpyHostToDevice, "grid:cutoff");
-    copy_host_device (host->nbrs_x, device->nbrs_x, sizeof (ivec) * total * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_x");
-    copy_host_device (host->nbrs_cp, device->nbrs_cp, sizeof (rvec) * total * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_cp");
-
-    copy_host_device (host->rel_box, device->rel_box, sizeof (ivec) * total, cudaMemcpyHostToDevice, "grid:rel_box");
-
-    device->max_nbrs = host->max_nbrs;
-
-    /*
-       for (int i = 0; i < total; i++) {
-
-       copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-cuda_copy");
-
-    //fprintf (stderr, " Atoms address %ld (%d) \n", local_cell.atoms, host->max_atoms );
-    //cuda_memset (local_cell.atoms, 0, sizeof (int) * host->max_atoms, "grid:cell:atoms-memset");
-    //fprintf (stderr, "host native atoms -> %d %d \n", host->native_str[0], host->native_end[0]);
-    //fprintf (stderr, "host atoms -> %d \n", host->cells[i].atoms[i]);
-    //fprintf (stderr, "Host Max atoms : %d \n", host->max_atoms ); 
-    //copy_host_device (host->cells[i].atoms, 
-    //        (local_cell.atoms), sizeof (int) * host->max_atoms, cudaMemcpyHostToDevice, "grid:cell:atoms");
-
-    ////////////////////////////////////////////
-    //No need to copy atoms from the cells from host to device. 
-    // str and end has positions in the d_my_atoms list, which are just indexes into this list
-    // this index is used in the cuda_neighbors to compute the neighbors. 
-    // This is the only place where atoms is used. 
-    ////////////////////////////////////////////////
-
-    //fprintf (stderr, " cells:nbrs_x %ld \n", local_cell.nbrs_x);
-    copy_host_device (host->cells[i].nbrs_x, 
-    local_cell.nbrs_x, sizeof (ivec) * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_x");
-
-    //fprintf (stderr, " Atoms address %ld \n", local_cell.nbrs_cp);
-    copy_host_device (host->cells[i].nbrs_cp, 
-    local_cell.nbrs_cp, sizeof (rvec) * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_cp");
-
-    //no need to copy pointers for device->cells[i].nbrs. 
-    // we can extract the pointer by nbrs_x (ivec) into the cells array. 
-    // This makes nbrs member redundant on the device
-
-    local_cell.cutoff = host->cells[i].cutoff;
-    rvec_Copy (local_cell.min, host->cells[i].min);
-    rvec_Copy (local_cell.max, host->cells[i].max);
-    ivec_Copy (local_cell.rel_box, host->cells[i].rel_box);
-
-    local_cell.mark = host->cells[i].mark;
-    local_cell.type = host->cells[i].type;
-    local_cell.str = host->cells[i].str;
-    local_cell.end = host->cells[i].end;
-    local_cell.top = host->cells[i].top;
-
-    copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), 
-    cudaMemcpyHostToDevice, "grid:cell-cuda_copy");
-    }
-     */
-}
-
-void Sync_Atoms (reax_system *sys)
-{
-    //TODO
-    //TODO METIN FIX, coredump on his machine
-    //TODO
-    //TODO
-    //copy_host_device (sys->my_atoms, sys->d_my_atoms, sizeof (reax_atom) * sys->total_cap, cudaMemcpyHostToDevice, "system:my_atoms");
-#if defined(__CUDA_DEBUG_LOG__)
-    fprintf (stderr, "p:%d - Synching atoms: n: %d N: %d, total_cap: %d \n", 
-            sys->my_rank, sys->n, sys->N, sys->total_cap);
-#endif
-
-    copy_host_device (sys->my_atoms, sys->d_my_atoms, sizeof (reax_atom) * sys->N, cudaMemcpyHostToDevice, "system:my_atoms");
-    //TODO
-    //TODO METIN FIX, coredump on his machine
-    //TODO
-    //TODO
-}
-
-void Sync_System (reax_system *sys)
-{
-    //fprintf (stderr, "p:%d - trying to copy atoms : %d \n", sys->my_rank, sys->local_cap);
-    Sync_Atoms (sys);
-
-    copy_host_device (&(sys->my_box), sys->d_my_box, 
-            sizeof (simulation_box), cudaMemcpyHostToDevice, "system:my_box");
-
-    copy_host_device (&(sys->my_ext_box), sys->d_my_ext_box, 
-            sizeof (simulation_box), cudaMemcpyHostToDevice, "system:my_ext_box");
-
-    copy_host_device (sys->reax_param.sbp, sys->reax_param.d_sbp, 
-            sizeof (single_body_parameters) * sys->reax_param.num_atom_types, cudaMemcpyHostToDevice, "system:sbp");
-    copy_host_device (sys->reax_param.tbp, sys->reax_param.d_tbp, 
-            sizeof (two_body_parameters) * pow (sys->reax_param.num_atom_types, 2), cudaMemcpyHostToDevice, "system:tbp");
-    copy_host_device (sys->reax_param.thbp, sys->reax_param.d_thbp, 
-            sizeof (three_body_header) * pow (sys->reax_param.num_atom_types, 3), cudaMemcpyHostToDevice, "system:thbh");
-    copy_host_device (sys->reax_param.hbp, sys->reax_param.d_hbp, 
-            sizeof (hbond_parameters) * pow (sys->reax_param.num_atom_types, 3), cudaMemcpyHostToDevice, "system:hbond");
-    copy_host_device (sys->reax_param.fbp, sys->reax_param.d_fbp, 
-            sizeof (four_body_header) * pow (sys->reax_param.num_atom_types, 4), cudaMemcpyHostToDevice, "system:four_header");
-
-    copy_host_device (sys->reax_param.gp.l, sys->reax_param.d_gp.l, 
-            sizeof (real) * sys->reax_param.gp.n_global, cudaMemcpyHostToDevice, "system:global_parameters");
-
-    sys->reax_param.d_gp.n_global = sys->reax_param.gp.n_global; 
-    sys->reax_param.d_gp.vdw_type = sys->reax_param.gp.vdw_type; 
-}
-
-void Output_Sync_Atoms (reax_system *sys)
-{
-    //TODO changed this from sys->n to sys->N
-    copy_host_device (sys->my_atoms, sys->d_my_atoms, 
-            sizeof (reax_atom) * sys->total_cap, cudaMemcpyDeviceToHost, "system:my_atoms");
-}
-
-void Output_Sync_Simulation_Data (simulation_data *host, simulation_data *dev)
-{
-    copy_host_device (&host->my_en, &dev->my_en, sizeof (energy_data), 
-            cudaMemcpyDeviceToHost, "simulation_data:energy_data");
-    copy_host_device (&host->kin_press, &dev->kin_press, sizeof (real), 
-            cudaMemcpyDeviceToHost, "simulation_data:kin_press");
-    copy_host_device (host->int_press, dev->int_press, sizeof (rvec), 
-            cudaMemcpyDeviceToHost, "simulation_data:int_press");
-    copy_host_device (host->ext_press, dev->ext_press, sizeof (rvec), 
-            cudaMemcpyDeviceToHost, "simulation_data:ext_press");
-}
-
-void Sync_Workspace (storage *workspace, enum cudaMemcpyKind dir)
-{
-}
-
-void Sync_Matrix (sparse_matrix *L, sparse_matrix *U, enum cudaMemcpyKind dir )
-{
-}
-
-void Sync_Output_Controls (output_controls *, control_params *, enum cudaMemcpyKind)
-{
-}
-
-void Sync_Control (control_params *host, control_params *device, enum cudaMemcpyKind dir)
-{
-}
-
-
-void Prep_Device_For_Output (reax_system *system, simulation_data *data )
-{
-}
-
-void Output_Sync_Lists (reax_list *host, reax_list *device, int type)
-{
-    //fprintf (stderr, " Trying to copy *%d* list from device to host \n", type);
-
-    //list is already allocated -- discard it first
-    //if (host->n > 0)
-    //if (host->allocated > 0)
-    //  Delete_List (host);
-
-    //memory is allocated on the host
-    //Make_List(device->n, device->num_intrs, type, host);
-
-    //memcpy the entries from device to host
-    copy_host_device (host->index, device->index, sizeof (int) * device->n, cudaMemcpyDeviceToHost, "output_sync_list:list:index");
-    copy_host_device (host->end_index, device->end_index, sizeof (int) * device->n, cudaMemcpyDeviceToHost, "output_sync:list:end_index");
-
-    switch (type)
-    {   
-        case TYP_BOND:
-            copy_host_device (host->select.bond_list, device->select.bond_list, 
-                    sizeof (bond_data) * device->num_intrs, cudaMemcpyDeviceToHost, "bond_list");
-            break;
-
-        case TYP_THREE_BODY:
-            copy_host_device (host->select.three_body_list, device->select.three_body_list, 
-                    sizeof (three_body_interaction_data )* device->num_intrs, cudaMemcpyDeviceToHost, "three_body_list");
-            break;
-
-        default:
-            fprintf (stderr, "Unknown list synching from device to host ---- > %d \n", type );
-            exit (1);
-            break;
-    }  
-}
-
-//#ifdef __cplusplus
-//}
-//#endif
diff --git a/PG-PuReMD/src/cuda_copy.h b/PG-PuReMD/src/cuda_copy.h
deleted file mode 100644
index bea549659000edf26ce897fa7a02ece29befcf63..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_copy.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef __CUDA_COPY_H_
-#define __CUDA_COPY_H_
-
-#include "reax_types.h"
-
-#ifdef __cplusplus
-extern "C"  {
-#endif
-
-void Sync_Atoms (reax_system *);
-void Sync_Grid (grid *, grid *);
-void Sync_System (reax_system *);
-void Sync_Control (control_params *, control_params *, enum cudaMemcpyKind);
-void Sync_Matrix (sparse_matrix *, sparse_matrix *, enum cudaMemcpyKind);
-void Sync_Output_Control (output_controls *, enum cudaMemcpyKind);
-void Sync_Workspace (storage *workspace, enum cudaMemcpyKind);
-
-void Prep_Device_For_Output (reax_system *, simulation_data *);
-void Output_Sync_Lists (reax_list *host, reax_list *device, int type );
-void Output_Sync_Atoms (reax_system *);
-void Output_Sync_Simulation_Data (simulation_data *, simulation_data *);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/cuda_forces.cu b/PG-PuReMD/src/cuda_forces.cu
deleted file mode 100644
index cec6c0ebc4b350a3c4b4e4ec33d327993ee09949..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_forces.cu
+++ /dev/null
@@ -1,1675 +0,0 @@
-
-#include "cuda_forces.h"
-
-#include "reax_types.h"
-#include "reax_types.h"
-#include "dev_list.h"
-#include "list.h"
-#include "cuda_utils.h"
-#include "cuda_helpers.h"
-#include "index_utils.h"
-#include "vector.h"
-
-#include "cuda_neighbors.h"
-
-#include "forces.h"
-#include "cuda_bond_orders.h"
-#include "reduction.h"
-#include "cuda_bonds.h"
-#include "cuda_multi_body.h"
-#include "cuda_valence_angles.h"
-#include "cuda_torsion_angles.h"
-#include "cuda_hydrogen_bonds.h"
-#include "tool_box.h"
-#include "cuda_nonbonded.h"
-
-
-//extern "C" real Get_Time( );
-//extern "C" real Get_Timing_Info( real );
-extern "C" int  Make_List( int, int, int, reax_list*);
-extern "C" void Delete_List( reax_list*);
-
-
-CUDA_GLOBAL void ker_estimate_storages (reax_atom *my_atoms, 
-        single_body_parameters *sbp, 
-        two_body_parameters *tbp,
-        control_params *control,
-        reax_list far_nbrs, 
-        int num_atom_types,
-        int n, int N, 
-        int Hcap, int total_cap,
-        int *Htop, int *num_3body,
-        int *bond_top, int *hb_top
-        )
-{
-    int i, j, pj; 
-    int start_i, end_i;
-    int type_i, type_j;
-    int ihb, jhb;
-    int local;
-    real cutoff;
-    real r_ij, r2; 
-    real C12, C34, C56;
-    real BO, BO_s, BO_pi, BO_pi2;
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
-    reax_atom *atom_i, *atom_j;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N)
-    {
-        return;
-    }
-
-    //Commented in CUDA_KERNEL
-    //for( i = 0; i < N; ++i ) { 
-    atom_i = &(my_atoms[i]);
-    type_i  = atom_i->type;
-    start_i = Dev_Start_Index(i, &far_nbrs);
-    end_i   = Dev_End_Index(i, &far_nbrs);
-    sbp_i = &(sbp[type_i]);
-
-    if( i < n )
-    { 
-        local = 1;
-        cutoff = control->nonb_cut;
-        //++(*Htop);
-        atomicAdd (Htop, 1);
-        ihb = sbp_i->p_hbond;
-    }   
-    else
-    {
-        local = 0;
-        cutoff = control->bond_cut;
-        ihb = -1; 
-    } 
-
-    for( pj = start_i; pj < end_i; ++pj )
-    { 
-        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-        j = nbr_pj->nbr;
-        atom_j = &(my_atoms[j]);
-
-        if (nbr_pj->d <= control->nonb_cut)
-        {
-            type_j = my_atoms[j].type;
-            sbp_j = &(sbp[type_j]);
-            ihb = sbp_i->p_hbond;
-            jhb = sbp_j->p_hbond;
-            if ((control->hbond_cut > 0.1) 
-                    && (nbr_pj->d <= control->hbond_cut) 
-                    && (ihb == 2) 
-                    && (jhb == 1) 
-                    && (j < n)
-                    && (i > n)
-               )
-            {
-                atomicAdd (&hb_top [i], 1);
-            }
-
-            if (i >= n)
-            {
-                ihb = -1;
-            }
-        }
-
-
-
-        if(nbr_pj->d <= cutoff)
-        {
-            type_j = my_atoms[j].type;
-            r_ij = nbr_pj->d;
-            sbp_j = &(sbp[type_j]);
-            twbp = &(tbp[index_tbp (type_i,type_j,num_atom_types)]);
-
-            if( local )
-            {
-                //if( j < n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
-                if( j < n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
-                {
-                    //++(*Htop);
-                    atomicAdd (Htop, 1);
-                }
-                else if( j < n || atom_i->orig_id > atom_j->orig_id ) //tryQEq ||1
-                {
-                    //++(*Htop);
-                    atomicAdd (Htop, 1);
-                }
-
-                if( control->hbond_cut > 0.1 && (ihb==1 || ihb==2) &&
-                        nbr_pj->d <= control->hbond_cut 
-                  )
-                {
-                    jhb = sbp_j->p_hbond;
-                    if( (ihb == 1) && (jhb == 2))
-                    {
-                        //++hb_top[i];
-                        atomicAdd (&hb_top[i], 1);
-                    }
-                    //else if( j < n && ihb == 2 && jhb == 1 )
-                    //else if( ihb == 2 && jhb == 1 && j < n)
-                    else if( ihb == 2 && jhb == 1 && j < n)
-                    {
-                        //++hb_top[j];
-                        atomicAdd (&hb_top[i], 1);
-                    }
-                }
-            }
-
-            // uncorrected bond orders 
-            if( nbr_pj->d <= control->bond_cut ) {
-                r2 = SQR(r_ij);
-
-                if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-                    C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-                    BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-                }
-                else BO_s = C12 = 0.0;
-
-                if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-                    C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-                    BO_pi = EXP( C34 );
-                }
-                else BO_pi = C34 = 0.0;
-
-                if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-                    C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
-                    BO_pi2= EXP( C56 );
-                }
-                else BO_pi2 = C56 = 0.0;
-
-                // Initially BO values are the uncorrected ones, page 1 
-                BO = BO_s + BO_pi + BO_pi2;
-
-                if( BO >= control->bo_cut ) {
-                    //++bond_top[i];
-                    //++bond_top[j];
-                    atomicAdd (&bond_top [i], 1);
-                    //atomicAdd (&bond_top [j], 1);
-                }
-            }
-        }
-    }
-    //} -- Commented in CUDA_KERNEL
-}
-
-
-CUDA_GLOBAL void ker_init_system_atoms(reax_atom *my_atoms, int N, 
-        int *hb_top, int *bond_top)
-{
-    int i;
-    
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N)
-    {
-        return;
-    }
-
-    my_atoms[i].num_bonds = bond_top [i];
-    my_atoms[i].num_hbonds = hb_top [i];
-}
-
-
-void Cuda_Estimate_Storages(reax_system *system, control_params *control, 
-        reax_list **lists, int local_cap, int total_cap,
-        int *Htop, int *hb_top, 
-        int *bond_top, int *num_3body)
-{
-    int i;
-    int blocks = 0;
-    int *d_Htop, *d_hb_top, *d_bond_top, *d_num_3body;
-    int * tmp = (int*) scratch;
-    int bond_count = 0;
-    int hbond_count = 0;
-    int max_bonds = 0, min_bonds = 999999;
-    int max_hbonds = 0, min_hbonds = 999999;
-
-    *Htop = 0;
-    //memset( hb_top, 0, sizeof(int) * local_cap);
-    memset( hb_top, 0, sizeof(int) * total_cap );
-    memset( bond_top, 0, sizeof(int) * total_cap );
-    *num_3body = 0;
-	
-    //cuda_memset (tmp, 0, 1 + 1 + sizeof (int) * (local_cap+ total_cap), "Cuda_Estimate_Storages");
-    cuda_memset (tmp, 0, sizeof (int) * (1 + 1 + total_cap+ total_cap), "Cuda_Estimate_Storages");
- 
-    d_Htop = tmp; 
-    d_num_3body = d_Htop + 1;
-    d_hb_top = d_num_3body + 1;
-    //d_bond_top = d_hb_top + local_cap;
-    d_bond_top = d_hb_top + total_cap;
-   
-    blocks = (int) CEIL((real)system->N / ST_BLOCK_SIZE);
-
-    ker_estimate_storages <<< blocks, ST_BLOCK_SIZE>>>
-        (system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, 
-         (control_params *)control->d_control_params, *(*dev_lists + FAR_NBRS), system->reax_param.num_atom_types,
-         system->n, system->N, system->Hcap, system->total_cap, 
-         d_Htop, d_num_3body, d_bond_top, d_hb_top );
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-
-    copy_host_device( Htop, d_Htop, sizeof (int), cudaMemcpyDeviceToHost, "Htop");
-    copy_host_device( num_3body, d_num_3body, sizeof (int), cudaMemcpyDeviceToHost, "num_3body");
-    //copy_host_device( hb_top, d_hb_top, sizeof (int) * local_cap, cudaMemcpyDeviceToHost, "hb_top");
-    copy_host_device( hb_top, d_hb_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "hb_top");
-    copy_host_device( bond_top, d_bond_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "bond_top");
-
-    for (i = 0; i < system->N; i++)
-    {
-        if (bond_top[i] >= max_bonds)
-        {
-            max_bonds = bond_top[i];
-        }
-        if (bond_top[i] <= min_bonds)
-        {
-            min_bonds = bond_top[i];
-        }
-
-        bond_count += bond_top[i];
-    }
-    system->max_bonds = max_bonds * SAFER_ZONE;
-
-    //for (int i = 0; i < system->n; i++)
-    for (i = 0; i < system->N; i++)
-    {
-        if (hb_top[i] >= max_hbonds)
-        {
-            max_hbonds = hb_top[i];
-        }
-        if (hb_top[i] <= min_hbonds)
-        {
-            min_hbonds = hb_top[i];
-        }
-
-        hbond_count += hb_top [i];
-    }
-    system->max_hbonds = max_hbonds * SAFER_ZONE;
-
-#if defined(DEBUG)
-    fprintf (stderr, " TOTAL DEVICE BOND COUNT: %d \n", bond_count);
-    fprintf (stderr, " TOTAL DEVICE HBOND COUNT: %d \n", hbond_count);
-    fprintf (stderr, " TOTAL DEVICE SPARSE COUNT: %d \n", *Htop);
-    fprintf (stderr, "p:%d --> Bonds(%d, %d) HBonds (%d, %d) *******\n", 
-            system->my_rank, min_bonds, max_bonds, min_hbonds, max_hbonds);
-#endif
-
-    ker_init_system_atoms <<<blocks, ST_BLOCK_SIZE>>>
-        (system->d_my_atoms, system->N, d_hb_top, d_bond_top );
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-}
-
-
-CUDA_DEVICE real Compute_H( real r, real gamma, real *ctap )
-{
-    real taper, dr3gamij_1, dr3gamij_3;
-
-    taper = ctap[7] * r + ctap[6];
-    taper = taper * r + ctap[5];
-    taper = taper * r + ctap[4];
-    taper = taper * r + ctap[3];
-    taper = taper * r + ctap[2];
-    taper = taper * r + ctap[1];
-    taper = taper * r + ctap[0];    
-
-    dr3gamij_1 = ( r*r*r + gamma );
-    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
-    return taper * EV_to_KCALpMOL / dr3gamij_3;
-}
-
-
-CUDA_DEVICE real Compute_tabH( LR_lookup_table *t_LR, real r_ij, int ti, int tj, int num_atom_types )
-{
-    int r, tmin, tmax;
-    real val, dif, base;
-    LR_lookup_table *t; 
-
-    tmin  = MIN( ti, tj );
-    tmax  = MAX( ti, tj );
-    t = &( t_LR[index_lr (tmin,tmax, num_atom_types)] );    
-
-    /* cubic spline interpolation */
-    r = (int)(r_ij * t->inv_dx);
-    if( r == 0 )  ++r;
-    base = (real)(r+1) * t->dx;
-    dif = r_ij - base;
-    val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-        t->ele[r].a;
-    val *= EV_to_KCALpMOL / C_ele;
-
-    return val;
-}
-
-
-CUDA_GLOBAL void ker_estimate_sparse_matrix (reax_atom *my_atoms, control_params *control, 
-        reax_list p_far_nbrs, int n, int N, int renbr, int *indices)
-{
-    int i, j, pj;
-    int start_i, end_i;
-    int flag;
-    real cutoff;
-    far_neighbor_data *nbr_pj;
-    reax_atom *atom_i, *atom_j;
-    reax_list *far_nbrs = &( p_far_nbrs );
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-
-    atom_i = &(my_atoms[i]);
-    start_i = Dev_Start_Index(i, far_nbrs);
-    end_i   = Dev_End_Index(i, far_nbrs);
-
-    cutoff = control->nonb_cut;
-
-    //++Htop;
-    if ( i < n) 
-        indices [i] ++;
-
-    /* update i-j distance - check if j is within cutoff */
-    for( pj = start_i; pj < end_i; ++pj ) {
-        nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-        j = nbr_pj->nbr;
-        atom_j = &(my_atoms[j]);
-        if( renbr ) {
-            if(nbr_pj->d <= cutoff)
-                flag = 1;
-            else flag = 0;
-        }
-        else {
-            if (i < j) {
-                nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
-                nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
-                nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
-            } else {
-                nbr_pj->dvec[0] = atom_i->x[0] - atom_j->x[0];
-                nbr_pj->dvec[1] = atom_i->x[1] - atom_j->x[1];
-                nbr_pj->dvec[2] = atom_i->x[2] - atom_j->x[2];
-            }
-            nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
-            //TODO
-            //TODO
-            //TODO
-            //if( nbr_pj->d <= (cutoff) ) {
-            if( nbr_pj->d <= SQR(cutoff) )
-            {
-                nbr_pj->d = sqrt(nbr_pj->d);
-                flag = 1;
-            }
-            else
-            {
-                flag = 0;
-            }
-        }
-
-        if( flag )
-        {
-            /* H matrix entry */
-            //if( j < n || atom_i->orig_id < atom_j->orig_id )
-            //++Htop;
-            //    indices [i] ++;
-            //else if (j < n || atom_i->orig_id > atom_j->orig_id )
-            //    indices [i] ++;
-
-            //if ((i < n) || (j < n))
-            //    indices [i] ++;
-            //if ((i < n) && (i < j) && ((j < n) || atom_i->orig_id < atom_j->orig_id))
-            //    indices [i] ++;
-            //if ( i >= n && j < n && atom_i->orig_id > atom_j->orig_id)
-            //    indices [i] ++;
-            //else if ((i >=n) && (i > j) && ((j < n) || (atom_i->orig_id > atom_j->orig_id)))
-            //    indices [i] ++;
-            //THIS IS THE HOST CONDITION
-            //if (i < n && i < j && ( j < n || atom_i->orig_id < atom_j->orig_id ))
-            //if (i < n && i < j && atom_i->orig_id < atom_j->orig_id && j >=n)
-            //    indices [i] ++;
-            //THIS IS THE DEVICE CONDITION
-            //if ( i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id)
-            //    indices [i] ++;
-
-            //this is the working condition
-            if (i < j && i < n && ( j < n || atom_i->orig_id < atom_j->orig_id))
-                indices [i]++;
-            else if (i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id)
-                indices [i] ++;
-            else if (i > j && i < n && ( j < n || atom_j->orig_id < atom_i->orig_id ))
-                indices [i] ++;
-        }
-    }
-}
-
-
-int Cuda_Estimate_Sparse_Matrix (reax_system *system, control_params *control, 
-        simulation_data *data, reax_list **lists)
-{
-    int blocks, max_sp_entries;
-    int *indices = (int *) scratch;
-    int *h_indices = (int *) host_scratch;
-    int total_sparse = 0;
-
-    cuda_memset (indices, 0, sizeof (int) * system->N, "sp_matrix:indices");
-
-    blocks = system->N / DEF_BLOCK_SIZE + 
-        ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    //TODO
-    //TODO
-    //TODO
-    //TODO
-    ker_estimate_sparse_matrix  <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, (control_params *)control->d_control_params, 
-         *(*dev_lists + FAR_NBRS), system->n, system->N, 
-         (((data->step-data->prev_steps) % control->reneighbor) == 0), indices);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    copy_host_device (h_indices, indices, sizeof (int) * system->N, 
-            cudaMemcpyDeviceToHost, "sp_matrix:indices");
-    max_sp_entries = 0;    
-    for (int i = 0; i < system->N; i++){
-        total_sparse += h_indices [i];
-        if (max_sp_entries < h_indices[i])
-            max_sp_entries = h_indices[i];
-    }
-
-    //fprintf (stderr, " TOTAL DEVICE SPARSE ENTRIES: %d \n", total_sparse );
-    //fprintf (stderr, "p%d: Max sparse entries -> %d \n", system->my_rank, max_sp_entries );
-    system->max_sparse_entries = max_sp_entries * SAFE_ZONE;
-
-    return SUCCESS;
-}
-
-
-CUDA_GLOBAL void ker_init_forces (reax_atom *my_atoms, single_body_parameters *sbp, 
-        two_body_parameters *tbp, storage workspace, 
-        control_params *control, 
-        reax_list far_nbrs, reax_list bonds, reax_list hbonds, 
-        LR_lookup_table *t_LR, 
-        int n, int N, int num_atom_types, 
-        int max_sparse_entries, int renbr, 
-        int max_bonds, int max_hbonds)
-{
-    int i, j, pj;
-    int start_i, end_i;
-    int type_i, type_j;
-    int Htop;
-    int btop_i, ihb, jhb, ihb_top;
-    //int btop_j, jhb, jhb_top;
-    int local, flag, flag2, flag3;
-    real r_ij, cutoff;
-    //reax_list *far_nbrs, *bonds, *hbonds;
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
-    reax_atom *atom_i, *atom_j;
-    sparse_matrix *H = &(workspace.H);
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-
-    Htop = i * max_sparse_entries;
-    btop_i = 0;
-
-    //Commented for CUDA KERNEL
-    //for( i = 0; i < system->N; ++i ) {
-    atom_i = &(my_atoms[i]);
-    type_i  = atom_i->type;
-    start_i = Dev_Start_Index(i, &far_nbrs);
-    end_i   = Dev_End_Index(i, &far_nbrs);
-    //CHANGE ORIGINAL
-    //btop_i = Dev_Start_Index( i, &bonds );
-    btop_i = i * max_bonds;
-    Dev_Set_Start_Index (i, btop_i, &bonds);
-    //CHANGE ORIGINAL
-
-    sbp_i = &(sbp[type_i]);
-
-    if( i < n ) {
-        local = 1;
-        cutoff = control->nonb_cut;
-
-        //update bond mark here
-        workspace.bond_mark [i] = 0;
-
-    }
-    else {
-        local = 0;
-        cutoff = control->bond_cut;
-
-        //update bond mark here
-        workspace.bond_mark [i] = 1000;
-    }
-
-    ihb = -1;
-    ihb_top = -1;
-    //CHANGE ORIGINAL
-    H->start[i] = Htop;
-
-    if( local ) {
-        H->entries[Htop].j = i;
-        H->entries[Htop].val = sbp_i->eta;
-        ++Htop;
-    }
-    //CHANGE ORIGINAL
-
-    if( control->hbond_cut > 0 ) {
-        ihb = sbp_i->p_hbond;
-        //CHANGE ORIGINAL
-        if( ihb == 1  || ihb == 2) {
-            //CHANGE ORIGINAL
-            //ihb_top = Dev_Start_Index( atom_i->Hindex, &hbonds );
-            ihb_top = i * max_hbonds;
-            Dev_Set_Start_Index (atom_i->Hindex, ihb_top, &hbonds );
-        }
-        else ihb_top = -1;
-    }
-
-    /* update i-j distance - check if j is within cutoff */
-    for( pj = start_i; pj < end_i; ++pj ) {
-        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-        j = nbr_pj->nbr;
-        atom_j = &(my_atoms[j]);
-        if( renbr ) {
-            if(nbr_pj->d <= cutoff)
-                flag = 1;
-            else flag = 0;
-
-            if(nbr_pj->d <= control->nonb_cut)
-                flag2 = 1;
-            else flag2 = 0;
-
-        }
-        else{
-            if (i < j) {
-                nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
-                nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
-                nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
-                nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
-            } else {
-                nbr_pj->dvec[0] = atom_i->x[0] - atom_j->x[0];
-                nbr_pj->dvec[1] = atom_i->x[1] - atom_j->x[1];
-                nbr_pj->dvec[2] = atom_i->x[2] - atom_j->x[2];
-                nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
-            }
-
-            if(nbr_pj->d <= SQR (control->nonb_cut))
-                flag2 = 1;
-            else flag2 = 0;
-
-            //if( nbr_pj->d <= SQR(cutoff) ) {
-            if( nbr_pj->d <= SQR(control->nonb_cut) ) {
-                nbr_pj->d = sqrt(nbr_pj->d);
-                flag = 1;
-            }
-            else {
-                flag = 0;
-            }
-        }
-        if (flag2) {
-            ihb = sbp_i->p_hbond;
-            type_j = atom_j->type;
-            sbp_j = &(sbp[type_j]);
-            jhb = sbp_j->p_hbond;
-            if( control->hbond_cut > 0 
-                    && nbr_pj->d <= control->hbond_cut
-                    && (ihb == 2)
-                    && (jhb == 1)
-                    && (i >= n)
-                    && (j < n)
-              ) 
-            {
-                hbonds.select.hbond_list[ihb_top].nbr = j;
-                hbonds.select.hbond_list[ihb_top].scl = -1;
-                hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-
-                //CUDA SPECIFIC
-                hbonds.select.hbond_list[ihb_top].sym_index = -1;
-                rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f);
-
-                ++ihb_top;
-            }
-
-            //if ((i < n) || (j < n))
-            //if (local || ((i >= n) &&(j < n)))
-
-            flag3 = false;
-            if (i < j && i < n && ( j < n || atom_i->orig_id < atom_j->orig_id))
-                flag3 = true;
-            else if (i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id)
-                flag3 = true;
-            else if (i > j && i < n && ( j < n || atom_j->orig_id < atom_i->orig_id ))
-                flag3 = true;
-
-            if (flag3)
-            {
-                twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types)]);
-                r_ij = nbr_pj->d;
-
-                //if (renbr) {
-                H->entries[Htop].j = j;
-                if( control->tabulate == 0 )
-                    H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap);
-                else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types);
-                //}
-                ++Htop;
-            }
-        }
-
-        if( flag ){
-            type_j = atom_j->type;
-            r_ij = nbr_pj->d;
-            sbp_j = &(sbp[type_j]);
-            twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types)]);
-
-            if( local ) {
-                /* H matrix entry */
-                /*
-                   if( j < n || atom_i->orig_id < atom_j->orig_id ) {//tryQEq||1
-                   H->entries[Htop].j = j;
-                   if( control->tabulate == 0 )
-                   H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap);
-                   else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types);
-                   ++Htop;
-                   } 
-                   else if( j < n || atom_i->orig_id > atom_j->orig_id ) {//tryQEq||1
-                   H->entries[Htop].j = j;
-                   if( control->tabulate == 0 )
-                   H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap);
-                   else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types);
-                   ++Htop;
-                   } 
-                 */
-
-                //bool condition = !((i >= n) && (j >= n));
-                /* hydrogen bond lists */
-                if( control->hbond_cut > 0 && (ihb==1 || ihb==2) &&
-                        nbr_pj->d <= control->hbond_cut // && i < j
-                  ) {
-                    jhb = sbp_j->p_hbond;
-                    if( ihb == 1 && jhb == 2 ) {
-                        hbonds.select.hbond_list[ihb_top].nbr = j;
-                        if (i < j) 
-                            hbonds.select.hbond_list[ihb_top].scl = 1;
-                        else
-                            hbonds.select.hbond_list[ihb_top].scl = -1;
-                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-
-                        //CUDA SPECIFIC
-                        hbonds.select.hbond_list[ihb_top].sym_index = -1;
-                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f);
-
-
-                        ++ihb_top;
-                    }
-                    //else if( j < n && ihb == 2 && jhb == 1 ) 
-                    else if( ihb == 2 && jhb == 1 && j < n) {
-                        //jhb_top = End_Index( atom_j->Hindex, hbonds );
-                        hbonds.select.hbond_list[ihb_top].nbr = j;
-                        hbonds.select.hbond_list[ihb_top].scl = -1;
-                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-
-                        //CUDA SPECIFIC
-                        hbonds.select.hbond_list[ihb_top].sym_index = -1;
-                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f);
-
-                        ++ihb_top;
-
-                        //Set_End_Index( atom_j->Hindex, jhb_top+1, hbonds );
-                        //++num_hbonds;
-                    }
-                }
-            }
-
-
-
-            /* uncorrected bond orders */
-            if( nbr_pj->d <= control->bond_cut 
-                    && Dev_BOp( bonds, control->bo_cut, 
-                        i , btop_i, nbr_pj, sbp_i, sbp_j, twbp, 
-                        workspace.dDeltap_self, workspace.total_bond_order) 
-              ) {
-                //num_bonds += 2;
-                ++btop_i;
-
-                /* Need to do later... since i and j are parallel
-                   if( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
-                   workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
-                   else if( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 ) {
-                   workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
-                   }
-                 */
-            }
-        }
-        }
-
-        Dev_Set_End_Index( i, btop_i, &bonds );
-        //    if( local ) {
-        H->end[i] = Htop;
-        //   }
-        //CHANGE ORIGINAL
-        if(( ihb == 1 || ihb == 2 ) && (ihb_top > 0) && (control->hbond_cut > 0))
-            Dev_Set_End_Index( atom_i->Hindex, ihb_top, &hbonds );
-        //} Commented for cuda kernel
-}
-
-
-CUDA_GLOBAL void ker_init_bond_mark (int offset, int n, int *bond_mark)
-{
-    int i;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= n) return;
-
-    bond_mark [offset + threadIdx.x] = 1000;
-}
-
-
-CUDA_GLOBAL void New_fix_sym_dbond_indices (reax_list pbonds, int N)
-{
-    int i, nbr;
-    bond_data *ibond, *jbond;
-    int atom_j;
-
-    reax_list *bonds = &pbonds;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-
-    for (int j = Dev_Start_Index (i, bonds); j < Dev_End_Index (i, bonds); j++)
-    {
-        ibond = &( bonds->select.bond_list [j] );
-        nbr = ibond->nbr;
-
-        for (int k = Dev_Start_Index (nbr, bonds); k < Dev_End_Index (nbr, bonds); k ++)
-        {
-            jbond = &( bonds->select.bond_list[ k ] );
-            atom_j = jbond->nbr;
-
-            if ( (atom_j == i) )
-            {
-                if (i > nbr) {
-                    ibond->dbond_index = j;
-                    jbond->dbond_index = j;
-
-                    ibond->sym_index = k;
-                    jbond->sym_index = j;
-                }
-            }
-        }
-    }
-}
-
-
-CUDA_GLOBAL void New_fix_sym_hbond_indices (reax_atom *my_atoms, reax_list hbonds, int N )
-{
-
-    hbond_data *ihbond, *jhbond;
-
-    int __THREADS_PER_ATOM__ = HB_KER_SYM_THREADS_PER_ATOM;
-    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    int warp_id = thread_id / __THREADS_PER_ATOM__;
-    int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1);
-    int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
-
-    if (warp_id > N) return;
-
-    int i = warp_id;
-    int nbr;
-    int k;
-    int start = Dev_Start_Index (my_atoms[i].Hindex, &hbonds);
-    int end = Dev_End_Index (my_atoms[i].Hindex, &hbonds);
-    int j = start + lane_id;
-    while (j < end)
-    {
-        ihbond = &( hbonds.select.hbond_list [j] );
-        nbr = ihbond->nbr;
-
-        int nbrstart = Dev_Start_Index (my_atoms[nbr].Hindex, &hbonds);
-        int nbrend = Dev_End_Index (my_atoms[nbr].Hindex, &hbonds);
-
-        for (k = nbrstart; k < nbrend; k++)
-        {
-            jhbond = &( hbonds.select.hbond_list [k] );
-
-            if (jhbond->nbr == i){
-                ihbond->sym_index = k;
-                jhbond->sym_index = j;
-                break;
-            }
-        }
-
-        j += __THREADS_PER_ATOM__;
-    }
-}
-
-
-////////////////////////
-// HBOND ISSUE
-CUDA_GLOBAL void ker_update_bonds (reax_atom *my_atoms, 
-        reax_list bonds, 
-        int n)
-{
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= n) return;
-
-    my_atoms [i].num_bonds = 
-        MAX(Dev_Num_Entries(i, &bonds) * 2, MIN_BONDS);
-}
-
-
-CUDA_GLOBAL void ker_update_hbonds (reax_atom *my_atoms, 
-        reax_list hbonds,
-        int n)
-{
-    int Hindex;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= n) return;
-
-    Hindex = my_atoms[i].Hindex;
-    my_atoms [i].num_hbonds = 
-        MAX(Dev_Num_Entries(Hindex, &hbonds) * SAFER_ZONE, MIN_HBONDS);
-}
-////////////////////////
-////////////////////////
-////////////////////////
-
-
-int Cuda_Validate_Lists (reax_system *system, storage *workspace, reax_list **lists, control_params *control, 
-        int step, int n, int N, int numH )
-{
-    int blocks;
-    int i, comp, Hindex;
-    int *index, *end_index;
-    reax_list *bonds, *hbonds;
-    reax_atom *my_atoms;
-    reallocate_data *realloc;
-    realloc = &( dev_workspace->realloc);
-
-    int max_sp_entries, num_hbonds, num_bonds;
-    int total_sp_entries;
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    ker_update_bonds <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, *(*lists + BONDS), 
-         system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    ////////////////////////
-    // HBOND ISSUE
-    //FIX - 4 - Added this check for hydrogen bond issue
-    if ((control->hbond_cut > 0) && (system->numH > 0)){
-        ker_update_hbonds <<< blocks, DEF_BLOCK_SIZE >>>
-            (system->d_my_atoms, *(*lists + HBONDS), 
-             system->n);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-    }
-
-    //validate sparse matrix entries.
-    memset (host_scratch, 0, 2 * system->N * sizeof (int));    
-    index = (int *) host_scratch;
-    end_index = index + system->N;
-    copy_host_device (index, dev_workspace->H.start, system->N * sizeof (int), 
-            cudaMemcpyDeviceToHost, "sparse_matrix:start" );
-    copy_host_device (end_index, dev_workspace->H.end, system->N * sizeof (int), 
-            cudaMemcpyDeviceToHost, "sparse_matrix:end" );
-    max_sp_entries = total_sp_entries = 0;
-    for (i = 0; i < N; i++ ){
-        //if (i < N-1)
-        //    comp = index [i+1];
-        //else
-        //    comp = dev_workspace->H.m;
-
-        total_sp_entries += end_index [i] - index[i];
-        if (end_index [i] - index[i] > system->max_sparse_entries) {
-            fprintf( stderr, "step%d-sparsemat-chk failed: i=%d start(i)=%d end(i)=%d \n",
-                    step, i, index[i], end_index[i] );
-            return FAILURE;
-        } else if (end_index[i] >= dev_workspace->H.m) {
-            //SUDHIR_FIX_SPARSE_MATRIX
-            //TODO move this carver
-            //TODO move this carver
-            //TODO move this carver
-            fprintf (stderr, "p:%d - step%d-sparsemat-chk failed (exceed limits): i=%d start(i)=%d end(i)=%d \n", 
-                    system->my_rank, step, i, index[i], end_index[i]);    
-            //TODO move this carver
-            //TODO move this carver
-            //TODO move this carver
-            return FAILURE;
-        } else {
-            if (max_sp_entries <= end_index[i] - index [i])
-                max_sp_entries = end_index[i] - index [i];
-        }
-    }
-    //if (max_sp_entries <= end_index[i] - index [i])
-    //    max_sp_entries = end_index[i] - index [i];
-
-    //update the current step max_sp_entries;
-    realloc->Htop = max_sp_entries;
-
-#if defined(DEBUG)
-    fprintf (stderr, "p:%d - Cuda_Reallocate: Total H matrix entries: %d, cap: %d, used: %d \n", 
-            system->my_rank, dev_workspace->H.n, dev_workspace->H.m, total_sp_entries);
-#endif
-
-    if (total_sp_entries >= dev_workspace->H.m) {
-        fprintf (stderr, "p:%d - **ran out of space for sparse matrix: step: %d, allocated: %d, used: %d \n", 
-                system->my_rank, step, dev_workspace->H.m, total_sp_entries);
-
-        return FAILURE;
-    }
-
-
-    //validate Bond list
-    if (N > 0) {
-        num_bonds = 0;
-
-        bonds = *lists + BONDS;
-        memset (host_scratch, 0, 2 * bonds->n * sizeof (int));    
-        index = (int *) host_scratch;
-        end_index = index + bonds->n;
-
-        copy_host_device (index, bonds->index, bonds->n * sizeof (int), 
-                cudaMemcpyDeviceToHost, "bonds:index");
-        copy_host_device (end_index, bonds->end_index, bonds->n * sizeof (int), 
-                cudaMemcpyDeviceToHost, "bonds:end_index");
-
-        /*
-           for (i = 0; i < N; i++) {
-           if (i < N-1)
-           comp = index [i+1];
-           else
-           comp = bonds->num_intrs;
-
-           if (end_index [i] > comp) {
-           fprintf( stderr, "step%d-bondchk failed: i=%d start(i)=%d end(i)=%d str(i+1)=%d\n",
-           step, i, index[i], end_index[i], comp );
-           return FAILURE;
-           }
-
-           num_bonds += MAX( (end_index[i] - index[i]) * 4, MIN_BONDS);
-           }
-
-           if (end_index[N-1] >= bonds->num_intrs) {
-           fprintf( stderr, "step%d-bondchk failed(end): i=N-1 start(i)=%d end(i)=%d num_intrs=%d\n",
-           step, index[N-1], end_index[N-1], bonds->num_intrs);
-           return FAILURE;
-           }
-           num_bonds = MAX( num_bonds, MIN_CAP*MIN_BONDS );
-        //check the condition for reallocation
-        realloc->num_bonds = num_bonds;
-         */
-
-        int max_bonds = 0;
-        for (i = 0; i < N; i++) {
-            if (end_index[i] - index[i] >= system->max_bonds) {
-                fprintf( stderr, "step%d-bondchk failed: i=%d start(i)=%d end(i)=%d max_bonds=%d\n",
-                        step, i, index[i], end_index[i], system->max_bonds);
-                return FAILURE;
-            }
-            if (end_index[i] - index[i] >= max_bonds)
-                max_bonds = end_index[i] - index[i];
-        }
-        realloc->num_bonds = max_bonds;
-
-    }
-
-    //validate Hbonds list
-    num_hbonds = 0;
-    // FIX - 4 - added additional check here
-    if ((numH > 0) && (control->hbond_cut > 0)) {
-        hbonds = *lists + HBONDS;
-        memset (host_scratch, 0, 2 * hbonds->n * sizeof (int) + sizeof (reax_atom) * system->N);    
-        index = (int *) host_scratch;
-        end_index = index + hbonds->n;
-        my_atoms = (reax_atom *)(end_index + hbonds->n);
-
-        copy_host_device (index, hbonds->index, hbonds->n * sizeof (int), 
-                cudaMemcpyDeviceToHost, "hbonds:index");
-        copy_host_device (end_index, hbonds->end_index, hbonds->n * sizeof (int), 
-                cudaMemcpyDeviceToHost, "hbonds:end_index");
-        copy_host_device (my_atoms, system->d_my_atoms, system->N * sizeof (reax_atom), 
-                cudaMemcpyDeviceToHost, "system:d_my_atoms");
-
-        //fprintf (stderr, " Total local atoms: %d \n", n);
-
-        /*
-           for (i = 0; i < N-1; i++) {
-           Hindex = my_atoms [i].Hindex;
-           if (Hindex > -1) 
-           comp = index [Hindex + 1];
-           else
-           comp = hbonds->num_intrs;
-
-           if (end_index [Hindex] > comp) {
-           fprintf(stderr,"step%d-atom:%d hbondchk failed: H=%d start(H)=%d end(H)=%d str(H+1)=%d\n",
-           step, i, Hindex, index[Hindex], end_index[Hindex], comp );
-           return FAILURE;
-           }
-
-           num_hbonds += MAX( (end_index [Hindex] - index [Hindex]) * 2, MIN_HBONDS * 2);
-           }
-           if (end_index [my_atoms[i].Hindex] > hbonds->num_intrs) {
-           fprintf(stderr,"step%d-atom:%d hbondchk failed: H=%d start(H)=%d end(H)=%d num_intrs=%d\n",
-           step, i, Hindex, index[Hindex], end_index[Hindex], hbonds->num_intrs);
-           return FAILURE;
-           }
-
-           num_hbonds += MIN( (end_index [my_atoms[i].Hindex] - index [my_atoms[i].Hindex]) * 2, 
-           2 * MIN_HBONDS);
-           num_hbonds = MAX( num_hbonds, MIN_CAP*MIN_HBONDS );
-           realloc->num_hbonds = num_hbonds;
-         */
-
-        int max_hbonds = 0;
-        for (i = 0; i < N; i++) {
-            if (end_index[i] - index[i] >= system->max_hbonds) {
-                fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d end(i)=%d max_hbonds=%d\n",
-                        step, i, index[i], end_index[i], system->max_hbonds);
-                return FAILURE;
-            }
-            if (end_index[i] - index[i] >= max_hbonds)
-                max_hbonds = end_index[i] - index[i];
-        }
-        realloc->num_hbonds = max_hbonds;
-    }
-
-    return SUCCESS;
-}
-
-
-CUDA_GLOBAL void ker_init_bond_orders (reax_atom *my_atoms, 
-        reax_list far_nbrs, 
-        reax_list bonds, 
-        real *total_bond_order, 
-        int N)
-{
-    int i, j, pj; 
-    int start_i, end_i;
-    int type_i, type_j;
-    far_neighbor_data *nbr_pj;
-    reax_atom *atom_i, *atom_j;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-
-    atom_i = &(my_atoms[i]);
-    start_i = Dev_Start_Index(i, &far_nbrs);
-    end_i   = Dev_End_Index(i, &far_nbrs);
-
-    for( pj = start_i; pj < end_i; ++pj ) { 
-        // nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-        // j = nbr_pj->nbr;
-        // atom_j = &(my_atoms[j]);
-
-        //total_bond_order [i] ++;
-        //atom_i->Hindex ++;
-    }
-}
-
-
-CUDA_GLOBAL void ker_bond_mark (reax_list p_bonds, storage p_workspace, int N)
-{
-    reax_list *bonds = &( p_bonds );
-    storage *workspace = &( p_workspace );
-    int j;
-
-    //int i = blockIdx.x * blockDim.x + threadIdx.x;
-    //if (i >= N) return;
-
-    for (int i = 0; i < N; i++) 
-        for (int k = Dev_Start_Index (i, bonds); k < Dev_End_Index (i, bonds); k++)
-        {
-            bond_data *bdata = &( bonds->select.bond_list [k] );
-            j = bdata->nbr;
-
-            if (i < j ) {
-                if ( workspace->bond_mark [j] > (workspace->bond_mark [i] + 1) )
-                    workspace->bond_mark [j] = workspace->bond_mark [i] + 1;    
-                else if ( workspace->bond_mark [i] > (workspace->bond_mark [j] + 1) )
-                    workspace->bond_mark [i] = workspace->bond_mark [j] + 1;
-            }
-        }
-}
-
-
-int Cuda_Init_Forces( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace,
-        reax_list **lists, output_controls *out_control ) 
-{
-    int init_blocks;
-    int hblocks;
-
-    //init the workspace (bond_mark)
-    /*
-       int blocks;
-       cuda_memset (dev_workspace->bond_mark, 0, sizeof (int) * system->n, "bond_mark");
-
-       blocks = (system->N - system->n) / DEF_BLOCK_SIZE + 
-       (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-       ker_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
-       (system->n, (system->N - system->n), dev_workspace->bond_mark);
-       cudaThreadSynchronize ();
-       cudaCheckError ();
-     */
-    //validate total_bond_orders
-
-    //main kernel
-    init_blocks = (system->N) / DEF_BLOCK_SIZE + 
-        (((system->N % DEF_BLOCK_SIZE) == 0) ? 0 : 1);
-    //fprintf (stderr, " Total atoms: %d, blocks: %d \n", system->N, init_blocks );
-
-    //    ker_init_bond_orders <<<init_blocks, DEF_BLOCK_SIZE >>>
-    //            ( system->d_my_atoms, *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), 
-    //                dev_workspace->total_bond_order, system->N);
-    //    cudaThreadSynchronize ();
-    //    cudaCheckError ();
-    //    fprintf (stderr, " DONE WITH VALIDATION \n");
-
-    ker_init_forces <<<init_blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, system->reax_param.d_sbp, 
-         system->reax_param.d_tbp, *dev_workspace, 
-         (control_params *)control->d_control_params, 
-         *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), *(*dev_lists + HBONDS), 
-         d_LR, system->n, system->N, system->reax_param.num_atom_types, 
-         //system->max_sparse_entries, ((data->step-data->prev_steps) % control->reneighbor));
-        system->max_sparse_entries, (((data->step-data->prev_steps) % control->reneighbor) == 0), 
-        system->max_bonds, system->max_hbonds);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-
-    //fix - sym_index and dbond_index
-    New_fix_sym_dbond_indices <<<init_blocks, BLOCK_SIZE>>> 
-        (*(*dev_lists + BONDS), system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    ///////////////////////
-    ///////////////////////
-    // FIX - 4 - HBOND ISSUE
-    if ((control->hbond_cut > 0 ) && (system->numH > 0))
-    {
-        //make hbond_list symmetric
-        hblocks = (system->N * HB_KER_SYM_THREADS_PER_ATOM) / HB_SYM_BLOCK_SIZE + 
-            ((((system->N * HB_KER_SYM_THREADS_PER_ATOM) % HB_SYM_BLOCK_SIZE) == 0) ? 0 : 1);
-        //New_fix_sym_hbond_indices <<<hblocks, HB_BLOCK_SIZE >>> 
-        New_fix_sym_hbond_indices <<<hblocks, HB_BLOCK_SIZE >>> 
-            (system->d_my_atoms, *(*dev_lists + HBONDS), system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-    }
-
-    //update bond_mark
-    //ker_bond_mark <<< init_blocks, DEF_BLOCK_SIZE>>>
-    /*
-       ker_bond_mark <<< 1, 1>>>
-       ( *(*dev_lists + BONDS), *dev_workspace, system->N);
-       cudaThreadSynchronize ();
-       cudaCheckError ();
-     */
-
-    //TODO
-    //1. update the sparse matrix count for reallocation
-    //2. update the bonds count for reallocation
-    //3. update the hydrogen bonds count for reallocation
-
-    //Validate lists here.
-    return Cuda_Validate_Lists (system, workspace, dev_lists, control, 
-            data->step, system->n, system->N, system->numH );
-}
-
-
-int Cuda_Init_Forces_noQEq( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace,
-        reax_list **lists, output_controls *out_control ) 
-{
-    //TODO Implement later
-    // when you figure out the bond_mark usage.
-
-    return FAILURE;
-}
-
-
-int Cuda_Compute_Bonded_Forces (reax_system *system, control_params *control, 
-        simulation_data *data, storage *workspace, 
-        reax_list **lists, output_controls *out_control )
-{
-    real t_start, t_elapsed;
-    real *spad = (real *) scratch;
-    rvec *rvec_spad;
-
-    //1. Bond Order Interactions. - bond_orders.c
-    t_start = Get_Time( );
-    //fprintf (stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS_N, BLOCK_SIZE);
-    Cuda_Calculate_BO_init  <<< BLOCKS_N, BLOCK_SIZE >>>
-        ( system->d_my_atoms, system->reax_param.d_sbp, 
-          *dev_workspace, 
-          system->N );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    Cuda_Calculate_BO <<< BLOCKS_N, BLOCK_SIZE >>>
-        ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
-          system->reax_param.d_tbp, *dev_workspace, 
-          *(*dev_lists + BONDS),
-          system->reax_param.num_atom_types, system->N );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-
-    Cuda_Update_Uncorrected_BO <<<BLOCKS_N, BLOCK_SIZE>>>
-        (*dev_workspace, *(*dev_lists + BONDS), system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    Cuda_Update_Workspace_After_BO <<<BLOCKS_N, BLOCK_SIZE>>>
-        (system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
-         *dev_workspace, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    t_elapsed = Get_Timing_Info( t_start );
-    //fprintf (stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-    //fprintf (stderr, "Cuda_Calculate_Bond_Orders Done... \n");
-
-    //2. Bond Energy Interactions. - bonds.c
-    t_start = Get_Time( );
-    cuda_memset (spad, 0, system->N * ( 2 * sizeof (real)) , "scratch");
-
-    Cuda_Bonds <<< BLOCKS, BLOCK_SIZE, sizeof (real)* BLOCK_SIZE >>>
-        ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp,
-          *dev_workspace, *(*dev_lists + BONDS), 
-          system->n, system->reax_param.num_atom_types, spad );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //Reduction for E_BE
-    k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
-        (spad, spad + system->n,  system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2>>> 
-        (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_bond, BLOCKS_POW_2);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    t_elapsed = Get_Timing_Info( t_start );
-    //fprintf (stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-    //fprintf (stderr, "Cuda_Bond_Energy Done... \n");
-
-
-    //3. Atom Energy Interactions. 
-    t_start = Get_Time( );
-    cuda_memset (spad, 0, ( 6 * sizeof (real) * system->n ), "scratch");
-
-    Cuda_Atom_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_my_atoms, system->reax_param.d_gp, 
-            system->reax_param.d_sbp, system->reax_param.d_tbp, 
-            *dev_workspace, 
-            *(*dev_lists + BONDS), system->n, system->reax_param.num_atom_types, 
-            spad, spad + 2 * system->n, spad + 4*system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //CHANGE ORIGINAL
-    //Cuda_Atom_Energy_PostProcess     <<<BLOCKS, BLOCK_SIZE >>>
-    //                    ( *(*dev_lists + BONDS), *dev_workspace, system->n );
-    Cuda_Atom_Energy_PostProcess     <<<BLOCKS_N, BLOCK_SIZE >>>
-        ( *(*dev_lists + BONDS), *dev_workspace, system->N );
-    //CHANGE ORIGINAL
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //Reduction for E_Lp
-    k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
-        (spad, spad + system->n,  system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>  
-        (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_lp, BLOCKS);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //Reduction for E_Ov
-    k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
-        (spad + 2*system->n, spad + 3*system->n,  system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>  
-        (spad + 3*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_ov, BLOCKS);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //Reduction for E_Un
-    k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
-        (spad + 4*system->n, spad + 5*system->n,  system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>  
-        (spad + 5*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_un, BLOCKS);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    t_elapsed = Get_Timing_Info( t_start );
-    //fprintf (stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-    //fprintf (stderr, "test_LonePair_postprocess Done... \n");
-
-
-    //4. Valence Angles Interactions. 
-    t_start = Get_Time( );
-
-    //THREE BODY CHANGES HERE
-    cuda_memset(spad, 0, (*dev_lists + BONDS)->num_intrs * sizeof (int), "scratch");
-    Estimate_Cuda_Valence_Angles <<<BLOCKS_N, BLOCK_SIZE>>>
-        (system->d_my_atoms, 
-         (control_params *)control->d_control_params, 
-         *(*dev_lists + BONDS),
-         system->n, system->N, (int *)spad);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-
-    int *thbody = (int *) host_scratch;
-    memset (thbody, 0, sizeof (int) * (*dev_lists + BONDS)->num_intrs);
-    copy_host_device (thbody, spad, (*dev_lists + BONDS)->num_intrs * sizeof (int), cudaMemcpyDeviceToHost, "thb:offsets");
-
-    int total_3body = thbody [0] * SAFE_ZONE;
-    for (int x = 1; x < (*dev_lists + BONDS)->num_intrs; x++) {
-        total_3body += thbody [x]*SAFE_ZONE;
-        thbody [x] += thbody [x-1];
-    }
-
-    system->num_thbodies = thbody [(*dev_lists+BONDS)->num_intrs-1];
-    if (!system->init_thblist) 
-    {
-        system->init_thblist = true;
-        if(!Dev_Make_List((*dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, (*dev_lists + THREE_BODIES))) {
-            fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
-            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        }    
-        if(!Make_List((*dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, (*lists + THREE_BODIES))) {
-            fprintf( stderr, "Problem in initializing three-body list on host. Terminating!\n" );
-            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        }    
-#ifdef __CUDA_MEM__
-        fprintf (stderr, "Device memory allocated: three body list = %d MB\n", 
-                sizeof (three_body_interaction_data) * total_3body / (1024*1024));
-#endif
-    } else {
-        //if (((dev_workspace->realloc.num_bonds * DANGER_ZONE) >= (*dev_lists+BONDS)->num_intrs) || 
-        //        (system->num_thbodies > (*dev_lists+THREE_BODIES)->num_intrs )) { 
-        //int size = dev_workspace->realloc.num_bonds;
-        if ((system->num_thbodies >= (*dev_lists+THREE_BODIES)->num_intrs ) || 
-                ((*dev_lists+THREE_BODIES)->n < (*dev_lists+BONDS)->num_intrs) ) {
-
-            int size = (*dev_lists + BONDS)->num_intrs;
-
-            /*Delete Three-body list*/
-            Dev_Delete_List( *dev_lists + THREE_BODIES );
-            Delete_List ( *lists + THREE_BODIES );
-
-            fprintf (stderr, "p%d ***** Reallocating the Three-body list threebody.n: %d, bonds.num_intrs: %d, num_thb: %d, thb_entries: %d \n", 
-                    system->my_rank, (*dev_lists+THREE_BODIES)->n, (*dev_lists+BONDS)->num_intrs, 
-                    system->num_thbodies, (*dev_lists+THREE_BODIES)->num_intrs);
-#ifdef __CUDA_MEM__
-            fprintf (stderr, "Reallocating Three-body list: step: %d n - %d num_intrs - %d used: %d \n", 
-                    data->step, dev_workspace->realloc.num_bonds, total_3body, system->num_thbodies);
-#endif
-            /*Recreate Three-body list */
-            if(!Dev_Make_List(size, total_3body, TYP_THREE_BODY, *dev_lists + THREE_BODIES )) {
-                fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
-                MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-            }
-            if(!Make_List(size, total_3body, TYP_THREE_BODY, *lists + THREE_BODIES )) {
-                fprintf( stderr, "Problem in initializing three-body list on host. Terminating!\n" );
-                MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-            }
-        }
-    }
-
-    //copy the indexes into the thb list;
-    copy_host_device (thbody, ((*dev_lists + THREE_BODIES)->index + 1), sizeof (int) * ((*dev_lists+BONDS)->num_intrs - 1),
-            cudaMemcpyHostToDevice, "thb:index");
-    copy_host_device (thbody, ((*dev_lists + THREE_BODIES)->end_index + 1), sizeof (int) * ((*dev_lists+BONDS)->num_intrs - 1),
-            cudaMemcpyHostToDevice, "thb:end_index");
-    //THREE_BODY CHANGES HERE
-
-
-    cuda_memset (spad, 0, ( 6 * sizeof (real) * system->N + sizeof (rvec) * system->N * 2), "scratch");
-    Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>>
-        ( system->d_my_atoms,
-          system->reax_param.d_gp, 
-          system->reax_param.d_sbp, system->reax_param.d_thbp, 
-          (control_params *)control->d_control_params,
-          *dev_workspace, 
-          *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES),
-          system->n, system->N, system->reax_param.num_atom_types, 
-          spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N));
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //Reduction for E_Ang
-    k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
-        (spad, spad + system->N,  system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>>
-        (spad + system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_ang, BLOCKS_N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //Reduction for E_Pen
-    k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-        (spad + 2*system->N, spad + 3*system->N,  system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>>
-        (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_pen, BLOCKS_N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //Reduction for E_Coa
-    k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-        (spad + 4*system->N, spad + 5*system->N,  system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>>
-        (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_coa, BLOCKS_N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //Reduction for ext_pres
-    rvec_spad = (rvec *) (spad + 6*system->N);
-    k_reduction_rvec <<<BLOCKS_N, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>>
-        (rvec_spad, rvec_spad + system->N,  system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction_rvec <<<1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N >>>
-        (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS_N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    Cuda_Valence_Angles_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>>
-        (  system->d_my_atoms,
-           (control_params *)control->d_control_params,
-           *dev_workspace,
-           *(*dev_lists + BONDS),
-           system->N );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    t_elapsed = Get_Timing_Info( t_start );
-    //fprintf (stderr, "Three_Body_Interactions ...  Timing %lf \n", t_elapsed );
-    //fprintf (stderr, "Three_Body_Interactions Done... \n");
-
-
-    //5. Torsion Angles Interactions. 
-    t_start = Get_Time( );
-    cuda_memset (spad, 0, ( 4 * sizeof (real) * system->n + sizeof (rvec) * system->n * 2), "scratch");
-    Cuda_Torsion_Angles <<< BLOCKS, BLOCK_SIZE >>>
-        ( system->d_my_atoms,
-          system->reax_param.d_gp,
-          system->reax_param.d_fbp,
-          (control_params *)control->d_control_params,
-          *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES),
-          *dev_workspace,
-          system->n, system->reax_param.num_atom_types, 
-          spad, spad + 2*system->n, (rvec *) (spad + 4*system->n));
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //Reduction for E_Tor
-    k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-        (spad, spad + system->n,  system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
-        (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_tor, BLOCKS);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //Reduction for E_Con
-    k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-        (spad + 2*system->n, spad + 3*system->n,  system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
-        (spad + 3*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_con, BLOCKS);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //Reduction for ext_pres
-    rvec_spad = (rvec *) (spad + 4*system->n);
-    k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>>
-        (rvec_spad, rvec_spad + system->n,  system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof (rvec) * BLOCKS_POW_2 >>>
-        (rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //Post process here
-    Cuda_Torsion_Angles_PostProcess   <<< BLOCKS_N, BLOCK_SIZE >>>
-        (  system->d_my_atoms,
-           *dev_workspace,
-           *(*dev_lists + BONDS),
-           system->N );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    t_elapsed = Get_Timing_Info( t_start );
-    //fprintf (stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError (), t_elapsed );
-    //fprintf (stderr, " Four_Body_ Done... \n");
-
-
-    //6. Hydrogen Bonds Interactions.
-    // FIX - 4 - Added additional check here
-    if ((control->hbond_cut > 0) && (system->numH > 0)) {
-
-        t_start = Get_Time( );
-        cuda_memset (spad, 0, ( 2 * sizeof (real) * system->n + sizeof (rvec) * system->n * 2 ), "scratch");
-
-
-        int hbs = ((system->n * HB_KER_THREADS_PER_ATOM)/ HB_BLOCK_SIZE) + 
-            (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1);
-        Cuda_Hydrogen_Bonds_MT <<<hbs, HB_BLOCK_SIZE, 
-                               HB_BLOCK_SIZE * (2 * sizeof (real) + 2 * sizeof (rvec)) >>>
-                                   //Cuda_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE>>>
-                                   (  system->d_my_atoms,
-                                      system->reax_param.d_sbp,
-                                      system->reax_param.d_hbp,
-                                      system->reax_param.d_gp,
-                                      (control_params *)control->d_control_params,
-                                      *dev_workspace,
-                                      *(*dev_lists + BONDS), *(*dev_lists + HBONDS),
-                                      system->n, system->reax_param.num_atom_types,
-                                      spad, (rvec *) (spad + 2*system->n));
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-
-        //Reduction for E_HB
-        k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-            (spad, spad + system->n,  system->n);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-
-        k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
-            (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_hb, BLOCKS);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-
-
-        //Reduction for ext_pres
-        rvec_spad = (rvec *) (spad + 2*system->n);
-        k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>>
-            (rvec_spad, rvec_spad + system->n,  system->n);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-
-        k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof (rvec) * BLOCKS_POW_2 >>>
-            (rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-
-        ////post process step1:
-        Cuda_Hydrogen_Bonds_PostProcess <<< BLOCKS_N, BLOCK_SIZE, BLOCK_SIZE * sizeof (rvec) >>>
-            (  system->d_my_atoms,
-               *dev_workspace,
-               *(*dev_lists + BONDS),
-               system->N );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-
-        ////post process step2:
-        /*
-           Cuda_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * sizeof (rvec)>>>
-           (  system->d_my_atoms,
-         *dev_workspace,
-         *(*dev_lists + HBONDS));
-         */
-        int hnbrs_bl = ((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM)/ HB_POST_PROC_BLOCK_SIZE) + 
-            (((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1);
-        Cuda_Hydrogen_Bonds_HNbrs_BL <<< hnbrs_bl, HB_POST_PROC_BLOCK_SIZE, 
-                                     HB_POST_PROC_BLOCK_SIZE * sizeof (rvec)>>>
-                                         (  system->d_my_atoms,
-                                            *dev_workspace,
-                                            *(*dev_lists + HBONDS), system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-
-        t_elapsed = Get_Timing_Info( t_start );
-        //fprintf (stderr, "Hydrogen bonds return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError (), t_elapsed );
-        //fprintf (stderr, "Hydrogen_Bond Done... \n");    
-    }
-
-    return SUCCESS;
-}
-
-
-void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, 
-        simulation_data *data, storage *workspace, 
-        reax_list **lists, output_controls *out_control,
-        mpi_datatypes *mpi_data )
-{
-    /* van der Waals and Coulomb interactions */
-    Cuda_NonBonded_Energy( system, control, workspace, data,
-            lists, out_control, (control->tabulate == 0) ? false: true);
-}
diff --git a/PG-PuReMD/src/cuda_forces.h b/PG-PuReMD/src/cuda_forces.h
deleted file mode 100644
index edf9bb1e87c3f419233e8db06645d0aec77f92bb..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_forces.h
+++ /dev/null
@@ -1,40 +0,0 @@
-
-#ifndef __CUDA_FORCES_H__
-#define __CUDA_FORCES_H__
-
-#include "reax_types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void Cuda_Estimate_Storages (reax_system *, control_params *, reax_list **, int, int,
-                             int *, int *, int *, int *);
-
-int Cuda_Estimate_Sparse_Matrix (reax_system *, control_params *,
-                                 simulation_data *, reax_list **);
-
-int Cuda_Init_Forces( reax_system *, control_params *, simulation_data *,
-                      storage *, reax_list **, output_controls *);
-
-int Cuda_Init_Forces_noQEq( reax_system *, control_params *, simulation_data *,
-                            storage *, reax_list **, output_controls *);
-
-int Cuda_Validate_Lists (reax_system *, storage *, reax_list **, control_params *,
-                         int, int, int, int );
-
-int Cuda_Compute_Bonded_Forces (reax_system *, control_params *, simulation_data *,
-                                storage *, reax_list **, output_controls *);
-
-
-void Cuda_Compute_NonBonded_Forces( reax_system *, control_params *,
-                                    simulation_data *, storage *,
-                                    reax_list **, output_controls *,
-                                    mpi_datatypes *);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/cuda_helpers.h b/PG-PuReMD/src/cuda_helpers.h
deleted file mode 100644
index 0d6282a842e40e377d8c8ef11f7c52a0ae8f904d..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_helpers.h
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef __CUDA_HELPERS__
-#define __CUDA_HELPERS__
-
-#include "reax_types.h"
-
-
-CUDA_DEVICE inline int cuda_strcmp (char *a, char *b, int len)
-{
-    char *src, *dst;
-
-    src = a;
-    dst = b;
-
-    for (int i = 0; i < len; i++)
-    {
-
-        if (*dst == '\0')
-            return 0;
-
-        if (*src != *dst)  return 1;
-
-        src ++;
-        dst ++;
-    }
-
-    return 0;
-}
-
-
-CUDA_DEVICE inline real atomicAdd(real* address, real val)
-{
-    unsigned long long int* address_as_ull =
-        (unsigned long long int*)address;
-    unsigned long long int old = *address_as_ull, assumed;
-    do
-    {
-        assumed = old;
-        old = atomicCAS(address_as_ull, assumed,
-                        __double_as_longlong(val + __longlong_as_double(assumed)));
-    }
-    while (assumed != old);
-
-    return __longlong_as_double(old);
-}
-
-
-CUDA_DEVICE inline void atomic_rvecAdd( rvec ret, rvec v )
-{
-    atomicAdd ( &ret[0], v[0] );
-    atomicAdd ( &ret[1], v[1] );
-    atomicAdd ( &ret[2], v[2] );
-}
-
-
-CUDA_DEVICE inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
-{
-    atomicAdd ( &ret[0], c * v[0] );
-    atomicAdd ( &ret[1], c * v[1] );
-    atomicAdd ( &ret[2], c * v[2] );
-}
-
-#endif
diff --git a/PG-PuReMD/src/cuda_hydrogen_bonds.h b/PG-PuReMD/src/cuda_hydrogen_bonds.h
deleted file mode 100644
index 7e1644f19c82d520fd34c036b9d7a2906e97b80f..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_hydrogen_bonds.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#ifndef __HBONDS_H_
-#define __HBONDS_H_
-
-#include "reax_types.h"
-#include "reax_types.h"
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs (  reax_atom *,
-        storage ,
-        reax_list );
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL (  reax_atom *,
-        storage ,
-        reax_list, int );
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess (  reax_atom *,
-        storage ,
-        reax_list , int );
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *,
-                                      single_body_parameters *,
-                                      hbond_parameters *,
-                                      global_parameters ,
-                                      control_params *,
-                                      storage ,
-                                      reax_list ,
-                                      reax_list ,
-                                      int ,
-                                      int ,
-                                      real *,
-                                      rvec *);
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *,
-        single_body_parameters *,
-        hbond_parameters *,
-        global_parameters ,
-        control_params *,
-        storage ,
-        reax_list ,
-        reax_list ,
-        int ,
-        int ,
-        real *,
-        rvec *);
-
-#endif
diff --git a/PG-PuReMD/src/cuda_init_md.cu b/PG-PuReMD/src/cuda_init_md.cu
deleted file mode 100644
index 827a63a311d7e62b3a0e1a96a9c2b9f9a275c320..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_init_md.cu
+++ /dev/null
@@ -1,12 +0,0 @@
-
-#include "cuda_init_md.h"
-
-#include "reax_types.h"
-#include "cuda_utils.h"
-
-void Cuda_Init_ScratchArea ()
-{
-    cuda_malloc ((void **)& scratch, SCRATCH_SIZE, 1, "Device:Scratch");
-
-    host_scratch = (void *)malloc (HOST_SCRATCH_SIZE );
-}
diff --git a/PG-PuReMD/src/cuda_init_md.h b/PG-PuReMD/src/cuda_init_md.h
deleted file mode 100644
index 1a2fd6e437568608e8beae15f62d18904934cf1d..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_init_md.h
+++ /dev/null
@@ -1,15 +0,0 @@
-
-#ifndef __CUDA_INIT_MD_H__
-#define __CUDA_INIT_MD_H__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void Cuda_Init_ScratchArea ();
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/cuda_integrate.cu b/PG-PuReMD/src/cuda_integrate.cu
deleted file mode 100644
index 7f042ce97b0eaf9ce0b6e5245f501773a043a845..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_integrate.cu
+++ /dev/null
@@ -1,97 +0,0 @@
-
-#include "cuda_integrate.h"
-#include "reax_types.h"
-
-#include "vector.h"
-#include "cuda_utils.h"
-
-CUDA_GLOBAL void ker_update_velocity_1 (reax_atom *my_atoms, 
-        single_body_parameters *sbp, 
-        real dt,
-        int n)
-{
-    real inv_m;
-    rvec dx;
-    reax_atom *atom;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= n ) return;
-
-    /* velocity verlet, 1st part */
-    //for( i = 0; i < system->n; i++ ) { 
-    atom = &(my_atoms[i]);
-    inv_m = 1.0 / sbp[atom->type].mass;
-    /* Compute x(t + dt) */
-    rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
-    rvec_Add( atom->x, dx );
-    /* Compute v(t + dt/2) */
-    rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
-    //}
-}
-
-void bNVT_update_velocity_part1 (reax_system *system, real dt)
-{
-    int blocks;
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    ker_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-}
-
-CUDA_GLOBAL void ker_update_velocity_2 (reax_atom *my_atoms, 
-        single_body_parameters *sbp, 
-        real dt,
-        int n)
-{
-    reax_atom *atom;
-    real inv_m;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= n ) return;
-
-    /* velocity verlet, 2nd part */
-    //for( i = 0; i < system->n; i++ ) { 
-    atom = &(my_atoms[i]);
-    inv_m = 1.0 / sbp[atom->type].mass;
-    /* Compute v(t + dt) */
-    rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
-    //}
-}
-
-void bNVT_update_velocity_part2 (reax_system *system, real dt)
-{
-    int blocks;
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    ker_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-}
-
-CUDA_GLOBAL void ker_scale_velocities (reax_atom *my_atoms, real lambda, int n)
-{
-    reax_atom *atom;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= n ) return;
-
-    /* Scale velocities and positions at t+dt */
-    //for( i = 0; i < system->n; ++i ) {
-    atom = &(my_atoms[i]);
-    rvec_Scale( atom->v, lambda, atom->v );
-    //}
-}
-
-void bNVT_scale_velocities (reax_system *system, real lambda)
-{
-    int blocks;
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    ker_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, lambda, system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-}
diff --git a/PG-PuReMD/src/cuda_linear_solvers.cu b/PG-PuReMD/src/cuda_linear_solvers.cu
deleted file mode 100644
index 3887ef769be0741ca7dc99c0e49c832c65a17721..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_linear_solvers.cu
+++ /dev/null
@@ -1,361 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "cuda_linear_solvers.h"
-
-#include "reax_types.h"
-#include "cuda_utils.h"
-#include "reduction.h"
-#include "dual_matvec.h"
-#include "matvec.h"
-
-
-void get_from_device(real *host, real *device, unsigned int bytes, const char *msg)
-{
-    copy_host_device( host, device, bytes, cudaMemcpyDeviceToHost, msg );
-}
-
-
-void put_on_device(real *host, real *device, unsigned int bytes, const char *msg)
-{
-    copy_host_device( host, device, bytes, cudaMemcpyHostToDevice, msg );
-}
-
-
-void Cuda_Vector_Sum(real *res, real a, real *x, real b, real *y, int count)
-{
-    //res = ax + by
-    //use the cublas here
-    int blocks;
-
-    blocks = (count / DEF_BLOCK_SIZE) + 
-        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>>
-        ( res, a, x, b, y, count );
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-}
-
-
-void Cuda_CG_Preconditioner(real *res, real *a, real *b, int count)
-{
-    //res = a*b - vector multiplication
-    //use the cublas here.
-    int blocks;
-
-    blocks = (count / DEF_BLOCK_SIZE) + 
-        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>>
-        ( res, a, b, count );
-
-    cudaThreadSynchronize();
-}
-
-
-CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n)
-{
-    storage *workspace = &( p_workspace );
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (j >= n)
-    {
-        return;
-    }
-
-    //for( j = 0; j < system->n; ++j ) {
-    // residual 
-    workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
-    workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
-
-    // apply diagonal pre-conditioner
-    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; 
-    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; 
-    //}
-}
-
-
-void Cuda_CG_Diagonal_Preconditioner(storage *workspace, rvec2 *b, int n)
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
-        (*workspace, b, n);
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-}
-
-
-CUDA_GLOBAL void k_dual_cg_preconditioner(storage p_workspace, rvec2 *x, 
-        real alpha_0, real alpha_1, int n, rvec2 *my_dot)
-{
-    storage *workspace = &( p_workspace );
-    rvec2 alpha;
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-
-    alpha[0] = alpha_0;
-    alpha[1] = alpha_1;
-
-    if (j >= n)
-    {
-        return;
-    }
-
-    my_dot[j][0] = my_dot[j][1] = 0.0;
-
-    //for( j = 0; j < system->n; ++j ) {
-    // update x 
-    x[j][0] += alpha[0] * workspace->d2[j][0];
-    x[j][1] += alpha[1] * workspace->d2[j][1];      
-
-    // update residual 
-    workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; 
-    workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; 
-
-    // apply diagonal pre-conditioner 
-    workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-    workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-
-    // dot product: r.p 
-    my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0];
-    my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1];
-    //}
-}
-
-
-void Cuda_DualCG_Preconditioner(storage *workspace, rvec2 *x, rvec2 alpha,
-        int n, rvec2 result)
-{
-    int blocks;
-    rvec2 *tmp = (rvec2 *) scratch;
-
-    cuda_memset( tmp, 0, sizeof (rvec2) * ( 2 * n + 1),
-            "cuda_dualcg_preconditioner" );
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
-        (*workspace, x, alpha[0], alpha[1], n, tmp);
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-
-    //Reduction to calculate my_dot
-    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
-        ( tmp, tmp + n, n);
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-
-    k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
-        ( tmp + n, tmp + 2*n, blocks);
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-
-    copy_host_device( result, (tmp + 2*n), sizeof(rvec2),
-            cudaMemcpyDeviceToHost, "my_dot" );
-}
-
-
-void Cuda_Norm(rvec2 *arr, int n, rvec2 result)
-{
-    int blocks;
-    rvec2 *tmp = (rvec2 *) scratch;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
-        (arr, tmp, n, INITIAL);
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-
-    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>>
-        (tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-
-    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof (rvec2), 
-            cudaMemcpyDeviceToHost, "cuda_norm_rvec2" );
-}
-
-
-void Cuda_Dot(rvec2 *a, rvec2 *b, rvec2 result, int n)
-{
-    int blocks;
-    rvec2 *tmp = (rvec2 *) scratch;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
-        ( a, b, tmp, n );
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-
-    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> 
-        //k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * BLOCKS_POW_2 >>> 
-        ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-
-    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof (rvec2), 
-            cudaMemcpyDeviceToHost, "cuda_dot" );
-}
-
-
-void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n)
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> 
-        ( x, a, b[0], b[1], c, n);
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-}
-
-
-CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n)
-{
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (i >= n)
-    {
-        return;
-    }
-
-    dst[i] = src[i][index];
-}
-
-
-void Cuda_RvecCopy_From(real *dst, rvec2 *src, int index, int n)
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>>
-        ( dst, src, index, n);
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-}
-
-
-CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n)
-{
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (i >= n)
-    {
-        return;
-    }
-
-    dst[i][index] = src[i];
-}
-
-
-void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n)
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>>
-        ( dst, src, index, n);
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-}
-
-
-void Cuda_Dual_Matvec(sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size)
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
-
-    cuda_memset( b, 0, sizeof (rvec2) * size, "dual_matvec:result" );
-
-    //One thread per row implementation
-    //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>>
-    //        (*H, a, b, n);
-    //cudaThreadSynchronize ();
-    //cudaCheckError ();
-
-    //One warp per row implementation
-#if defined(__SM_35__)
-    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
-#else
-    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
-                      sizeof (rvec2) * MATVEC_BLOCK_SIZE >>>
-#endif
-                      (*H, a, b, n);
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-}
-
-
-void Cuda_Matvec(sparse_matrix *H, real *a, real *b, int n, int size)
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
-
-    cuda_memset( b, 0, sizeof (real) * size, "dual_matvec:result" );
-
-    //one thread per row implementation
-    //k_matvec <<< blocks, DEF_BLOCK_SIZE >>>
-    //        (*H, a, b, n);
-    //cudaThreadSynchronize ();
-    //cudaCheckError ();
-
-#if defined(__SM_35__)
-    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
-#else
-    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
-                 sizeof (real) * MATVEC_BLOCK_SIZE>>>
-#endif
-                     (*H, a, b, n);
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-}
diff --git a/PG-PuReMD/src/cuda_linear_solvers.h b/PG-PuReMD/src/cuda_linear_solvers.h
deleted file mode 100644
index 368674e64243589f31fe2dd1b3a06f56cc645469..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_linear_solvers.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#ifndef __CUDA_LINEAR_SOLVERS_H_
-#define __CUDA_LINEAR_SOLVERS_H_
-
-#include "reax_types.h"
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void get_from_device(real *host, real *device, unsigned int bytes, const char *);
-void put_on_device(real *host, real *device, unsigned int bytes, const char *);
-
-void Cuda_Vector_Sum(real *res, real a, real *x, real b, real *y, int count);
-void Cuda_CG_Preconditioner(real *res, real *a, real *b, int count);
-void Cuda_CG_Diagonal_Preconditioner(storage *workspace, rvec2 *b, int n);
-void Cuda_DualCG_Preconditioner(storage *workspace, rvec2 *, rvec2 alpha, int n, rvec2 result);
-void Cuda_Norm(rvec2 *arr, int n, rvec2 result);
-void Cuda_Dot(rvec2 *a, rvec2 *b, rvec2 result, int n);
-void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *, rvec2 , rvec2 *c, int n);
-void Cuda_RvecCopy_From(real *dst, rvec2 *src, int index, int n);
-void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n);
-void Cuda_Dual_Matvec(sparse_matrix *, rvec2 *, rvec2 *, int , int);
-void Cuda_Matvec(sparse_matrix *, real *, real *, int , int);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/cuda_lookup.cu b/PG-PuReMD/src/cuda_lookup.cu
deleted file mode 100644
index bad6af135bed0a9c7afc2c953f4fe0b10c4ce4f1..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_lookup.cu
+++ /dev/null
@@ -1,77 +0,0 @@
-
-#include "cuda_lookup.h"
-#include "index_utils.h"
-#include "cuda_utils.h"
-#include "reax_types.h"
-
-
-void copy_LR_table_to_device (reax_system *system, control_params *control, int *aggregated)
-{
-    int i, j, r;
-    int num_atom_types;
-    LR_data *d_y;
-    cubic_spline_coef *temp;
-
-    num_atom_types = system->reax_param.num_atom_types;
-
-    fprintf (stderr, "Copying the LR Lookyp Table to the device ... \n");
-
-    cuda_malloc ((void **) &d_LR, sizeof (LR_lookup_table) * ( num_atom_types * num_atom_types ), 0, "LR_lookup:table");
-
-    /*
-       for( i = 0; i < MAX_ATOM_TYPES; ++i )
-       existing_types[i] = 0;
-
-       for( i = 0; i < system->N; ++i )
-       existing_types[ system->atoms[i].type ] = 1;
-     */
-
-    copy_host_device ( LR, d_LR, sizeof (LR_lookup_table) * (num_atom_types * num_atom_types), 
-            cudaMemcpyHostToDevice, "LR_lookup:table");
-
-    for( i = 0; i < num_atom_types; ++i )
-        if( aggregated [i] )
-            for( j = i; j < num_atom_types; ++j )
-
-                if( aggregated [j] ) { 
-
-                    cuda_malloc ((void **) &d_y, sizeof (LR_data) * (control->tabulate + 1), 0, "LR_lookup:d_y");
-                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].y, d_y, 
-                            sizeof (LR_data) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:y");
-                    copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y, 
-                            sizeof (LR_data *), cudaMemcpyHostToDevice, "LR_lookup:y");
-
-                    cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:h");
-                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].H, temp, 
-                            sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:h");
-                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H, 
-                            sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:h");
-
-                    cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:vdW");
-                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].vdW, temp, 
-                            sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:vdW");
-                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW,
-                            sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:vdW");
-
-                    cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:CEvd");
-                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEvd, temp, 
-                            sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:CEvd");
-                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd, 
-                            sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:CDvd");
-
-                    cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:ele");
-                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].ele, temp,
-                            sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ele");
-                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele,
-                            sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ele");
-
-                    cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:ceclmb");
-                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEclmb, temp,
-                            sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ceclmb");
-                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb,
-                            sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ceclmb");
-                }
-
-    fprintf (stderr, "Copy of the LR Lookup Table to the device complete ... \n");
-}
-
diff --git a/PG-PuReMD/src/cuda_neighbors.cu b/PG-PuReMD/src/cuda_neighbors.cu
deleted file mode 100644
index e552ab6ba32683b1e7b91f2f0f1c6c7186b95a10..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_neighbors.cu
+++ /dev/null
@@ -1,713 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "cuda_neighbors.h"
-#include "dev_list.h"
-#include "vector.h"
-
-#include "index_utils.h"
-#include "reax_types.h"
-#include "cuda_utils.h"
-#include "tool_box.h"
-
-//extern "C" real Get_Time( );
-//extern "C" real Get_Timing_Info( real );
-
-CUDA_DEVICE real Dev_DistSqr_to_Special_Point( rvec cp, rvec x ) 
-{
-    int  i;  
-    real d_sqr = 0;
-
-    for( i = 0; i < 3; ++i )
-        if( cp[i] > NEG_INF )
-            d_sqr += SQR( cp[i] - x[i] );
-
-    return d_sqr;
-}
-
-
-CUDA_GLOBAL void ker_generate_neighbor_lists (    reax_atom *my_atoms, 
-        simulation_box my_ext_box,
-        grid g,
-        reax_list far_nbrs, 
-        int n, int N )
-{
-    int  i, j, k, l, m, itr, num_far;
-    real d, cutoff;
-    ivec c, nbrs_x;
-    rvec dvec;
-    grid_cell *gci, *gcj;
-    far_neighbor_data *nbr_data;//, *my_start;
-    reax_atom *atom1, *atom2;
-
-    l = blockIdx.x * blockDim.x  + threadIdx.x;
-    if (l >= N) return;
-
-    atom1 = &(my_atoms[l]);
-    num_far = Dev_Start_Index (l, &far_nbrs);
-
-    //get the coordinates of the atom and 
-    //compute the grid cell
-    /*
-       i = (int) (my_atoms[ l ].x[0] * g.inv_len[0]);
-       j = (int) (my_atoms[ l ].x[1] * g.inv_len[1]);
-       k = (int) (my_atoms[ l ].x[2] * g.inv_len[2]);
-     */
-    if (l < n) {
-        for (i = 0; i < 3; i++)
-        {
-            c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]);   
-            if( c[i] >= g.native_end[i] )
-                c[i] = g.native_end[i] - 1;
-            else if( c[i] < g.native_str[i] )
-                c[i] = g.native_str[i];
-        }
-    } else {
-        for (i = 0; i < 3; i++)
-        {
-            c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
-            if( c[i] < 0 ) c[i] = 0;
-            else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1;
-        }
-    }
-
-    i = c[0];
-    j = c[1];
-    k = c[2];
-
-    //gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] );
-    cutoff = SQR(g.cutoff[index_grid_3d (i, j, k, &g)]);
-
-    itr = 0;
-    while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { 
-
-        ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
-        //gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
-
-        if( g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
-                (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) )
-            /* pick up another atom from the neighbor cell */
-            for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
-                    m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) {
-                if(( l < m )) { // prevent recounting same pairs within a gcell 
-                    atom2 = &(my_atoms[m]);
-                    dvec[0] = atom2->x[0] - atom1->x[0];
-                    dvec[1] = atom2->x[1] - atom1->x[1];
-                    dvec[2] = atom2->x[2] - atom1->x[2];
-                    d = rvec_Norm_Sqr( dvec );
-                    if( d <= cutoff ) { 
-                        nbr_data = &(far_nbrs.select.far_nbr_list[num_far]);
-                        nbr_data->nbr = m;
-                        nbr_data->d = SQRT(d);
-                        rvec_Copy( nbr_data->dvec, dvec );
-                        //ivec_ScaledSum( nbr_data->rel_box, 1, gcj->rel_box, -1, gci->rel_box );
-                        ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
-                                -1, g.rel_box[index_grid_3d (i, j, k, &g)] );
-                        ++num_far;
-                    }
-                }
-                /*
-                   if(( l > m )) { // prevent recounting same pairs within a gcell 
-                   atom2 = &(my_atoms[m]);
-                   dvec[0] = atom1->x[0] - atom2->x[0];
-                   dvec[1] = atom1->x[1] - atom2->x[1];
-                   dvec[2] = atom1->x[2] - atom2->x[2];
-                   d = rvec_Norm_Sqr( dvec );
-                   if( d <= cutoff ) { 
-                   nbr_data = &(far_nbrs.select.far_nbr_list[num_far]);
-                   nbr_data->nbr = m;
-                   nbr_data->d = SQRT(d);
-                   rvec_Copy( nbr_data->dvec, dvec );
-                   ivec_ScaledSum( nbr_data->rel_box, 
-                   -1, gcj->rel_box, 1, gci->rel_box );
-                   ++num_far;
-                   }
-                   }   
-                 */
-            }
-        ++itr;
-    }   
-
-    itr = 0;
-    while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { 
-        ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
-        //gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
-        cutoff = SQR(g.cutoff[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]);
-
-        if( g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
-                (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) )
-            for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
-                    m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) {
-                if(( l > m )) {
-                    atom2 = &(my_atoms[m]);
-                    dvec[0] = atom1->x[0] - atom2->x[0];
-                    dvec[1] = atom1->x[1] - atom2->x[1];
-                    dvec[2] = atom1->x[2] - atom2->x[2];
-                    d = rvec_Norm_Sqr( dvec );
-                    if( d <= cutoff ) { 
-                        nbr_data = &(far_nbrs.select.far_nbr_list[num_far]);
-                        nbr_data->nbr = m;
-                        nbr_data->d = SQRT(d);
-                        rvec_Copy( nbr_data->dvec, dvec );
-                        //ivec_ScaledSum( nbr_data->rel_box, -1, gcj->rel_box, 1, gci->rel_box );
-                        ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
-                                -1, g.rel_box[index_grid_3d (i, j, k, &g)] );
-                        ++num_far;
-                    }
-                }   
-            }
-        ++itr;
-    }   
-
-    Dev_Set_End_Index( l, num_far, &far_nbrs );
-}
-
-
-CUDA_GLOBAL void ker_mt_generate_neighbor_lists (    reax_atom *my_atoms, 
-        //CUDA_GLOBAL void __launch_bounds__ (1024) ker_mt_generate_neighbor_lists (    reax_atom *my_atoms, 
-        simulation_box my_ext_box,
-        grid g,
-        reax_list far_nbrs, 
-        int n, int N )
-        {
-
-        extern __shared__ int __nbr[];
-        extern __shared__ int __sofar [];
-        bool  nbrgen;
-
-        int __THREADS_PER_ATOM__ = NB_KER_THREADS_PER_ATOM;
-
-        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-        int warp_id = thread_id / __THREADS_PER_ATOM__;
-        int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); 
-        int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
-
-        if (warp_id >= N ) return;
-
-        int *tnbr = __nbr;
-        int *nbrssofar = __nbr + blockDim.x;
-        int max, leader;
-
-        int  i, j, k, l, m, itr, num_far, ll;
-        real d, cutoff, cutoff_ji;
-        ivec c, nbrs_x;
-        rvec dvec;
-        grid_cell *gci, *gcj;
-        far_neighbor_data *nbr_data, *my_start;
-        reax_atom *atom1, *atom2;
-
-        //l = blockIdx.x * blockDim.x  + threadIdx.x;
-        //if (l >= N) return;
-
-        l = warp_id;
-
-        atom1 = &(my_atoms[l]);
-        num_far = Dev_Start_Index (l, &far_nbrs);
-
-        my_start = &( far_nbrs.select.far_nbr_list [num_far] );
-
-        //get the coordinates of the atom and 
-        //compute the grid cell
-        if (l < n) {
-            for (i = 0; i < 3; i++)
-            {
-                c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]);   
-                if( c[i] >= g.native_end[i] )
-                    c[i] = g.native_end[i] - 1;
-                else if( c[i] < g.native_str[i] )
-                    c[i] = g.native_str[i];
-            }
-        } else {
-            for (i = 0; i < 3; i++)
-            {
-                c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
-                if( c[i] < 0 ) c[i] = 0;
-                else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1;
-            }
-        }
-
-        i = c[0];
-        j = c[1];
-        k = c[2];
-
-        //gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] );
-
-
-        tnbr[threadIdx.x] = 0;
-        if (lane_id == 0) {
-            nbrssofar [my_bucket] = 0;
-        }
-        __syncthreads ();
-
-        itr = 0;
-        //while( (gci->nbrs_x[itr][0]) >= 0 ) { 
-        while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { 
-
-            tnbr[threadIdx.x] = 0;
-            nbrgen = false;
-
-            //ivec_Copy (nbrs_x, gci->nbrs_x[itr] );
-            ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
-            //gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
-
-            //cutoff = SQR(gci->cutoff);
-            cutoff = SQR (g.cutoff [index_grid_3d (i, j, k, &g)]);
-            //cutoff_ji = SQR(gcj->cutoff);
-            cutoff_ji = SQR(g.cutoff[ index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
-            //if( ((gci->str <= gcj->str) && (Dev_DistSqr_to_Special_Point(gci->nbrs_cp[itr],atom1->x)<=cutoff)) 
-            //     || ((gci->str >= gcj->str) && (Dev_DistSqr_to_Special_Point(gci->nbrs_cp[itr],atom1->x)<=cutoff_ji)))
-            if( ((g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) 
-                        && (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff)) 
-                    || ((g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) 
-                        && (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff_ji)))
-            {
-                //max = gcj->end - gcj->str;
-                max = g.end[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] - g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)];
-                tnbr[threadIdx.x] = 0;
-                nbrgen = false;
-                //m = lane_id  + gcj->str; //0-31
-                m = lane_id  + g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; //0-31
-                int loopcount = max / __THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 0 : 1);
-                int iterations = 0;
-
-                // pick up another atom from the neighbor cell
-                //for( m = gcj->str; m < gcj->end; ++m ) 
-                while (iterations < loopcount) {
-                    tnbr [threadIdx.x] = 0;
-                    nbrgen = false;
-
-                    //if(( l < m ) && (m < gcj->end)) { // prevent recounting same pairs within a gcell 
-                    if(( l < m ) && (m < g.end [index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)])) { // prevent recounting same pairs within a gcell 
-                        atom2 = &(my_atoms[m]);
-                        dvec[0] = atom2->x[0] - atom1->x[0];
-                        dvec[1] = atom2->x[1] - atom1->x[1];
-                        dvec[2] = atom2->x[2] - atom1->x[2];
-                        d = rvec_Norm_Sqr( dvec );
-                        if( d <= cutoff ) { 
-                            tnbr [threadIdx.x] = 1;
-                            nbrgen = true;
-                        }
-                    }
-
-                    //if(( l > m ) && (m < gcj->end)) {
-                    if(( l > m ) && (m < g.end[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)])) {
-                        atom2 = &(my_atoms[m]);
-                        dvec[0] = atom1->x[0] - atom2->x[0];
-                        dvec[1] = atom1->x[1] - atom2->x[1];
-                        dvec[2] = atom1->x[2] - atom2->x[2];
-                        d = rvec_Norm_Sqr( dvec );
-                        if( d <= cutoff_ji ) { 
-                            tnbr [threadIdx.x] = 1;
-                            nbrgen = true;
-                        }
-                    } 
-
-                    //is neighbor generated
-                    if (nbrgen)
-                    {
-                        //do leader selection here
-                        leader = -1;
-                        for (ll = my_bucket *__THREADS_PER_ATOM__; ll < (my_bucket)*__THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; ll++)
-                            if (tnbr[ll]){
-                                leader = ll;
-                                break;
-                            }
-
-                        //do the reduction;
-                        if (threadIdx.x == leader)
-                            for (ll = 1; ll < __THREADS_PER_ATOM__; ll++)
-                                tnbr [my_bucket * __THREADS_PER_ATOM__ + ll] += tnbr [my_bucket * __THREADS_PER_ATOM__ + (ll-1)];
-                    }
-
-                    if (nbrgen)
-                    {
-                        //got the indices
-                        nbr_data = my_start + nbrssofar[my_bucket] + tnbr [threadIdx.x] - 1;
-                        nbr_data->nbr = m;
-                        if (l < m) {
-                            dvec[0] = atom2->x[0] - atom1->x[0];
-                            dvec[1] = atom2->x[1] - atom1->x[1];
-                            dvec[2] = atom2->x[2] - atom1->x[2];
-                            d = rvec_Norm_Sqr( dvec );
-                            nbr_data->d = SQRT (d);
-                            rvec_Copy( nbr_data->dvec, dvec );
-                            //ivec_ScaledSum( nbr_data->rel_box, 1, gcj->rel_box, -1, gci->rel_box );
-                            ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
-                                    -1, g.rel_box[index_grid_3d( i, j, k, &g)] );
-                        } 
-                        else {
-                            dvec[0] = atom1->x[0] - atom2->x[0];
-                            dvec[1] = atom1->x[1] - atom2->x[1];
-                            dvec[2] = atom1->x[2] - atom2->x[2];
-                            d = rvec_Norm_Sqr( dvec );
-                            nbr_data->d = SQRT(d);
-                            rvec_Copy( nbr_data->dvec, dvec );
-                            //ivec_ScaledSum( nbr_data->rel_box, -1, gcj->rel_box, 1, gci->rel_box );
-                            /*
-                               CHANGE ORIGINAL
-                               This is a bug in the original code 
-                               ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
-                               -1, g.rel_box[index_grid_3d( i, j, k, &g)] );
-                             */
-                            ivec_ScaledSum( nbr_data->rel_box, -1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
-                                    1, g.rel_box[index_grid_3d( i, j, k, &g)] );
-                        }
-
-                        if (threadIdx.x == leader)
-                            nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)];
-                    }
-
-                    m += __THREADS_PER_ATOM__;
-                    iterations ++;
-
-                    //cleanup
-                    nbrgen = false;
-                    tnbr [threadIdx.x] = 0;
-                }
-                }
-                ++itr;
-                }   
-
-                if (lane_id == 0)
-                    Dev_Set_End_Index (l, num_far + nbrssofar[my_bucket], &far_nbrs);
-                //Dev_Set_End_Index( l, num_far, &far_nbrs );
-            }
-
-
-
-            CUDA_GLOBAL void ker_count_total_nbrs (reax_list far_nbrs, int N, int *result)
-            {
-                //strided access
-                extern __shared__ int count[];
-                unsigned int i = threadIdx.x;
-                int my_count = 0;
-                count[i] = 0;
-
-                for (i = threadIdx.x; i < N; i += threadIdx.x + blockDim.x)
-                    count[threadIdx.x] += Dev_Num_Entries (i, &far_nbrs);
-
-                __syncthreads ();
-
-                for (int offset = blockDim.x/2; offset > 0; offset >>=1 )
-                    if(threadIdx.x < offset)
-                        count [threadIdx.x] += count [threadIdx.x + offset];
-
-                __syncthreads ();
-
-                if (threadIdx.x == 0)
-                    *result = count [threadIdx.x];
-            }
-
-            extern "C" void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, 
-                    storage *workspace, reax_list **lists )
-            {
-                int blocks, num_far;
-                int *d_num_far = (int *) scratch;
-#if defined(LOG_PERFORMANCE)
-                real t_start=0, t_elapsed=0;
-
-                if( system->my_rank == MASTER_NODE )
-                    t_start = Get_Time( );
-#endif
-
-                cuda_memset (d_num_far, 0, sizeof (int), "num_far");
-
-                //invoke the kernel here
-                //one thread per atom implementation
-                /*
-                   blocks = (system->N / NBRS_BLOCK_SIZE) + 
-                   ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
-                   ker_generate_neighbor_lists <<<blocks, NBRS_BLOCK_SIZE>>>
-                   (system->d_my_atoms, system->my_ext_box, system->d_my_grid,
-                 *(*dev_lists + FAR_NBRS), system->n, system->N);
-                 cudaThreadSynchronize ();
-                 cudaCheckError ();
-                 */
-
-                //Multiple threads per atom implementation
-                blocks = ((system->N * NB_KER_THREADS_PER_ATOM) / NBRS_BLOCK_SIZE) + 
-                    (((system->N * NB_KER_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
-                ker_mt_generate_neighbor_lists <<<blocks, NBRS_BLOCK_SIZE, 
-                                   //sizeof (int) * (NBRS_BLOCK_SIZE + (NBRS_BLOCK_SIZE / NB_KER_THREADS_PER_ATOM)) >>>
-                                   sizeof (int) *  2 * (NBRS_BLOCK_SIZE) >>>
-                                       (system->d_my_atoms, system->my_ext_box, system->d_my_grid,
-                                    *(*dev_lists + FAR_NBRS), system->n, system->N);
-                cudaThreadSynchronize ();
-                cudaCheckError ();
-
-                /*
-                   ker_count_total_nbrs  <<<1, NBRS_BLOCK_SIZE, sizeof (int) * NBRS_BLOCK_SIZE>>>
-                   (*(*dev_lists + FAR_NBRS), system->N, d_num_far);
-                   cudaThreadSynchronize ();
-                   cudaCheckError ();
-                   copy_host_device (&num_far, d_num_far, sizeof (int), cudaMemcpyDeviceToHost, "num_far");
-                 */
-
-                int *index = (int *) host_scratch;
-                memset (index , 0, 2 * sizeof (int) * system->N);
-                int *end_index = index + system->N;
-
-                copy_host_device (index, (*dev_lists + FAR_NBRS)->index, 
-                        sizeof (int) * (*dev_lists + FAR_NBRS)->n, cudaMemcpyDeviceToHost, "nbrs:index");
-                copy_host_device (end_index, (*dev_lists + FAR_NBRS)->end_index, 
-                        sizeof (int) * (*dev_lists + FAR_NBRS)->n, cudaMemcpyDeviceToHost, "nbrs:end_index");
-
-                num_far = 0;
-                for (int i = 0; i < system->N; i++)
-                    num_far = end_index[i] - index[i];
-
-                dev_workspace->realloc.num_far = num_far;
-
-#if defined(LOG_PERFORMANCE)
-                if( system->my_rank == MASTER_NODE ) {
-                    t_elapsed = Get_Timing_Info( t_start );
-                    data->timing.nbrs += t_elapsed;
-                }
-#endif
-
-#if defined(DEBUG_FOCUS)  
-                fprintf( stderr, "p%d @ step%d: nbrs done - num_far=%d\n", 
-                        system->my_rank, data->step, num_far );
-                MPI_Barrier( MPI_COMM_WORLD );
-#endif
-            }
-
-            CUDA_GLOBAL void ker_estimate_neighbors (    reax_atom *my_atoms, 
-                    simulation_box my_ext_box,
-                    grid g,
-                    int n,
-                    int N, 
-                    int *indices)
-            {
-                int  i, j, k, l, m, itr, num_far;
-                real d, cutoff;
-                rvec dvec, c;
-                ivec nbrs_x;
-                grid_cell *gci, *gcj;
-                far_neighbor_data *nbr_data;//, *my_start;
-                reax_atom *atom1, *atom2;
-
-                l = blockIdx.x * blockDim.x  + threadIdx.x;
-                if (l >= N) return;
-
-                num_far = 0;
-                atom1 = &(my_atoms[l]);
-                indices [l] = 0;
-
-                //if (atom1->orig_id < 0) return;
-
-                //get the coordinates of the atom and 
-                //compute the grid cell
-                if (l < n) {
-                    for (i = 0; i < 3; i++)
-                    {
-                        c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]);   
-                        if( c[i] >= g.native_end[i] )
-                            c[i] = g.native_end[i] - 1;
-                        else if( c[i] < g.native_str[i] )
-                            c[i] = g.native_str[i];
-                    }
-                } else {
-                    for (i = 0; i < 3; i++)
-                    {
-                        c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
-                        if( c[i] < 0 ) c[i] = 0;
-                        else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1;
-                    }
-                }
-
-                i = c[0];
-                j = c[1];
-                k = c[2];
-
-                //gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] );
-                //cutoff = SQR(gci->cutoff);
-                cutoff = SQR(g.cutoff [index_grid_3d (i, j, k, &g) ]);
-
-                itr = 0;
-                while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0) { 
-                    ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
-                    //gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
-
-                    if( //(g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) &&  
-                            (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) 
-                    {
-                        // pick up another atom from the neighbor cell 
-                        for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
-                                m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
-                        {
-                            if( l < m ) { // prevent recounting same pairs within a gcell 
-                                atom2 = &(my_atoms[m]);
-                                dvec[0] = atom2->x[0] - atom1->x[0];
-                                dvec[1] = atom2->x[1] - atom1->x[1];
-                                dvec[2] = atom2->x[2] - atom1->x[2];
-                                d = rvec_Norm_Sqr( dvec );
-                                if( d <= cutoff ) { 
-                                    num_far ++;
-                                }
-                            }   
-                        }
-                    }
-                    ++itr;
-
-                }   
-
-                itr = 0;
-                while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0) { 
-                    ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
-                    //gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
-                    cutoff = SQR(g.cutoff[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]);
-
-                    if( g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
-                            (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) 
-                    {
-                        // pick up another atom from the neighbor cell 
-                        for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
-                                m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
-                        {
-                            if( l > m ) { // prevent recounting same pairs within a gcell 
-                                atom2 = &(my_atoms[m]);
-                                dvec[0] = atom2->x[0] - atom1->x[0];
-                                dvec[1] = atom2->x[1] - atom1->x[1];
-                                dvec[2] = atom2->x[2] - atom1->x[2];
-                                d = rvec_Norm_Sqr( dvec );
-                                if( d <= cutoff ) { 
-                                    num_far ++;
-                                }
-                            }   
-                        }
-                    }
-                    ++itr;
-                }   
-
-                indices [l] = num_far;// * SAFE_ZONE;
-            }
-
-            void Cuda_Estimate_Neighbors( reax_system *system, int *nbr_indices )
-            {
-                int blocks, num_nbrs;
-                int *indices = (int *) scratch;
-                reax_list *far_nbrs;
-
-                cuda_memset (indices, 0, sizeof (int) * system->total_cap, 
-                        "neighbors:indices");
-
-                blocks = system->N / DEF_BLOCK_SIZE + 
-                    ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-                ker_estimate_neighbors <<< blocks, DEF_BLOCK_SIZE >>>
-                    (system->d_my_atoms, (system->my_ext_box), system->d_my_grid, 
-                     system->n, system->N, indices);
-                cudaThreadSynchronize ();
-                cudaCheckError ();
-
-                copy_host_device (nbr_indices, indices, sizeof (int) * system->total_cap, 
-                        cudaMemcpyDeviceToHost, "nbrs:indices");
-            }
-
-            void Cuda_Init_Neighbors_Indices (int *indices, int entries)
-            {
-                reax_list *far_nbrs = *dev_lists + FAR_NBRS;
-
-                copy_host_device (indices, (far_nbrs->index + 1), (entries -1) * sizeof (int), 
-                        cudaMemcpyHostToDevice, "nbrs:index");
-                copy_host_device (indices, (far_nbrs->end_index + 1), (entries-1) * sizeof (int), 
-                        cudaMemcpyHostToDevice, "nbrs:end_index");
-            }
-
-            void Cuda_Init_HBond_Indices (int *indices, int entries)
-            {
-                reax_list *hbonds = *dev_lists + HBONDS;
-
-                for (int i = 1 ; i < entries; i++)
-                    indices [i] += indices [i-1];
-
-                copy_host_device (indices, hbonds->index + 1, (entries-1) * sizeof (int), 
-                        cudaMemcpyHostToDevice, "hbonds:index");
-                copy_host_device (indices, hbonds->end_index + 1, (entries-1) * sizeof (int), 
-                        cudaMemcpyHostToDevice, "hbonds:end_index");
-            }
-
-            void Cuda_Init_Bond_Indices (int *indices, int entries, int num_intrs)
-            {
-                reax_list *bonds = *dev_lists + BONDS;
-
-                indices[0] = MAX( indices[0]*2, MIN_BONDS);
-                for (int i = 1 ; i < entries; i++) {
-                    indices[i] = MAX( indices[i]*2, MIN_BONDS);
-                }
-
-                for (int i = 1 ; i < entries; i++) {
-                    indices[i] += indices[i-1];
-                }
-
-                copy_host_device (indices, (bonds->index + 1), (entries - 1) * sizeof (int), 
-                        cudaMemcpyHostToDevice, "bonds:index");
-                copy_host_device (indices, (bonds->end_index + 1), (entries - 1) * sizeof (int), 
-                        cudaMemcpyHostToDevice, "bonds:end_index");
-
-                for (int i = 1 ; i < entries; i++)
-                    if (indices [i] > num_intrs) {
-                        fprintf (stderr, "We have a problem here ==> %d index: %d, num_intrs: %d \n", 
-                                i, indices[i], num_intrs);    
-                        exit (-1);
-                    }
-            }
-
-            /*
-
-               CUDA_GLOBAL void ker_validate_neighbors (reax_atom *my_atoms, 
-               reax_list far_nbrs, 
-               int N)
-               {
-               int i, j, pj;
-               far_neighbor_data *nbr_pj;
-               reax_atom *atom_i;
-               int start_i, end_i;
-
-               i = blockIdx.x * blockDim.x + threadIdx.x;
-               if (i >= N) return;
-
-               atom_i = &( my_atoms[i] );
-               start_i = Dev_Start_Index (i, &far_nbrs );
-               end_i = Dev_End_Index (i, &far_nbrs );
-
-               for( pj = start_i; pj < end_i; ++pj ) {
-               nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-               j = nbr_pj->nbr;
-               nbr_pj->d = 0;
-               rvec_MakeZero (nbr_pj->dvec);
-               }
-               }
-
-               void validate_neighbors (reax_system *system)
-               {
-               int blocks;
-               blocks = (system->N / NBRS_BLOCK_SIZE) + 
-               ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
-               ker_validate_neighbors <<< blocks, NBRS_BLOCK_SIZE>>>
-               (system->d_my_atoms, *(*dev_lists + FAR_NBRS), system->N);
-               cudaThreadSynchronize ();
-               cudaCheckError ();
-
-               fprintf (stderr, " Neighbors validated and is fine... \n");
-               }
-
-             */
diff --git a/PG-PuReMD/src/cuda_neighbors.h b/PG-PuReMD/src/cuda_neighbors.h
deleted file mode 100644
index 01adb40b4aea61f31354a4e35f6c86a20c6f2303..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_neighbors.h
+++ /dev/null
@@ -1,26 +0,0 @@
-
-#ifndef __CUDA_NEIGHBORS_H__
-#define __CUDA_NEIGHBORS_H__
-
-#include "reax_types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-void Cuda_Generate_Neighbors (reax_system *, simulation_data *, storage *, reax_list **);
-void Cuda_Estimate_Neighbors( reax_system *, int *);
-void Cuda_Init_Neighbors_Indices (int *, int);
-
-void Cuda_Init_HBond_Indices (int *, int);
-void Cuda_Init_Bond_Indices (int *, int, int);
-
-//void validate_neighbors (reax_system *system);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/cuda_nonbonded.cu b/PG-PuReMD/src/cuda_nonbonded.cu
deleted file mode 100644
index 15eae7bc7a1bc370f6c7037f197913a8aa4d4dd9..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_nonbonded.cu
+++ /dev/null
@@ -1,619 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "cuda_nonbonded.h"
-#include "reax_types.h"
-#include "index_utils.h"
-#include "dev_list.h"
-#include "vector.h"
-#include "cuda_utils.h"
-#include "reduction.h"
-
-#include "cuda_shuffle.h"
-
-CUDA_GLOBAL void ker_vdW_coulomb_energy( 
-        //CUDA_GLOBAL void __launch_bounds__ (960) ker_vdW_coulomb_energy(    
-        reax_atom *my_atoms, 
-        two_body_parameters *tbp,
-        global_parameters gp, 
-        control_params *control, 
-        storage p_workspace, 
-        reax_list p_far_nbrs, 
-        int n, int N, int num_atom_types, 
-        real *data_e_vdW, real *data_e_ele, 
-        rvec *data_ext_press)
-        {
-
-#if defined(__SM_35__)
-        real sh_vdw;
-        real sh_ele;
-        rvec sh_force;
-
-#else
-
-        extern __shared__ real _vdw[];
-        extern __shared__ real _ele[];
-        extern __shared__ rvec _force [];
-
-        real *sh_vdw;
-        real *sh_ele;
-        rvec *sh_force;
-
-#endif
-
-
-        int i, j, pj, natoms;
-        int start_i, end_i, orig_i, orig_j;
-        real p_vdW1, p_vdW1i;
-        real powr_vdW1, powgi_vdW1;
-        real tmp, r_ij, fn13, exp1, exp2;
-        real Tap, dTap, dfn13, CEvd, CEclmb, de_core;
-        real dr3gamij_1, dr3gamij_3;
-        real e_ele, e_vdW, e_core;
-        rvec temp, ext_press;
-        two_body_parameters *twbp;
-        far_neighbor_data *nbr_pj;
-        reax_list *far_nbrs;
-        storage *workspace = &( p_workspace );
-        // rtensor temp_rtensor, total_rtensor;
-
-        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-        int warpid = thread_id / VDW_KER_THREADS_PER_ATOM;
-        int laneid = thread_id & (VDW_KER_THREADS_PER_ATOM -1); 
-
-#if defined(__SM_35__)
-        sh_vdw = 0.0;
-        sh_ele = 0.0;
-        rvec_MakeZero ( sh_force );
-#else
-        sh_vdw = _vdw;
-        sh_ele = _vdw + blockDim.x;
-        sh_force = (rvec *)( _vdw + 2*blockDim.x);
-
-        sh_vdw[threadIdx.x] = 0.0;
-        sh_ele[threadIdx.x] = 0.0;
-        rvec_MakeZero ( sh_force [threadIdx.x] );
-#endif
-
-        //i = blockIdx.x * blockDim.x + threadIdx.x;
-        //if (i >= N) return;
-        i = warpid;
-
-        if (i < N)
-        {
-            natoms = n;
-            far_nbrs = &( p_far_nbrs );
-            p_vdW1 = gp.l[28];
-            p_vdW1i = 1.0 / p_vdW1;
-            e_core = 0;
-            e_vdW = 0;
-
-            data_e_vdW [i] = 0;
-            data_e_ele [i] = 0;
-
-            //for( i = 0; i < natoms; ++i ) {
-            start_i = Dev_Start_Index(i, far_nbrs);
-            end_i   = Dev_End_Index(i, far_nbrs);
-            orig_i  = my_atoms[i].orig_id;
-            //fprintf( stderr, "i:%d, start_i: %d, end_i: %d\n", i, start_i, end_i );
-
-            //for( pj = start_i; pj < end_i; ++pj )
-            pj = start_i + laneid;
-            while (pj < end_i)
-            {
-
-                nbr_pj = &(far_nbrs->select.far_nbr_list[pj]);
-                j = nbr_pj->nbr;
-                orig_j  = my_atoms[j].orig_id;
-
-                if( nbr_pj->d <= control->nonb_cut && 
-                        (((i < j) && (i < natoms) && (j < natoms || orig_i < orig_j))
-                         || ((i > j) && (i < natoms) && (j < natoms)) 
-                         || (i > j && i >= natoms && j < natoms && orig_j < orig_i)))
-                { // ji with j >= n
-                    r_ij = nbr_pj->d;
-                    twbp = &(tbp[ index_tbp (my_atoms[i].type, my_atoms[j].type, num_atom_types) ]);
-
-                    /* Calculate Taper and its derivative */
-                    // Tap = nbr_pj->Tap;   -- precomputed during compte_H
-                    Tap = workspace->Tap[7] * r_ij + workspace->Tap[6];
-                    Tap = Tap * r_ij + workspace->Tap[5];
-                    Tap = Tap * r_ij + workspace->Tap[4];
-                    Tap = Tap * r_ij + workspace->Tap[3];
-                    Tap = Tap * r_ij + workspace->Tap[2];
-                    Tap = Tap * r_ij + workspace->Tap[1];
-                    Tap = Tap * r_ij + workspace->Tap[0];
-
-                    dTap = 7*workspace->Tap[7] * r_ij + 6*workspace->Tap[6];
-                    dTap = dTap * r_ij + 5*workspace->Tap[5];
-                    dTap = dTap * r_ij + 4*workspace->Tap[4];
-                    dTap = dTap * r_ij + 3*workspace->Tap[3];
-                    dTap = dTap * r_ij + 2*workspace->Tap[2];
-                    dTap += workspace->Tap[1]/r_ij;
-
-                    /*vdWaals Calculations*/
-                    if(gp.vdw_type==1 || gp.vdw_type==3)
-                    { // shielding
-                        powr_vdW1 = POW(r_ij, p_vdW1);
-                        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-
-                        fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-                        exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-                        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-
-                        e_vdW = twbp->D * (exp1 - 2.0 * exp2);      
-
-                        //data_e_vdW [i] += Tap * e_vdW;
-                        //     data_e_vdW [i] += Tap * e_vdW / 2.0;
-#if defined(__SM_35__)
-                        sh_vdw  += Tap * e_vdW / 2.0;
-#else
-                        sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0;
-#endif
-
-                        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-                            POW(r_ij, p_vdW1 - 2.0);
-
-                        CEvd = dTap * e_vdW - 
-                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
-                    }
-                    else{ // no shielding
-                        exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-                        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-
-                        e_vdW = twbp->D * (exp1 - 2.0 * exp2);
-
-                        //data_e_vdW [i] += Tap * e_vdW;
-                        //data_e_vdW [i] += Tap * e_vdW / 2.0;
-#if defined(__SM_35__)
-                        sh_vdw += Tap * e_vdW / 2.0;
-#else
-                        sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0;
-#endif
-
-                        CEvd = dTap * e_vdW - 
-                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
-                    }
-
-                    if(gp.vdw_type==2 || gp.vdw_type==3)
-                    { // innner wall
-                        e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
-
-                        //data_e_vdW [i] += Tap * e_core;
-                        //data_e_vdW [i] += Tap * e_core / 2.0;
-#if defined(__SM_35__)
-                        sh_vdw += Tap * e_core / 2.0;
-#else
-                        sh_vdw[ threadIdx.x ] += Tap * e_core / 2.0;
-#endif
-
-                        de_core = -(twbp->acore/twbp->rcore) * e_core;
-                        CEvd += dTap * e_core + Tap * de_core;
-                    }
-
-                    /*Coulomb Calculations*/
-                    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-                    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
-
-                    tmp = Tap / dr3gamij_3;
-                    //data_e_ele [i] += e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp;
-                    e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp;
-                    //data_e_ele [i] += e_ele;
-                    //data_e_ele [i] += e_ele  / 2.0;
-#if defined(__SM_35__)
-                    sh_ele += e_ele  / 2.0;
-#else
-                    sh_ele [ threadIdx.x ] += e_ele  / 2.0;
-#endif
-
-
-                    CEclmb = C_ele * my_atoms[i].q * my_atoms[j].q * 
-                        ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-                    // fprintf( fout, "%5d %5d %10.6f %10.6f\n",
-                    //   MIN( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ),
-                    //   MAX( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ), 
-                    //   CEvd, CEclmb );                  
-
-                    if( control->virial == 0 ) {
-                        if ( i < j ) 
-                            //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
-#if defined (__SM_35__)
-                            rvec_ScaledAdd( sh_force, -(CEvd + CEclmb), nbr_pj->dvec );
-#else
-                        rvec_ScaledAdd( sh_force[ threadIdx.x ], -(CEvd + CEclmb), nbr_pj->dvec );
-#endif
-                        else 
-                            //rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec );
-#if defined (__SM_35__)
-                            rvec_ScaledAdd( sh_force , +(CEvd + CEclmb), nbr_pj->dvec );
-#else
-                        rvec_ScaledAdd( sh_force [ threadIdx.x ], +(CEvd + CEclmb), nbr_pj->dvec );
-#endif
-                        //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec );
-                    }
-                    else { /* NPT, iNPT or sNPT */
-                        /* for pressure coupling, terms not related to bond order 
-                           derivatives are added directly into pressure vector/tensor */
-                        rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-
-                        rvec_ScaledAdd( workspace->f[i], -1., temp );
-                        rvec_Add( workspace->f[j], temp );
-
-                        rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-                        rvec_Add( data_ext_press [i], ext_press );
-
-                        // fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)
-                        //   force(%f %f %f) ext_press (%12.6f %12.6f %12.6f)\n", 
-                        //   i, j, nbr_pj->rel_box[0], nbr_pj->rel_box[1], nbr_pj->rel_box[2],
-                        //   temp[0], temp[1], temp[2],
-                        //   data->ext_press[0], data->ext_press[1], data->ext_press[2] );
-                    }
-
-#ifdef TEST_ENERGY
-                    // fprintf( out_control->evdw, 
-                    // "%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f\n", 
-                    // workspace->Tap[7],workspace->Tap[6],workspace->Tap[5],
-                    // workspace->Tap[4],workspace->Tap[3],workspace->Tap[2], 
-                    // workspace->Tap[1], Tap );
-                    //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
-                    fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n",
-                            system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
-                            r_ij, e_vdW, data->my_en.e_vdW );
-                    //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                    fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-                            system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
-                            r_ij, system->my_atoms[i].q, system->my_atoms[j].q, 
-                            e_ele, data->my_en.e_ele );
-#endif
-#ifdef TEST_FORCES
-                    rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
-                    rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
-                    rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
-                    rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
-#endif
-                }
-
-                pj += VDW_KER_THREADS_PER_ATOM;
-
-            }
-            //  }
-        } // if i < N
-
-#if defined( __SM_35__)
-        for (int x = VDW_KER_THREADS_PER_ATOM >> 1; x >= 1; x/=2){
-            sh_vdw += shfl( sh_vdw, x);
-            sh_ele += shfl( sh_ele, x );
-            sh_force[0] += shfl( sh_force[0], x );
-            sh_force[1] += shfl( sh_force[1], x );
-            sh_force[2] += shfl( sh_force[2], x );
-        }
-
-        if (laneid == 0) {
-            data_e_vdW[i] += sh_vdw;
-            data_e_ele[i] += sh_ele;
-            rvec_Add (workspace->f[i], sh_force );
-        }
-
-#else
-
-        __syncthreads ();
-
-        if (laneid < 16) {
-            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16];
-            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16];
-            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] );
-        }
-        __syncthreads ();
-        if (laneid < 8) {
-            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8];
-            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8];
-            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] );
-        }
-        __syncthreads ();
-        if (laneid < 4) {
-            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4];
-            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4];
-            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] );
-        }
-        __syncthreads ();
-        if (laneid < 2) {
-            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2];
-            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2];
-            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] );
-        }
-        __syncthreads ();
-        if (laneid < 1) {
-            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1];
-            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1];
-            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] );
-        }
-        __syncthreads ();
-        if (laneid == 0) {
-            data_e_vdW[i] += sh_vdw[threadIdx.x];
-            data_e_ele[i] += sh_ele[threadIdx.x];
-            rvec_Add (workspace->f[i], sh_force [ threadIdx.x ]);
-        }
-#endif
-
-        }
-
-
-CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, 
-        global_parameters gp, 
-        control_params *control, 
-        storage p_workspace, 
-        reax_list p_far_nbrs, 
-        LR_lookup_table *t_LR,
-        int n, int N, int num_atom_types, 
-        int step, int prev_steps, 
-        int energy_update_freq, 
-        real *data_e_vdW, real *data_e_ele, 
-        rvec *data_ext_press)
-{
-    int i, j, pj, r, natoms, steps, update_freq, update_energies;
-    int type_i, type_j, tmin, tmax;
-    int start_i, end_i, orig_i, orig_j;
-    real r_ij, base, dif;
-    real e_vdW, e_ele;
-    real CEvd, CEclmb;
-    rvec temp, ext_press;
-    far_neighbor_data *nbr_pj;
-    reax_list *far_nbrs;
-    LR_lookup_table *t;
-
-    storage *workspace = &( p_workspace );
-
-    natoms = n;
-    far_nbrs = &( p_far_nbrs );
-    steps = step - prev_steps;
-    update_freq = energy_update_freq;
-    update_energies = update_freq > 0 && steps % update_freq == 0;
-    e_ele = e_vdW = 0;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-
-    data_e_vdW [i] = 0;
-    data_e_ele [i] = 0;
-
-    //for( i = 0; i < natoms; ++i ) {
-    type_i  = my_atoms[i].type;
-    start_i = Dev_Start_Index(i,far_nbrs);
-    end_i   = Dev_End_Index(i,far_nbrs);
-    orig_i  = my_atoms[i].orig_id;
-
-    for( pj = start_i; pj < end_i; ++pj ) {
-        nbr_pj = &(far_nbrs->select.far_nbr_list[pj]);
-        j = nbr_pj->nbr;
-        orig_j  = my_atoms[j].orig_id;
-
-        //if( nbr_pj->d <= control->nonb_cut && (j < natoms || orig_i < orig_j) ) {
-        if( nbr_pj->d <= control->nonb_cut && 
-                (((i < j) && (i < natoms) && (j < natoms || orig_i < orig_j))
-                 || ((i > j) && (i < natoms) && (j < natoms)) 
-                 || (i > j && i >= natoms && j < natoms && orig_j < orig_i)))
-        { // ji with j >= n
-            j = nbr_pj->nbr;
-            type_j = my_atoms[j].type;
-            r_ij   = nbr_pj->d;
-            tmin  = MIN( type_i, type_j );
-            tmax  = MAX( type_i, type_j );
-
-            t = &( t_LR[ index_lr (tmin, tmax, num_atom_types) ]);    
-
-            // table = &( LR[type_i][type_j] ); 
-
-            /* Cubic Spline Interpolation */
-            r = (int)(r_ij * t->inv_dx);
-            if( r == 0 )  ++r;
-            base = (real)(r+1) * t->dx;
-            dif = r_ij - base;
-            //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif);
-
-            if( update_energies ) {
-                e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
-                    t->vdW[r].a;
-
-                e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-                    t->ele[r].a;
-                e_ele *= my_atoms[i].q * my_atoms[j].q;
-
-                //data_e_vdW [i] += e_vdW;
-                data_e_vdW [i] += e_vdW / 2.0;
-                //data_e_ele [i] += e_ele;
-                data_e_ele [i] += e_ele / 2.0;
-            }    
-
-            CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
-                t->CEvd[r].a;
-
-            CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
-                t->CEclmb[r].a;
-            CEclmb *= my_atoms[i].q * my_atoms[j].q;
-
-            if( control->virial == 0 ) {
-                if ( i < j ) 
-                    rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
-                else 
-                    rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec );
-                //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
-                //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec );
-            }
-            else { // NPT, iNPT or sNPT
-                /* for pressure coupling, terms not related to bond order derivatives
-                   are added directly into pressure vector/tensor */
-                rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-
-                rvec_ScaledAdd( workspace->f[i], -1., temp );
-                rvec_Add( workspace->f[j], temp );
-
-                rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-                rvec_Add( data_ext_press [i], ext_press );
-            }
-
-#ifdef TEST_ENERGY
-            //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
-            fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n",
-                    system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
-                    r_ij, e_vdW, data->my_en.e_vdW );
-            //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-            fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-                    system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
-                    r_ij, system->my_atoms[i].q, system->my_atoms[j].q, 
-                    e_ele, data->my_en.e_ele );
-#endif
-#ifdef TEST_FORCES
-            rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
-            rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
-            rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
-            rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
-#endif
-        }
-    }
-    //  }
-}
-
-CUDA_GLOBAL void ker_pol_energy (reax_atom *my_atoms, 
-        single_body_parameters *sbp, 
-        int n, 
-        real *data_e_pol)
-{
-    int type_i;
-    real q;
-
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= n) return;
-
-    data_e_pol [i] = 0;
-
-    //for( i = 0; i < system->n; i++ ) {
-    q = my_atoms[i].q;
-    type_i = my_atoms[i].type;
-
-    data_e_pol[i] += 
-        KCALpMOL_to_EV * (sbp[type_i].chi * q + 
-                (sbp[type_i].eta / 2.) * SQR(q));
-    //}
-}
-
-void Cuda_Compute_Polarization_Energy( reax_system *system, simulation_data *data )
-{
-    int blocks;
-    real *spad = (real *) scratch;
-    cuda_memset (spad, 0, sizeof (real) * 2 * system->n, "pol_energy");
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    ker_pol_energy <<< blocks, DEF_BLOCK_SIZE >>>
-        ( system->d_my_atoms, system->reax_param.d_sbp, 
-          system->n, spad );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //Reduction for polarization energy
-    k_reduction <<< blocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>>
-        ( spad, spad + system->n, system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<< 1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2>>>
-        ( spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_pol, blocks);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-}
-
-void Cuda_NonBonded_Energy ( reax_system *system, control_params *control, 
-        storage *workspace, simulation_data *data,  reax_list **lists,
-        output_controls *out_control, bool isTabulated )
-{
-    int blocks;
-    int rblocks;
-    int size = (2 * system->N + 2 * system->N ) * sizeof (real) + 
-        2 * system->N * sizeof (rvec);
-
-    rvec *spad_rvec;
-    real *spad = (real *) scratch;
-    cuda_memset (spad, 0, size, "pol_energy");
-
-    rblocks = system->N / DEF_BLOCK_SIZE + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    blocks = ((system->N * VDW_KER_THREADS_PER_ATOM) / DEF_BLOCK_SIZE) 
-        + (((system->N * VDW_KER_THREADS_PER_ATOM) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    if (!isTabulated) {
-        ker_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE, DEF_BLOCK_SIZE * (2 * sizeof(real) + sizeof(rvec)) >>>
-            ( system->d_my_atoms, system->reax_param.d_tbp, 
-              system->reax_param.d_gp, (control_params *)control->d_control_params, 
-              *(dev_workspace), *(*dev_lists + FAR_NBRS), 
-              system->n, system->N, system->reax_param.num_atom_types, 
-              spad, spad + 2 * system->N, (rvec *)(spad + 4 * system->N));
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-    } else {
-        ker_tabulated_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE >>>
-            ( system->d_my_atoms, system->reax_param.d_gp, 
-              (control_params *)control->d_control_params, 
-              *(dev_workspace), *(*dev_lists + FAR_NBRS), 
-              d_LR, system->n, system->N,
-              system->reax_param.num_atom_types, 
-              data->step, data->prev_steps, 
-              out_control->energy_update_freq,
-              spad, spad + 2 * system->N, 
-              (rvec *)(spad + 4 * system->N));
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-    }
-
-    //reduction for  vdw
-    k_reduction <<< rblocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>>
-        ( spad, spad + system->N, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<< 1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N>>>
-        ( spad + system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, rblocks); 
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //reduction for  ele
-    k_reduction <<< rblocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>>
-        ( spad + 2 * system->N, spad + 3 * system->N, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<< 1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N>>>
-        ( spad + 3 * system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_ele, rblocks);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    //reduction for ext_press
-    spad_rvec = (rvec *) (spad + 4 * system->N);
-    k_reduction_rvec <<< rblocks, DEF_BLOCK_SIZE, sizeof (rvec) * DEF_BLOCK_SIZE >>>
-        ( spad_rvec, spad_rvec + system->N, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N>>>
-        ( spad_rvec + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, rblocks);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    Cuda_Compute_Polarization_Energy( system, data );
-}
diff --git a/PG-PuReMD/src/cuda_qEq.cu b/PG-PuReMD/src/cuda_qEq.cu
deleted file mode 100644
index f5f341ec5af355fbdb59c5f85a0e2439d086677d..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_qEq.cu
+++ /dev/null
@@ -1,201 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "cuda_qEq.h"
-
-#include "reax_types.h"
-#include "reduction.h"
-#include "cuda_utils.h"
-
-#include "validation.h"
-
-
-CUDA_GLOBAL void ker_init_matvec( reax_atom *my_atoms, single_body_parameters
-        *sbp, storage p_workspace, int n  )
-{
-    storage *workspace = &( p_workspace );
-    reax_atom *atom;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (i >= n)
-    {
-        return;
-    }
-
-    //for( i = 0; i < system->n; ++i ) {
-    atom = &( my_atoms[i] );
-
-    /* init pre-conditioner for H and init solution vectors */
-    workspace->Hdia_inv[i] = 1. / sbp[ atom->type ].eta;
-    workspace->b_s[i] = -sbp[ atom->type ].chi;
-    workspace->b_t[i] = -1.0;
-    workspace->b[i][0] = -sbp[ atom->type ].chi;
-    workspace->b[i][1] = -1.0;
-
-    workspace->x[i][1] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
-
-    /* cubic extrapolation for s and t */
-    workspace->x[i][0] = 4*(atom->s[0]+atom->s[2])-(6*atom->s[1]+atom->s[3]);
-    //}
-}
-
-
-void Cuda_Init_MatVec( reax_system *system, storage *workspace )
-{
-    int blocks;
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
-
-    ker_init_matvec <<< blocks, DEF_BLOCK_SIZE >>>
-        ( system->d_my_atoms, system->reax_param.d_sbp, 
-          *dev_workspace, system->n );
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-}
-
-
-void cuda_charges_x(reax_system *system, rvec2 my_sum)
-{
-    int blocks;
-    rvec2 *output = (rvec2 *) scratch;
-    cuda_memset( output, 0, sizeof (rvec2) * 2 * system->n, "cuda_charges_x:q" );
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
-
-    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
-        ( dev_workspace->x, output, system->n );
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-
-    k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>>
-        ( output, output + system->n, blocks );
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-
-    copy_host_device( my_sum, output + system->n, sizeof (rvec2), cudaMemcpyDeviceToHost, "charges:x" );
-}
-
-
-CUDA_GLOBAL void ker_calculate_st (reax_atom *my_atoms, storage p_workspace, 
-        real u, real *q, int n)
-{
-    storage *workspace = &( p_workspace );
-    reax_atom *atom;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (i >= n)
-    {
-        return;
-    }
-
-    //for( i = 0; i < system->n; ++i ) {
-    atom = &( my_atoms[i] );
-
-    //atom->q = workspace->s[i] - u * workspace->t[i];
-    q[i] = atom->q = workspace->x[i][0] - u * workspace->x[i][1];
-
-    atom->s[3] = atom->s[2];
-    atom->s[2] = atom->s[1];
-    atom->s[1] = atom->s[0];
-    //atom->s[0] = workspace->s[i];
-    atom->s[0] = workspace->x[i][0];
-
-    atom->t[3] = atom->t[2];
-    atom->t[2] = atom->t[1];
-    atom->t[1] = atom->t[0];
-    //atom->t[0] = workspace->t[i];
-    atom->t[0] = workspace->x[i][1];
-    //}
-}
-//TODO if we use the function argument (output), we are getting 
-//TODO Address not mapped/Invalid permissions error with segmentation fault
-//TODO so using the local argument, which is a global variable anyways. 
-//TODO NEED TO INVESTIGATE MORE ON THIS ISSUE
-//TODO
-//TODO
-//TODO
-
-
-extern "C" void cuda_charges_st (reax_system *system, storage *workspace, real *output, real u)
-{
-    int blocks;
-    real *tmp = (real *) scratch;
-    real *tmp_output = (real *) host_scratch;
-
-    cuda_memset( tmp, 0, sizeof (real) * system->n, "charges:q" );
-    memset( tmp_output, 0, sizeof (real) * system->n );
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
-
-    ker_calculate_st <<< blocks, DEF_BLOCK_SIZE >>>
-        ( system->d_my_atoms, *dev_workspace, u, tmp, system->n);
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-
-    copy_host_device( output, tmp, sizeof (real) * system->n, 
-            cudaMemcpyDeviceToHost, "charges:q" );
-}
-//TODO
-//TODO
-//TODO
-//TODO
-//TODO
-//TODO
-//TODO
-
-
-CUDA_GLOBAL void ker_update_q(reax_atom *my_atoms, real *q, int n, int N)
-{
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (i >= (N-n))
-    {
-        return;
-    }
-
-    //for( i = system->n; i < system->N; ++i )
-    my_atoms[i + n].q = q[i + n];
-}
-
-
-void cuda_charges_updateq(reax_system *system, real *q) 
-{
-    int blocks;
-    real *dev_q = (real *) scratch;
-
-    copy_host_device( q, dev_q, system->N * sizeof (real),
-            cudaMemcpyHostToDevice, "charges:q" );
-    blocks = (system->N - system->n) / DEF_BLOCK_SIZE +
-        (( (system->N - system->n) % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
-
-    ker_update_q <<< blocks, DEF_BLOCK_SIZE >>>
-        ( system->d_my_atoms, dev_q, system->n, system->N);
-
-    cudaThreadSynchronize();
-    cudaCheckError();
-}
diff --git a/PG-PuReMD/src/cuda_reset_tools.cu b/PG-PuReMD/src/cuda_reset_tools.cu
deleted file mode 100644
index 850a7c5d30fb1a8f6bd2a5db0020bcc2220ac898..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_reset_tools.cu
+++ /dev/null
@@ -1,162 +0,0 @@
-
-#include "cuda_reset_tools.h"
-#include "cuda_utils.h"
-#include "dev_list.h"
-
-CUDA_GLOBAL void ker_reset_hbond_list (reax_atom *my_atoms, 
-        reax_list hbonds, 
-        int N)
-{
-    int Hindex = 0;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-
-    Hindex = my_atoms[i].Hindex;
-    if (Hindex > 1) {
-        Dev_Set_End_Index ( Hindex, Dev_Start_Index (Hindex, &hbonds), &hbonds);
-    }
-}
-
-CUDA_GLOBAL void ker_reset_bond_list (reax_atom *my_atoms, 
-        reax_list bonds, 
-        int N)
-{
-    int Hindex = 0;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-
-    Dev_Set_End_Index ( i, Dev_Start_Index (i, &bonds), &bonds);
-}
-
-extern "C"
-{
-
-    void Cuda_Reset_Workspace (reax_system *system, storage *workspace)
-    {
-        cuda_memset ( dev_workspace->total_bond_order, 0, system->total_cap * sizeof (real), "total_bond_order");
-        cuda_memset ( dev_workspace->dDeltap_self, 0, system->total_cap * sizeof (rvec), "dDeltap_self");
-        cuda_memset ( dev_workspace->CdDelta, 0, system->total_cap * sizeof (real), "CdDelta");
-        cuda_memset ( dev_workspace->f, 0, system->total_cap * sizeof (rvec), "f");
-    }
-
-    CUDA_GLOBAL void ker_reset_hindex (reax_atom *my_atoms, int N)
-    {
-        int Hindex = 0;
-        int i = blockIdx.x * blockDim.x + threadIdx.x;
-        if (i >= N) return;
-
-        my_atoms[i].Hindex = i;
-    }
-
-    void Cuda_Reset_Atoms( reax_system* system, control_params *control )
-    {
-        int i;
-        reax_atom *atom;
-        int blocks;
-
-        /*
-           if( control->hbond_cut > 0 ) 
-        //TODO
-        for( i = 0; i < system->N; ++i ) { 
-        atom = &(system->my_atoms[i]);
-        //if( system->reax_param.sbp[ atom->type ].p_hbond == 1 ) 
-        atom->Hindex = system->numH++;
-        //else atom->Hindex = -1; 
-        }   
-        //TODO
-         */
-        ////////////////////////////////
-        ////////////////////////////////
-        ////////////////////////////////
-        ////////////////////////////////
-        // FIX - 3 - Commented out this line for Hydrogen Bond fix
-        // FIX - HBOND ISSUE
-        // FIX - HBOND ISSUE
-        // FIX - HBOND ISSUE
-        // COMMENTED OUT THIS LINE BELOW
-        //system->numH = system->N;
-        // FIX - HBOND ISSUE
-        // FIX - HBOND ISSUE
-        // FIX - HBOND ISSUE
-        ////////////////////////////////
-        ////////////////////////////////
-        ////////////////////////////////
-        ////////////////////////////////
-        ////////////////////////////////
-
-
-        blocks = system->N / DEF_BLOCK_SIZE + 
-            ((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
-        ker_reset_hindex <<<blocks, DEF_BLOCK_SIZE>>>
-            (system->d_my_atoms, system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-
-    }
-
-    int Cuda_Reset_Neighbor_Lists( reax_system *system, control_params *control,
-            storage *workspace, reax_list **lists )
-    {
-        int i, total_bonds, Hindex, total_hbonds;
-        reax_list *bonds, *hbonds;
-        int blocks;
-
-        if (system->N > 0) {
-            bonds = *dev_lists + BONDS;
-            total_bonds = 0;
-
-            //cuda_memset (bonds->index, 0, sizeof (int) * system->total_cap, "bonds:index");
-            //cuda_memset (bonds->end_index, 0, sizeof (int) * system->total_cap, "bonds:end_index");
-            blocks = system->N / DEF_BLOCK_SIZE + 
-                ((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
-            ker_reset_bond_list <<<blocks, DEF_BLOCK_SIZE>>>
-                (system->d_my_atoms, *(*dev_lists + BONDS), system->N);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-
-            total_bonds = 0;// TODO compute the total bonds here.
-
-            /* is reallocation needed? */
-            if( total_bonds >= bonds->num_intrs * DANGER_ZONE ) { 
-                workspace->realloc.bonds = 1;
-                if( total_bonds >= bonds->num_intrs ) { 
-                    fprintf(stderr, "p%d: not enough space for bonds! total=%d allocated=%d\n", 
-                            system->my_rank, total_bonds, bonds->num_intrs );
-                    return FAILURE;
-                }   
-            }   
-        }
-
-        //HBonds processing
-        //FIX - 4 - Added additional check
-        if( (control->hbond_cut > 0) && (system->numH > 0)) { 
-            hbonds = (*dev_lists) + HBONDS;
-            total_hbonds = 0;
-
-            /* reset start-end indexes */
-            //TODO
-            blocks = system->N / DEF_BLOCK_SIZE + 
-                ((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
-            ker_reset_hbond_list <<<blocks, DEF_BLOCK_SIZE>>>
-                (system->d_my_atoms, *(*dev_lists + HBONDS), system->N);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-
-            //TODO compute the total hbonds here
-            total_hbonds = 0;
-
-            /* is reallocation needed? */
-            if( total_hbonds >= hbonds->num_intrs * 0.90/*DANGER_ZONE*/ ) { 
-                workspace->realloc.hbonds = 1;
-                if( total_hbonds >= hbonds->num_intrs ) {
-                    fprintf(stderr, "p%d: not enough space for hbonds! total=%d allocated=%d\n",
-                            system->my_rank, total_hbonds, hbonds->num_intrs );
-                    return FAILURE;
-                }
-            }
-        }
-
-        return SUCCESS;
-    }
-
-}
diff --git a/PG-PuReMD/src/cuda_reset_tools.h b/PG-PuReMD/src/cuda_reset_tools.h
deleted file mode 100644
index c2d969ab3b387b5614b0c3ac3b42dbe3b44c6c88..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_reset_tools.h
+++ /dev/null
@@ -1,19 +0,0 @@
-
-#ifndef __CUDA_RESET_TOOLS_H__
-#define __CUDA_RESET_TOOLS_H__
-
-#include "reax_types.h"
-
-#ifdef __cplusplus
-extern "C"  {
-#endif
-
-void Cuda_Reset_Workspace (reax_system *, storage *);
-void Cuda_Reset_Atoms (reax_system *, control_params *);
-int  Cuda_Reset_Neighbor_Lists (reax_system *, control_params *, storage *, reax_list **);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/cuda_utils.cu b/PG-PuReMD/src/cuda_utils.cu
deleted file mode 100644
index 2aa875747fc708080d8cfe1692b86cb8738ecca7..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_utils.cu
+++ /dev/null
@@ -1,139 +0,0 @@
-#include "cuda_utils.h"
-
-
-extern "C" void cuda_malloc(void **ptr, int size, int mem_set, const char *msg)
-{
-
-    cudaError_t retVal = cudaSuccess;
-
-    retVal = cudaMalloc( ptr, size );
-
-    if( retVal != cudaSuccess )
-    {
-        fprintf( stderr, "Failed to allocate memory on device for the res: %s...  exiting with code: %d size: %d \n", 
-                msg, retVal, size );
-        exit (-1);
-    }  
-
-    if( mem_set )
-    {
-        retVal = cudaMemset( *ptr, 0, size );
-
-        if( retVal != cudaSuccess )
-        {
-            fprintf( stderr, "Failed to memset memory on device for resource %s\n", 
-                    msg );
-            exit( -1 );
-        }
-    }  
-}
-
-
-extern "C" void cuda_free(void *ptr, const char *msg)
-{
-
-    cudaError_t retVal = cudaSuccess;
-
-    if ( !ptr )
-    {
-        return;
-    }  
-
-    retVal = cudaFree( ptr );
-
-    if( retVal != cudaSuccess )
-    {
-        fprintf( stderr, "Failed to release memory on device for res %s... exiting with code %d -- Address %ld\n", 
-                msg, retVal, (long int) ptr );
-        return;
-    }  
-}
-
-
-extern "C" void cuda_memset(void *ptr, int data, size_t count, const char *msg){
-    cudaError_t retVal = cudaSuccess;
-
-    retVal = cudaMemset( ptr, data, count );
-
-    if( retVal != cudaSuccess )
-    {
-        fprintf( stderr, "Failed to memset memory on device for %s, cuda code %d\n", 
-                msg, retVal );
-        exit( -1 );
-    }
-}
-
-
-extern "C" void copy_host_device(void *host, void *dev, int size, enum cudaMemcpyKind dir, const char *msg)
-{
-    cudaError_t retVal = cudaErrorNotReady;
-
-    if( dir == cudaMemcpyHostToDevice )
-    {
-        retVal = cudaMemcpy( dev, host, size, cudaMemcpyHostToDevice );
-    }
-    else
-    {
-        retVal = cudaMemcpy( host, dev, size, cudaMemcpyDeviceToHost );
-    }
-
-    if( retVal != cudaSuccess )
-    {
-        fprintf( stderr, "could not copy resource %s from host to device: reason %d \n",
-                msg, retVal );
-        exit( -1 );
-    }
-}
-
-
-extern "C" void copy_device(void *dest, void *src, int size, const char *msg)
-{
-    cudaError_t retVal = cudaErrorNotReady;
-
-    retVal = cudaMemcpy( dest, src, size, cudaMemcpyDeviceToDevice );
-    if( retVal != cudaSuccess )
-    {
-        fprintf( stderr, "could not copy resource %s from device to device: reason %d \n",
-                msg, retVal );
-        exit( -1 );
-    }
-}
-
-
-extern "C" void compute_blocks( int *blocks, int *block_size, int count )
-{
-    *block_size = CUDA_BLOCK_SIZE;
-    *blocks = (int) CEIL((double) count / CUDA_BLOCK_SIZE);
-}
-
-
-extern "C" void compute_matvec_blocks( int *blocks, int count )
-{
-
-    *blocks = (int) CEIL((double) count * MATVEC_KER_THREADS_PER_ROW / MATVEC_BLOCK_SIZE);
-}
-
-
-extern "C" void compute_nearest_pow_2(int blocks, int *result)
-{
-
-  *result = (int) EXP2( CEIL( LOG2((double) blocks) ) );
-}
-
-
-extern "C" void print_device_mem_usage()
-{
-    size_t total, free;
-
-    cudaMemGetInfo( &free, &total );
-
-    if ( cudaGetLastError() != cudaSuccess )
-    {
-        fprintf( stderr, "Error on the memory call \n" );
-        return;
-    }
-
-    fprintf( stderr, "Total %ld Mb %ld gig %ld , free %ld, Mb %ld , gig %ld \n", 
-            total, total/(1024*1024), total/ (1024*1024*1024), 
-            free, free/(1024*1024), free/ (1024*1024*1024) );
-}
diff --git a/PG-PuReMD/src/cuda_utils.h b/PG-PuReMD/src/cuda_utils.h
deleted file mode 100644
index f4c6ae2e0049b4af910d8191366de08ac0f79732..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_utils.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef __CUDA_UTILS_H_
-#define __CUDA_UTILS_H_
-
-#include "cuda.h"
-#include "cuda_runtime.h"
-#include "stdlib.h"
-#include "stdio.h"
-
-#include "reax_types.h"
-
-
-#ifdef __cplusplus
-extern "C"  {
-#endif
-
-void compute_blocks(int *, int *, int);
-void compute_nearest_pow_2(int blocks, int *result);
-void compute_matvec_blocks(int *, int);
-
-void cuda_malloc(void **, int , int , const char *);
-void cuda_free(void *, const char *);
-void cuda_memset(void *, int , size_t , const char *);
-void copy_host_device(void *, void *, int , enum cudaMemcpyKind, const char *);
-void copy_device(void *, void *, int , const char *);
-
-void print_device_mem_usage();
-
-#define cudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )
-inline void __cudaCheckError( const char *file, const int line )
-{
-    cudaError err = cudaGetLastError();
-    if ( cudaSuccess != err )
-    {
-        fprintf( stderr, "Failed .. %s:%d -- gpu erro code %d\n", file, line, err );
-        exit( -1 );
-    }
-
-    // More careful checking. However, this will affect performance.
-    // Comment away if needed.
-    /*
-    err = cudaDeviceSynchronize();
-    if( cudaSuccess != err )
-    {
-       exit( -1 );
-    }
-    */
-
-    return;
-}
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/PG-PuReMD/src/cuda_valence_angles.cu b/PG-PuReMD/src/cuda_valence_angles.cu
deleted file mode 100644
index df3be3512567403386d2db009423bbcd7dd21492..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_valence_angles.cu
+++ /dev/null
@@ -1,652 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-  
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-  
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "cuda_valence_angles.h"
-
-#include "index_utils.h"
-#include "dev_list.h"
-#include "vector.h"
-
-/* this is a 3-body interaction in which the main role is 
-   played by j which sits in the middle of the other two. */
-CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, 
-													global_parameters gp, 
-													single_body_parameters *sbp, 
-													three_body_header *d_thbh, 
-													control_params *control, 
-													storage p_workspace, 
-													reax_list p_bonds, reax_list p_thb_intrs, 
-													int n, int N, int num_atom_types, 
-													real *data_e_ang, real *data_e_pen, real *data_e_coa, 
-													rvec *my_ext_press
-													)
-{
-  int i, j, pi, k, pk, t;
-  int type_i, type_j, type_k;
-  int start_j, end_j, start_pk, end_pk;
-  int cnt, num_thb_intrs;
-
-  real temp, temp_bo_jt, pBOjt7;
-  real p_val1, p_val2, p_val3, p_val4, p_val5;
-  real p_val6, p_val7, p_val8, p_val9, p_val10;
-  real p_pen1, p_pen2, p_pen3, p_pen4;
-  real p_coa1, p_coa2, p_coa3, p_coa4;
-  real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
-  real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
-  real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO, vlpadj;
-  real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
-  real CEpen1, CEpen2, CEpen3;
-  real e_ang, e_coa, e_pen;
-  real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
-  real Cf7ij, Cf7jk, Cf8j, Cf9j;
-  real f7_ij, f7_jk, f8_Dj, f9_Dj;
-  real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
-  real r_ij, r_jk;
-  real BOA_ij, BOA_jk;
-  rvec force, ext_press;
-  // rtensor temp_rtensor, total_rtensor;
-
-  three_body_header *thbh;
-  three_body_parameters *thbp;
-  three_body_interaction_data *p_ijk, *p_kji;
-  bond_data *pbond_ij, *pbond_jk, *pbond_jt;
-  bond_order_data *bo_ij, *bo_jk, *bo_jt;
-
-  reax_list *bonds = &( p_bonds );
-  reax_list *thb_intrs =  &( p_thb_intrs );
-  storage *workspace = &( p_workspace );
-
-  /* global parameters used in these calculations */
-  p_val6 = gp.l[14];
-  p_val8 = gp.l[33];
-  p_val9 = gp.l[16];
-  p_val10 = gp.l[17];
-
-  j = blockIdx.x * blockDim.x + threadIdx.x;
-  if (j >= N) return;
-	
-
-
-
-
-  //num_thb_intrs = j * THREE_BODY_OFFSET;
-  
-  //for( j = 0; j < system->N; ++j ) {
-    
-    type_j = my_atoms[j].type;
-    start_j = Dev_Start_Index(j, bonds);
-    end_j = Dev_End_Index(j, bonds);
-  
-    p_val3 = sbp[ type_j ].p_val3;
-    p_val5 = sbp[ type_j ].p_val5;
-    
-    SBOp = 0, prod_SBO = 1;
-    for( t = start_j; t < end_j; ++t ) {
-      bo_jt = &(bonds->select.bond_list[t].bo_data);
-      SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
-      temp = SQR( bo_jt->BO );
-      temp *= temp; 
-      temp *= temp;
-      prod_SBO *= EXP( -temp );
-    }
-    
-    /* modifications to match Adri's code - 09/01/09 */
-    if( workspace->vlpex[j] >= 0 ){
-      vlpadj = 0;
-      dSBO2 = prod_SBO - 1;
-    }
-    else{
-      vlpadj = workspace->nlp[j];
-      dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
-    }
-
-    SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
-    dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
-      
-    if( SBO <= 0 )
-      SBO2 = 0, CSBO2 = 0;
-    else if( SBO > 0 && SBO <= 1 ) {
-	SBO2 = POW( SBO, p_val9 );
-	CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
-    }
-    else if( SBO > 1 && SBO < 2 ) {
-      SBO2 = 2 - POW( 2-SBO, p_val9 );
-      CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
-    }
-    else 
-      SBO2 = 2, CSBO2 = 0;  
-    
-    expval6 = EXP( p_val6 * workspace->Delta_boc[j] );    
-    
-    for( pi = start_j; pi < end_j; ++pi ) {
-
-	// 	num_thb_intrs = pi * THREE_BODY_OFFSET;
-      //Dev_Set_Start_Index( pi, num_thb_intrs, thb_intrs );
-		num_thb_intrs = Dev_Start_Index (pi, thb_intrs);
-
-      pbond_ij = &(bonds->select.bond_list[pi]);
-      bo_ij = &(pbond_ij->bo_data);
-      BOA_ij = bo_ij->BO - control->thb_cut;
-      
-		//TODO REMOVE THIS
-		//TODO REMOVE THIS
-		//TODO REMOVE THIS
-		//TODO REMOVE THIS
-		//TODO REMOVE THIS
-      
-      if( BOA_ij/*bo_ij->BO*/ > 0.0 && 
-	  ( j < n || pbond_ij->nbr < n ) ) {
-//      if( BOA_ij/*bo_ij->BO*/ > 0.0) {
-	i = pbond_ij->nbr;
-	r_ij = pbond_ij->d;	 
-	type_i = my_atoms[i].type;
-	// fprintf( out_control->eval, "i: %d\n", i );
-	
-	
-	/* first copy 3-body intrs from previously computed ones where i>k.
-	   in the second for-loop below, 
-	   we compute only new 3-body intrs where i < k */
-	/*
-
-
-	// The copy loop commented out because strange asynchronous issues started to surface
-	// Each kernel now manually generates everything
-
-	
-	for( pk = start_j; pk < pi; ++pk ) {
-		
-	//printf("%d,%d \n", j, pk );
-		
-
-	  // fprintf( out_control->eval, "pk: %d\n", pk );
-	  start_pk = Dev_Start_Index( pk, thb_intrs );
-	  end_pk = Dev_End_Index( pk, thb_intrs );
-		  
-	  for( t = start_pk; t < end_pk; ++t )
-	    if( thb_intrs->select.three_body_list[t].thb == i ) {
-	      p_ijk = &(thb_intrs->select.three_body_list[num_thb_intrs] );
-	      p_kji = &(thb_intrs->select.three_body_list[t]);
-	      
-	      p_ijk->thb = bonds->select.bond_list[pk].nbr;
-	      p_ijk->pthb  = pk;
-	      p_ijk->theta = p_kji->theta;			  
-	      rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
-	      rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
-	      rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
-	      
-	      ++num_thb_intrs;
-		printf("\n");
-	      break;
-	    }
-	}
-	*/
-
-	/* and this is the second for loop mentioned above */
-	//for( pk = pi+1; pk < end_j; ++pk ) {
-	
-	// Except that now the loop goes all the way from start_j to end_j
-	for( pk = start_j; pk < end_j; ++pk ) {
-		if (pk == pi) continue;
-
-	
-	  pbond_jk = &(bonds->select.bond_list[pk]);
-	  bo_jk    = &(pbond_jk->bo_data);
-	  BOA_jk   = bo_jk->BO - control->thb_cut;
-	  k        = pbond_jk->nbr;
-	  type_k   = my_atoms[k].type;
-	  p_ijk    = &( thb_intrs->select.three_body_list[num_thb_intrs] );
-
-	  //CHANGE ORIGINAL
-	  //if ((BOA_jk <= 0) || ((j >= n) && (k >= n))) continue;
-	  if ((BOA_jk <= 0) ) continue;
-	  //CHANGE ORIGINAL
-	  
-	  Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
-			   pbond_jk->dvec, pbond_jk->d,
-			   &theta, &cos_theta );
-	  
-	  Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
-				pbond_jk->dvec, pbond_jk->d, 
-				&(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
-				&(p_ijk->dcos_dk) );
-	  p_ijk->thb = k;
-	  p_ijk->pthb = pk;
-	  p_ijk->theta = theta;
-	  
-	  sin_theta = SIN( theta );
-	  if( sin_theta < 1.0e-5 )
-	    sin_theta = 1.0e-5;
-
-	  ++num_thb_intrs;
-	 
-	  
-	  if( (j < n) && (BOA_jk > 0.0) && 
-	      (bo_ij->BO * bo_jk->BO > SQR(control->thb_cut)/*0*/) ) {
-	    r_jk = pbond_jk->d;		      
-	    thbh = &( d_thbh[ index_thbp (type_i,type_j,type_k,num_atom_types) ] );
-	    
-	    /* if( system->my_atoms[i].orig_id < system->my_atoms[k].orig_id )
-	       fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
-	       system->my_atoms[i].orig_id, 
-	       system->my_atoms[j].orig_id, 
-	       system->my_atoms[k].orig_id,
-	       bo_ij->BO, bo_jk->BO, p_ijk->theta );
-	       else 
-	       fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
-	       system->my_atoms[k].orig_id,
-	       system->my_atoms[j].orig_id, 
-	       system->my_atoms[i].orig_id, 
-	       bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
-	    
-	    for( cnt = 0; cnt < thbh->cnt; ++cnt ) {
-	      // fprintf( out_control->eval, "%6d%6d%6d -- exists in thbp\n", 
-	      //          i+1, j+1, k+1 );
-
-	      if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
-		thbp = &( thbh->prm[cnt] );			     
-		
-		/* ANGLE ENERGY */
-		p_val1 = thbp->p_val1;
-		p_val2 = thbp->p_val2;
-		p_val4 = thbp->p_val4;
-		p_val7 = thbp->p_val7;
-		theta_00 = thbp->theta_00;
-		
-		exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
-		f7_ij = 1.0 - exp3ij;
-		Cf7ij = p_val3 * p_val4 * POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
-		
-		exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
-		f7_jk = 1.0 - exp3jk;
-		Cf7jk = p_val3 * p_val4 * POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
-		
-		expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
-		trm8 = 1.0 + expval6 + expval7;
-		f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
-		Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
-		  ( p_val6 * expval6 * trm8 - 
-		    (2.0 + expval6) * ( p_val6*expval6 - p_val7*expval7 ) );
-		
-		theta_0 = 180.0 - theta_00 * (1.0 - 
-					      EXP(-p_val10 * (2.0 - SBO2)));
-		theta_0 = DEG2RAD( theta_0 );		      
-		
-		expval2theta  = EXP( -p_val2 * SQR(theta_0 - theta) );
-		if( p_val1 >= 0 ) 
-		  expval12theta = p_val1 * (1.0 - expval2theta);
-		else // To avoid linear Me-H-Me angles (6/6/06)
-		  expval12theta = p_val1 * -expval2theta;
-		
-		CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
-		CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
-		CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
-		CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * 
-		  expval2theta * (theta_0 - theta);
-		
-		Ctheta_0 = p_val10 * DEG2RAD(theta_00) * 
-		  exp( -p_val10 * (2.0 - SBO2) );
-		
-		CEval5 = -CEval4 * Ctheta_0 * CSBO2;
-		CEval6 = CEval5 * dSBO1;
-		CEval7 = CEval5 * dSBO2;
-		CEval8 = -CEval4 / sin_theta;
-		
-		if(pk<pi){
-		
-		data_e_ang [j] += e_ang = 
-		   f7_ij * f7_jk * f8_Dj * expval12theta;
-
-		}
-		/* END ANGLE ENERGY*/
-		
-		
-		/* PENALTY ENERGY */
-		p_pen1 = thbp->p_pen1;
-		p_pen2 = gp.l[19];
-		p_pen3 = gp.l[20];
-		p_pen4 = gp.l[21];
-		
-		exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
-		exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
-		exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
-		exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
-		trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
-		f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
-		Cf9j = ( -p_pen3 * exp_pen3 * trm_pen34 - 
-			 (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + 
-					      p_pen4 * exp_pen4 ) ) / 
-		  SQR( trm_pen34 );
-
-		// These if(pk<pi) are very important. They become necessary due to each kernel generating all interactions. 
-		// To prevent all these energies becoming duplicates, we only continue if pk<pi
-	
-		if(pk<pi){
-			data_e_pen [j] += e_pen = 
-			  p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
-		}
-
-		CEpen1 = e_pen * Cf9j / f9_Dj;
-		temp   = -2.0 * p_pen2 * e_pen;
-		CEpen2 = temp * (BOA_ij - 2.0);
-		CEpen3 = temp * (BOA_jk - 2.0);
-		/* END PENALTY ENERGY */
-		
-		
-		/* COALITION ENERGY */
-		p_coa1 = thbp->p_coa1;
-		p_coa2 = gp.l[2];
-		p_coa3 = gp.l[38];
-		p_coa4 = gp.l[30];
-		
-		exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
-
-		// Same here again, check if pk<pi
-		if(pk<pi){
-
-		data_e_coa [j] += e_coa = 
-		  p_coa1 / (1. + exp_coa2) *
-		  EXP( -p_coa3 * SQR(workspace->total_bond_order[i]-BOA_ij) ) *
-		  EXP( -p_coa3 * SQR(workspace->total_bond_order[k]-BOA_jk) ) *
-		  EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * 
-		  EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
-		}
-
-		CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
-		CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
-		CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2);
-		CEcoa4 = -2 * p_coa3 * 
-		  (workspace->total_bond_order[i]-BOA_ij) * e_coa;
-		CEcoa5 = -2 * p_coa3 * 
-		  (workspace->total_bond_order[k]-BOA_jk) * e_coa;
-		/* END COALITION ENERGY */
-		
-		/* FORCES */
-		// we must again check for pk<pi for entire forces part
-		if(pk<pi){
-			/*
-			bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
-			bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
-			workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3);
-			workspace->CdDelta[i] += CEcoa4;
-			workspace->CdDelta[k] += CEcoa5;		      
-			*/
-			bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
-			bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
-			workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3);
-			pbond_ij->va_CdDelta += CEcoa4;
-			pbond_jk->va_CdDelta += CEcoa5;
-
-		
-			for( t = start_j; t < end_j; ++t ) {
-			    pbond_jt = &( bonds->select.bond_list[t] );
-			    bo_jt = &(pbond_jt->bo_data);
-			    temp_bo_jt = bo_jt->BO;
-			    temp = CUBE( temp_bo_jt );
-			    pBOjt7 = temp * temp * temp_bo_jt; 
-			    
-			    // fprintf( out_control->eval, "%6d%12.8f\n", 
-			    // workspace->reverse_map[bonds->select.bond_list[t].nbr],
-			    // (CEval6 * pBOjt7) );
-			    
-			    bo_jt->Cdbo += (CEval6 * pBOjt7);
-			    bo_jt->Cdbopi += CEval5;
-			    bo_jt->Cdbopi2 += CEval5;
-			}		      
-		
-		
-			if( control->virial == 0 ) {
-			/*
-			  rvec_ScaledAdd( workspace->f[i], CEval8, p_ijk->dcos_di );
-			  rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
-			  rvec_ScaledAdd( workspace->f[k], CEval8, p_ijk->dcos_dk );
-			 */
-
-			  rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di );
-			  rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
-			  rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk );
-			}
-			else {
-			  /* terms not related to bond order derivatives are
-			     added directly into forces and pressure vector/tensor */
-			  rvec_Scale( force, CEval8, p_ijk->dcos_di );
-			  //rvec_Add( workspace->f[i], force );
-			  rvec_Add( pbond_ij->va_f, force );
-			  rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-			  //rvec_Add( data->my_ext_press, ext_press );
-			  rvec_Add( my_ext_press [j], ext_press );
-		  
-			  rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
-		  
-			  rvec_Scale( force, CEval8, p_ijk->dcos_dk );
-			  //rvec_Add( workspace->f[k], force );
-			  rvec_Add( pbond_jk->va_f, force );
-			  rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-			  rvec_Add( my_ext_press [j], ext_press );
-			}
-		}
-#ifdef TEST_ENERGY
-		/*fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n",
-		  p_val3, p_val4, BOA_ij, BOA_jk );
-		fprintf(out_control->eval, "%13.8f%13.8f%13.8f%13.8f%13.8f\n",
-			workspace->Delta_e[j], workspace->vlpex[j],
-			dSBO1, dSBO2, vlpadj );
-		fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n",
-			 f7_ij, f7_jk, f8_Dj, expval12theta );
-		fprintf( out_control->eval, 
-			 "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-			 CEval1, CEval2, CEval3, CEval4, 
-			 CEval5, CEval6, CEval7, CEval8 );
-		
-		fprintf( out_control->eval, 
-		"%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-		   p_ijk->dcos_di[0]/sin_theta, p_ijk->dcos_di[1]/sin_theta,
-		   p_ijk->dcos_di[2]/sin_theta, 
-		   p_ijk->dcos_dj[0]/sin_theta, p_ijk->dcos_dj[1]/sin_theta,
-		   p_ijk->dcos_dj[2]/sin_theta, 
-		   p_ijk->dcos_dk[0]/sin_theta, p_ijk->dcos_dk[1]/sin_theta,
-		   p_ijk->dcos_dk[2]/sin_theta);
-		
-		fprintf( out_control->eval, 
-			 "%6d%6d%6d%15.8f%15.8f\n",
-			 system->my_atoms[i].orig_id, 
-			 system->my_atoms[j].orig_id, 
-			 system->my_atoms[k].orig_id,
-			 RAD2DEG(theta), e_ang );*/
-
-		fprintf( out_control->eval, 
-		//"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-			 "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-			 system->my_atoms[i].orig_id, 
-			 system->my_atoms[j].orig_id, 
-			 system->my_atoms[k].orig_id,
-			 RAD2DEG(theta), theta_0, BOA_ij, BOA_jk,
-			 e_ang, data->my_en.e_ang );
-		
-		fprintf( out_control->epen, 
-			 //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-			 "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-			 system->my_atoms[i].orig_id,
-			 system->my_atoms[j].orig_id,
-			 system->my_atoms[k].orig_id,
-			 RAD2DEG(theta), BOA_ij, BOA_jk, e_pen, 
-			 data->my_en.e_pen );
-		
-		fprintf( out_control->ecoa, 
-			 //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-			 "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-			 system->my_atoms[i].orig_id, 
-			 system->my_atoms[j].orig_id, 
-			 system->my_atoms[k].orig_id,
-			 RAD2DEG(theta), BOA_ij, BOA_jk, 
-			 e_coa, data->my_en.e_coa );
-#endif
-
-#ifdef TEST_FORCES            /* angle forces */
-		Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
-		Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
-		Add_dDelta( system, lists, j, 
-			    CEval3 + CEval7, workspace->f_ang );
-		
-		for( t = start_j; t < end_j; ++t ) {
-		  pbond_jt = &( bonds->select.bond_list[t] );
-		  bo_jt = &(pbond_jt->bo_data);
-		  temp_bo_jt = bo_jt->BO;
-		  temp = CUBE( temp_bo_jt );
-		  pBOjt7 = temp * temp * temp_bo_jt; 
-		  
-		  Add_dBO( system, lists, j, t, pBOjt7 * CEval6, 
-			   workspace->f_ang );
-		  Add_dBOpinpi2( system, lists, j, t, CEval5, CEval5, 
-				 workspace->f_ang, workspace->f_ang );
-		}
-		
-		rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
-		rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
-		rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
-		/* end angle forces */
-		
-		/* penalty forces */
-		Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
-		Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
-		Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
-		/* end penalty forces */
-		
-		/* coalition forces */
-		Add_dBO( system, lists, j, pi, CEcoa1 - CEcoa4, 
-			 workspace->f_coa );
-		Add_dBO( system, lists, j, pk, CEcoa2 - CEcoa5, 
-			 workspace->f_coa );
-		Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
-		Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
-		Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
-		/* end coalition forces */
-#endif
-	      }
-	    }
-	  }
-	}
-      }
-      //printf("\n"); 
-      Dev_Set_End_Index(pi, num_thb_intrs, thb_intrs );
-    }
-//	if(j==0){
-  //      printf("%d\n",thb_intrs->num_intrs);
- // }
-
-  // } CUDA Commented
-}
-
-CUDA_GLOBAL void Cuda_Valence_Angles_PostProcess (   reax_atom *atoms, control_params *control,
-                                                   storage p_workspace, 
-                                                   reax_list p_bonds, int N )
-{
-  int i, pj;
-
-  bond_data *pbond;
-  bond_data *sym_index_bond;
-  reax_list *bonds = &p_bonds;
-  storage *workspace = &p_workspace;
-
-  i = blockIdx.x * blockDim.x + threadIdx.x;
-  if ( i >= N) return;
-
-  for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){
-
-      pbond = &(bonds->select.bond_list[pj]);
-      sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); 
-
-      workspace->CdDelta [i] += sym_index_bond->va_CdDelta;
-
-      //rvec_Add (atoms[i].f, sym_index_bond->va_f );
-      rvec_Add (workspace->f[i], sym_index_bond->va_f );
-  }
-}
-
-
-// THREE BODY ESTIMATION HERE
-CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms, 
-													control_params *control, 
-													reax_list p_bonds, 
-													int n, int N, 
-													int *count
-													)
-{
-  int i, j, pi, k, pk, t;
-  int type_i, type_j, type_k;
-  int start_j, end_j;
-  int cnt, num_thb_intrs;
-
-  real r_ij, r_jk;
-  real BOA_ij, BOA_jk;
-
-  bond_data *pbond_ij, *pbond_jk, *pbond_jt;
-  bond_order_data *bo_ij, *bo_jk, *bo_jt;
-
-  reax_list *bonds = &( p_bonds );
-
-  j = blockIdx.x * blockDim.x + threadIdx.x;
-  if (j >= N) return;
-	//printf("\n");
-    type_j = my_atoms[j].type;
-    start_j = Dev_Start_Index(j, bonds);
-    end_j = Dev_End_Index(j, bonds);
-    
-
-    for( pi = start_j; pi < end_j; ++pi ) {
-
-	 	num_thb_intrs = 0;
-		count[ pi ] = 0;
-
-      pbond_ij = &(bonds->select.bond_list[pi]);
-      bo_ij = &(pbond_ij->bo_data);
-      BOA_ij = bo_ij->BO - control->thb_cut;
-      
-      
-      if( BOA_ij/*bo_ij->BO*/ > 0.0 && 
-	  ( j < n || pbond_ij->nbr < n ) ) {
-      //if( BOA_ij/*bo_ij->BO*/ > 0.0) {
-	i = pbond_ij->nbr;
-	r_ij = pbond_ij->d;	 
-	type_i = my_atoms[i].type;
-	
-	
-	for( pk = start_j; pk < end_j; ++pk ) {
-		if (pk == pi) continue;
-
-	  pbond_jk = &(bonds->select.bond_list[pk]);
-	  bo_jk    = &(pbond_jk->bo_data);
-	  BOA_jk   = bo_jk->BO - control->thb_cut;
-
-	  //CHANGE ORIGINAL
-	  //if ((BOA_jk <= 0) || ((j >= n) && (k >= n))) continue;
-	  if ((BOA_jk <= 0) ) continue;
-	  //CHANGE ORIGINAL
-	  
-	  ++num_thb_intrs;
-	  
-	}
-
-      }
-      
-		count[ pi ] = num_thb_intrs;
-    }
-}
-
diff --git a/PG-PuReMD/src/dev_alloc.cu b/PG-PuReMD/src/dev_alloc.cu
deleted file mode 100644
index b0a76a2168a4213adc4be86ba3b62f3a72fb1b65..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/dev_alloc.cu
+++ /dev/null
@@ -1,410 +0,0 @@
-
-#include "dev_alloc.h"
-#include "cuda_utils.h"
-
-#include "vector.h"
-
-extern "C"
-{
-
-    int dev_alloc_control (control_params *control)
-    {
-        cuda_malloc ((void **)&control->d_control_params, sizeof (control_params), 1, "control_params");
-        copy_host_device (control, control->d_control_params, sizeof (control_params), cudaMemcpyHostToDevice, "control_params");
-    }
-
-    CUDA_GLOBAL void Init_Nbrs(ivec *nbrs, int N)
-    {
-        int index = blockIdx.x * blockDim.x + threadIdx.x;
-        if (index >= N) return;
-
-        nbrs[index][0] = -1; 
-        nbrs[index][1] = -1; 
-        nbrs[index][2] = -1; 
-    }
-
-
-    int dev_alloc_grid (reax_system *system)
-    {
-        int total;
-        grid_cell local_cell;
-        grid *host = &system->my_grid;
-        grid *device = &system->d_my_grid;
-        ivec *nbrs_x = (ivec *) scratch;
-
-        total = host->ncells[0] * host->ncells[1] * host->ncells[2];
-        ivec_Copy (device->ncells, host->ncells);
-        rvec_Copy (device->cell_len, host->cell_len);
-        rvec_Copy (device->inv_len, host->inv_len);
-
-        ivec_Copy (device->bond_span, host->bond_span );
-        ivec_Copy (device->nonb_span, host->nonb_span );
-        ivec_Copy (device->vlist_span, host->vlist_span );
-
-        ivec_Copy (device->native_cells, host->native_cells );
-        ivec_Copy (device->native_str, host->native_str );
-        ivec_Copy (device->native_end, host->native_end );
-
-        device->ghost_cut = host->ghost_cut;
-        ivec_Copy (device->ghost_span, host->ghost_span );
-        ivec_Copy (device->ghost_nonb_span, host->ghost_nonb_span );
-        ivec_Copy (device->ghost_hbond_span, host->ghost_hbond_span );
-        ivec_Copy (device->ghost_bond_span, host->ghost_bond_span );
-
-        cuda_malloc ((void **) &device->str, sizeof (int) * total, 1, "grid:str");
-        cuda_malloc ((void **) &device->end, sizeof (int) * total, 1, "grid:end");
-        cuda_malloc ((void **) &device->cutoff, sizeof (real) * total, 1, "grid:cutoff");
-        cuda_malloc ((void **) &device->nbrs_x, sizeof (ivec) * total * host->max_nbrs, 1, "grid:nbrs_x");
-        cuda_malloc ((void **) &device->nbrs_cp, sizeof (rvec) * total * host->max_nbrs, 1, "grid:nbrs_cp");
-        cuda_malloc ((void **) &device->rel_box, sizeof (ivec) * total, 1, "grid:rel_box");
-
-        /*
-           int block_size = 512;
-           int blocks = (host->max_nbrs) / block_size + ((host->max_nbrs) % block_size == 0 ? 0 : 1); 
-
-           Init_Nbrs <<<blocks, block_size>>>
-           (nbrs_x, host->max_nbrs );
-           cudaThreadSynchronize (); 
-           cudaCheckError ();
-
-           cuda_malloc ((void **)& device->cells, 
-           sizeof (grid_cell) * total, 
-           1, "grid:cells");
-           fprintf (stderr, " Device cells address --> %ld \n", device->cells );
-           cuda_malloc ((void **) &device->order, sizeof (ivec) * (host->total + 1), 1, "grid:order");
-
-           local_cell.top = local_cell.mark = local_cell.str = local_cell.end = 0;
-           fprintf (stderr, "Total cells to be allocated -- > %d \n", total );
-           for (int i = 0; i < total; i++) {
-        //fprintf (stderr, "Address of the local atom -> %ld  \n", &local_cell);
-
-        cuda_malloc ((void **) &local_cell.atoms, sizeof (int) * host->max_atoms, 
-        1, "alloc:grid:cells:atoms");
-        //fprintf (stderr, "Allocated address of the atoms --> %ld  (%d)\n", local_cell.atoms, host->max_atoms );
-
-        cuda_malloc ((void **) &local_cell.nbrs_x, sizeof (ivec) * host->max_nbrs, 
-        1, "alloc:grid:cells:nbrs_x" );
-        copy_device (local_cell.nbrs_x, nbrs_x, host->max_nbrs * sizeof (ivec), "grid:nbrs_x");    
-        //fprintf (stderr, "Allocated address of the nbrs_x--> %ld \n", local_cell.nbrs_x);
-
-        cuda_malloc ((void **) &local_cell.nbrs_cp, sizeof (rvec) * host->max_nbrs, 
-        1, "alloc:grid:cells:nbrs_cp" );
-        //fprintf (stderr, "Allocated address of the nbrs_cp--> %ld \n", local_cell.nbrs_cp);
-
-        //cuda_malloc ((void **) &local_cell.nbrs, sizeof (grid_cell *) * host->max_nbrs , 
-        //                1, "alloc:grid:cells:nbrs" );
-        //fprintf (stderr, "Allocated address of the nbrs--> %ld \n", local_cell.nbrs);
-
-        copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), cudaMemcpyHostToDevice, "grid:cell-alloc");
-        }
-         */
-
-        return SUCCESS;
-    }
-
-    int dev_dealloc_grid_cell_atoms (reax_system *system)
-    {
-        int total;
-        grid_cell local_cell;
-        grid *host = &system->my_grid;
-        grid *device = &system->d_my_grid;
-
-        total = host->ncells[0] * host->ncells[1] * host->ncells[2];
-
-
-        for (int i = 0; i < total; i++) {
-            copy_host_device (&local_cell, &device->cells[i], 
-                    sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc");
-            cuda_free (local_cell.atoms, "grid_cell:atoms" );
-        }
-    }
-
-    int dev_alloc_grid_cell_atoms (reax_system *system, int cap)
-    {
-        int total;
-        grid_cell local_cell;
-        grid *host = &system->my_grid;
-        grid *device = &system->d_my_grid;
-
-        total = host->ncells[0] * host->ncells[1] * host->ncells[2];
-
-        for (int i = 0; i < total; i++) {
-            copy_host_device (&local_cell, &device->cells[i], 
-                    sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc");
-            cuda_malloc ((void **) &local_cell.atoms, sizeof (int) * cap, 
-                    1, "realloc:grid:cells:atoms");
-            copy_host_device (&local_cell, &device->cells[i], 
-                    sizeof (grid_cell), cudaMemcpyHostToDevice, "grid:cell-realloc");
-        }
-    }
-
-
-    int dev_alloc_system (reax_system *system)
-    {
-        cuda_malloc ( (void **) &system->d_my_atoms, system->total_cap * sizeof (reax_atom), 1, "system:d_my_atoms");  
-        //fprintf (stderr, "p:%d - allocated atoms : %d (%ld, %ld) \n", system->my_rank, system->total_cap, 
-        //                                                                                    system->my_atoms, system->d_my_atoms);
-
-        //simulation boxes
-        cuda_malloc ( (void **) &system->d_big_box, sizeof (simulation_box), 1, "system:d_big_box");
-        cuda_malloc ( (void **) &system->d_my_box, sizeof (simulation_box), 1, "system:d_my_box");
-        cuda_malloc ( (void **) &system->d_my_ext_box, sizeof (simulation_box), 1, "d_my_ext_box");
-
-        //interaction parameters
-        cuda_malloc ((void **) &system->reax_param.d_sbp, system->reax_param.num_atom_types * sizeof (single_body_parameters),
-                1, "system:d_sbp");
-
-        cuda_malloc ((void **) &system->reax_param.d_tbp, pow (system->reax_param.num_atom_types, 2) * sizeof (two_body_parameters), 
-                1, "system:d_tbp");
-
-        cuda_malloc ((void **) &system->reax_param.d_thbp, pow (system->reax_param.num_atom_types, 3) * sizeof (three_body_header),
-                1, "system:d_thbp");
-
-        cuda_malloc ((void **) &system->reax_param.d_hbp, pow (system->reax_param.num_atom_types, 3) * sizeof (hbond_parameters),
-                1, "system:d_hbp");
-
-        cuda_malloc ((void **) &system->reax_param.d_fbp, pow (system->reax_param.num_atom_types, 4) * sizeof (four_body_header),
-                1, "system:d_fbp");
-
-        cuda_malloc ((void **) &system->reax_param.d_gp.l, system->reax_param.gp.n_global * sizeof (real), 1, "system:d_gp.l");
-
-        system->reax_param.d_gp.n_global = 0;
-        system->reax_param.d_gp.vdw_type = 0;
-
-        return SUCCESS;
-    }
-
-    int dev_realloc_system (reax_system *system, int local_cap, int total_cap, char *msg)
-    {
-        //free the existing storage for atoms
-        cuda_free (system->d_my_atoms, "system:d_my_atoms");
-
-        cuda_malloc ((void **) &system->d_my_atoms, sizeof (reax_atom) * total_cap, 
-                1, "system:d_my_atoms");
-        return FAILURE;
-    }
-
-
-    int dev_alloc_simulation_data(simulation_data *data)
-    {
-        cuda_malloc ((void **) &(data->d_simulation_data), sizeof (simulation_data), 1, "simulation_data");
-        return SUCCESS;
-    }
-
-    int dev_alloc_workspace (reax_system *system, control_params *control, 
-            storage *workspace, int local_cap, int total_cap, 
-            char *msg)
-    {
-        int i, total_real, total_rvec, local_int, local_real, local_rvec;
-
-        workspace->allocated = 1;
-        total_real = total_cap * sizeof(real);
-        total_rvec = total_cap * sizeof(rvec);
-        local_int = local_cap * sizeof(int);
-        local_real = local_cap * sizeof(real);
-        local_rvec = local_cap * sizeof(rvec);
-
-        /* communication storage */  
-        /*
-           workspace->tmp_dbl = NULL;
-           workspace->tmp_rvec = NULL;
-           workspace->tmp_rvec2 = NULL;
-         */
-
-        //fprintf (stderr, "Deltap and TOTAL BOND ORDER size --> %d \n", total_cap );
-
-        /* bond order related storage  */
-        cuda_malloc ((void **) &workspace->within_bond_box, total_cap * sizeof (int), 1, "skin");
-        cuda_malloc ((void **) &workspace->total_bond_order, total_real, 1, "total_bo");
-        cuda_malloc ((void **) &workspace->Deltap, total_real, 1, "Deltap");
-        cuda_malloc ((void **) &workspace->Deltap_boc, total_real, 1, "Deltap_boc");
-        cuda_malloc ((void **) &workspace->dDeltap_self, total_rvec, 1, "dDeltap_self");
-        cuda_malloc ((void **) &workspace->Delta, total_real, 1, "Delta" );
-        cuda_malloc ((void **) &workspace->Delta_lp, total_real, 1, "Delta_lp" );
-        cuda_malloc ((void **) &workspace->Delta_lp_temp, total_real, 1, "Delta_lp_temp" );
-        cuda_malloc ((void **) &workspace->dDelta_lp, total_real, 1, "Delta_lp_temp" );
-        cuda_malloc ((void **) &workspace->dDelta_lp_temp, total_real, 1, "dDelta_lp_temp" );
-        cuda_malloc ((void **) &workspace->Delta_e, total_real, 1, "Delta_e" );
-        cuda_malloc ((void **) &workspace->Delta_boc, total_real, 1, "Delta_boc");
-        cuda_malloc ((void **) &workspace->nlp, total_real, 1, "nlp");
-        cuda_malloc ((void **) &workspace->nlp_temp, total_real, 1, "nlp_temp");
-        cuda_malloc ((void **) &workspace->Clp, total_real, 1, "Clp");
-        cuda_malloc ((void **) &workspace->vlpex, total_real, 1, "vlpex");
-        cuda_malloc ((void **) &workspace->bond_mark, total_real, 1, "bond_mark");
-        cuda_malloc ((void **) &workspace->done_after, total_real, 1, "done_after");
-
-
-        /* QEq storage */
-        cuda_malloc ((void **) &workspace->Hdia_inv, total_cap * sizeof (real), 1, "Hdia_inv");
-        cuda_malloc ((void **) &workspace->b_s, total_cap * sizeof (real), 1, "b_s");
-        cuda_malloc ((void **) &workspace->b_t, total_cap * sizeof (real), 1, "b_t");
-        cuda_malloc ((void **) &workspace->b_prc, total_cap * sizeof (real), 1, "b_prc");
-        cuda_malloc ((void **) &workspace->b_prm, total_cap * sizeof (real), 1, "b_prm");
-        cuda_malloc ((void **) &workspace->s, total_cap * sizeof (real), 1, "s");
-        cuda_malloc ((void **) &workspace->t, total_cap * sizeof (real), 1, "t");
-        cuda_malloc ((void **) &workspace->droptol, total_cap * sizeof (real), 1, "droptol");
-        cuda_malloc ((void **) &workspace->b, total_cap * sizeof (rvec2), 1, "b");
-        cuda_malloc ((void **) &workspace->x, total_cap * sizeof (rvec2), 1, "x");
-
-        /* GMRES storage */
-        cuda_malloc ((void **) &workspace->y, (RESTART+1)*sizeof (real), 1, "y");
-        cuda_malloc ((void **) &workspace->z, (RESTART+1)*sizeof (real), 1, "z");
-        cuda_malloc ((void **) &workspace->g, (RESTART+1)*sizeof (real), 1, "g");
-        cuda_malloc ((void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof (real), 1, "h");
-        cuda_malloc ((void **) &workspace->hs, (RESTART+1)*sizeof (real), 1, "hs");
-        cuda_malloc ((void **) &workspace->hc, (RESTART+1)*sizeof (real), 1, "hc");
-        cuda_malloc ((void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof (real), 1, "v");
-
-        /* CG storage */
-        cuda_malloc ((void **) &workspace->r, total_cap * sizeof (real), 1,  "r");
-        cuda_malloc ((void **) &workspace->d, total_cap * sizeof (real), 1, "d");
-        cuda_malloc ((void **) &workspace->q, total_cap * sizeof (real), 1, "q");
-        cuda_malloc ((void **) &workspace->p, total_cap * sizeof (real), 1, "p");
-        cuda_malloc ((void **) &workspace->r2, total_cap * sizeof (rvec2), 1, "r2");
-        cuda_malloc ((void **) &workspace->d2, total_cap * sizeof (rvec2), 1, "d2");
-        cuda_malloc ((void **) &workspace->q2, total_cap * sizeof (rvec2), 1, "q2");
-        cuda_malloc ((void **) &workspace->p2, total_cap * sizeof (rvec2), 1, "p2");
-
-        /* integrator storage */
-        cuda_malloc ((void **) &workspace->v_const, local_rvec, 1, "v_const");
-
-        /* storage for analysis */
-        if( control->molecular_analysis || control->diffusion_coef ) {
-            cuda_malloc ((void **) &workspace->mark, local_cap * sizeof (int), 1, "mark");
-            cuda_malloc ((void **) &workspace->old_mark, local_cap * sizeof (int), 1, "old_mark");
-        }
-        else
-            workspace->mark = workspace->old_mark = NULL;
-
-        if( control->diffusion_coef )
-            cuda_malloc ((void **) &workspace->x_old, local_cap * sizeof (rvec), 1, "x_old");
-        else
-            workspace->x_old = NULL;
-
-        /* force related storage */
-        cuda_malloc ((void **) &workspace->f, total_cap * sizeof (rvec), 1, "f");
-        cuda_malloc ((void **) &workspace->CdDelta, total_cap * sizeof (rvec), 1, "CdDelta");
-
-        /* Taper params */
-        cuda_malloc ((void **) &workspace->Tap, 8 * sizeof (real), 1, "Tap");
-
-        return SUCCESS;
-    }
-
-    int dev_dealloc_workspace (reax_system *system, control_params *control, 
-            storage *workspace, int local_cap, int total_cap, 
-            char *msg)
-    {
-        /* communication storage */  
-        /*
-           workspace->tmp_dbl = NULL;
-           workspace->tmp_rvec = NULL;
-           workspace->tmp_rvec2 = NULL;
-         */
-
-        /* bond order related storage  */
-        cuda_free (workspace->within_bond_box, "skin");
-        cuda_free (workspace->total_bond_order, "total_bo");
-        cuda_free (workspace->Deltap, "Deltap");
-        cuda_free (workspace->Deltap_boc, "Deltap_boc");
-        cuda_free (workspace->dDeltap_self, "dDeltap_self");
-        cuda_free (workspace->Delta, "Delta" );
-        cuda_free (workspace->Delta_lp, "Delta_lp" );
-        cuda_free (workspace->Delta_lp_temp, "Delta_lp_temp" );
-        cuda_free (workspace->dDelta_lp, "Delta_lp_temp" );
-        cuda_free (workspace->dDelta_lp_temp, "dDelta_lp_temp" );
-        cuda_free (workspace->Delta_e, "Delta_e" );
-        cuda_free (workspace->Delta_boc, "Delta_boc");
-        cuda_free (workspace->nlp, "nlp");
-        cuda_free (workspace->nlp_temp, "nlp_temp");
-        cuda_free (workspace->Clp, "Clp");
-        cuda_free (workspace->vlpex, "vlpex");
-        cuda_free (workspace->bond_mark, "bond_mark");
-        cuda_free (workspace->done_after, "done_after");
-
-        /* QEq storage */
-        cuda_free (workspace->Hdia_inv, "Hdia_inv");
-        cuda_free (workspace->b_s, "b_s");
-        cuda_free (workspace->b_t, "b_t");
-        cuda_free (workspace->b_prc, "b_prc");
-        cuda_free (workspace->b_prm, "b_prm");
-        cuda_free (workspace->s, "s");
-        cuda_free (workspace->t, "t");
-        cuda_free (workspace->droptol, "droptol");
-        cuda_free (workspace->b, "b");
-        cuda_free (workspace->x, "x");
-
-        /* GMRES storage */
-        cuda_free (workspace->y, "y");
-        cuda_free (workspace->z, "z");
-        cuda_free (workspace->g, "g");
-        cuda_free (workspace->h, "h");
-        cuda_free (workspace->hs, "hs");
-        cuda_free (workspace->hc, "hc");
-        cuda_free (workspace->v, "v");
-
-        /* CG storage */
-        cuda_free (workspace->r, "r");
-        cuda_free (workspace->d, "d");
-        cuda_free (workspace->q, "q");
-        cuda_free (workspace->p, "p");
-        cuda_free (workspace->r2, "r2");
-        cuda_free (workspace->d2, "d2");
-        cuda_free (workspace->q2, "q2");
-        cuda_free (workspace->p2, "p2");
-
-        /* integrator storage */
-        cuda_free (workspace->v_const, "v_const");
-
-        /* storage for analysis */
-        if( control->molecular_analysis || control->diffusion_coef ) {
-            cuda_free (workspace->mark, "mark");
-            cuda_free (workspace->old_mark, "old_mark");
-        }
-        else
-            workspace->mark = workspace->old_mark = NULL;
-
-        if( control->diffusion_coef )
-            cuda_free (workspace->x_old, "x_old");
-        else
-            workspace->x_old = NULL;
-
-        /* force related storage */
-        cuda_free (workspace->f, "f");
-        cuda_free (workspace->CdDelta, "CdDelta");
-
-        /* Taper params */
-        cuda_free (workspace->Tap, "Tap");
-
-        return FAILURE;
-    }
-
-
-
-
-    int dev_alloc_matrix (sparse_matrix *H, int cap, int m)
-    {
-        //sparse_matrix *H;
-        //H = *pH;
-
-        H->cap = cap;
-        H->m = m;
-        cuda_malloc ((void **) &H->start, sizeof (int) * cap, 1, "matrix_start");
-        cuda_malloc ((void **) &H->end, sizeof (int) * cap, 1, "matrix_end");
-        cuda_malloc ((void **) &H->entries, sizeof (sparse_matrix_entry) * m, 1, "matrix_entries");
-
-        return SUCCESS;
-    }
-
-    int dev_dealloc_matrix (sparse_matrix *H)
-    {
-        cuda_free (H->start, "matrix_start");
-        cuda_free (H->end, "matrix_end");
-        cuda_free (H->entries, "matrix_entries");
-
-        return SUCCESS;
-    }
-
-
-}
-
diff --git a/PG-PuReMD/src/dev_alloc.h b/PG-PuReMD/src/dev_alloc.h
deleted file mode 100644
index 95884933a92c659e89d61aa36508d8bf3dc42ba9..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/dev_alloc.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef __DEV_ALLOC_H_
-#define __DEV_ALLOC_H_
-
-#include "reax_types.h"
-
-#ifdef __cplusplus
-extern "C"  {
-#endif
-
-
-int dev_alloc_system (reax_system *);
-int dev_alloc_grid (reax_system *);
-int dev_alloc_simulation_data (simulation_data *);
-int dev_alloc_workspace (reax_system *, control_params *, storage *, int, int, char *);
-int dev_alloc_matrix (sparse_matrix *, int, int);
-int dev_alloc_control (control_params *);
-
-int dev_dealloc_grid_cell_atoms (reax_system *);
-int dev_alloc_grid_cell_atoms (reax_system *, int );
-int dev_realloc_system (reax_system *, int , int , char *);
-int dev_dealloc_workspace (reax_system *, control_params *,
-                           storage *, int , int , char *);
-int dev_dealloc_matrix (sparse_matrix *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/dev_list.cu b/PG-PuReMD/src/dev_list.cu
deleted file mode 100644
index 7453fc8e599f0e1176de7d41fb7abf6db181c2d1..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/dev_list.cu
+++ /dev/null
@@ -1,112 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "reax_types.h"
-#include "cuda_utils.h"
-
-#if defined(PURE_REAX)
-#include "list.h"
-#include "tool_box.h"
-#elif defined(LAMMPS_REAX)
-#include "reax_list.h"
-#include "reax_tool_box.h"
-#endif
-
-extern "C" {
-
-
-    /************* allocate list space ******************/
-    int Dev_Make_List(int n, int num_intrs, int type, reax_list *l)
-    {
-        l->allocated = 1;
-
-        l->n = n;
-        l->num_intrs = num_intrs;
-
-        cuda_malloc ((void **) &l->index, n * sizeof (int), 1, "list:index");
-        cuda_malloc ((void **) &l->end_index, n * sizeof (int), 1, "list:end_index");
-
-        l->type = type;
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "list: n=%d num_intrs=%d type=%d\n", n, num_intrs, type );
-#endif
-
-        switch(l->type) {
-
-            case TYP_FAR_NEIGHBOR:
-                cuda_malloc ((void **) &l->select.far_nbr_list, 
-                        l->num_intrs * sizeof (far_neighbor_data), 1, "list:far_nbrs");
-                break;
-
-            case TYP_THREE_BODY:
-                cuda_malloc ((void **) &l->select.three_body_list,
-                        l->num_intrs * sizeof (three_body_interaction_data), 1, 
-                        "list:three_bodies" );
-                break;
-
-            case TYP_HBOND:
-                cuda_malloc ((void **) &l->select.hbond_list, 
-                        l->num_intrs * sizeof(hbond_data), 1, "list:hbonds" );
-                break;            
-
-            case TYP_BOND:
-                cuda_malloc ((void **) &l->select.bond_list,
-                        l->num_intrs * sizeof(bond_data), 1, "list:bonds" );
-                break;
-
-            default:
-                fprintf( stderr, "ERROR: no %d list type defined!\n", l->type );
-                MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
-        }
-
-        return SUCCESS;
-    }
-
-
-    void Dev_Delete_List( reax_list *l)
-    {
-        if( l->allocated == 0 )
-            return;
-        l->allocated = 0;
-
-        cuda_free ( l->index, "index");
-        cuda_free ( l->end_index, "end_index" );
-
-        switch (l->type) {
-            case TYP_HBOND:
-                cuda_free( l->select.hbond_list, "list:hbonds" );
-                break;
-            case TYP_FAR_NEIGHBOR:
-                cuda_free( l->select.far_nbr_list, "list:far_nbrs" );
-                break;
-            case TYP_BOND:
-                cuda_free( l->select.bond_list, "list:bonds" );
-                break;
-            case TYP_THREE_BODY:
-                cuda_free( l->select.three_body_list, "list:three_bodies" );
-                break;
-            default:
-                fprintf (stderr, "ERROR no %d list type defined !\n", l->type);
-                MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
-        }
-    }
-
-}
diff --git a/PG-PuReMD/src/dev_system_props.cu b/PG-PuReMD/src/dev_system_props.cu
deleted file mode 100644
index fdb3a567e4c34ef033ee6f7e1ee7e02934409077..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/dev_system_props.cu
+++ /dev/null
@@ -1,368 +0,0 @@
-
-#include "dev_system_props.h"
-
-#include "reduction.h"
-#include "cuda_utils.h"
-#include "center_mass.h"
-#include "cuda_copy.h"
-
-#include "vector.h"
-#include "cuda_shuffle.h"
-
-CUDA_GLOBAL void k_compute_total_mass (single_body_parameters *sbp, reax_atom *my_atoms, 
-        real *block_results, int n)
-{
-#if defined(__SM_35__)
-
-    extern __shared__ real my_sbp[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real    sdata = 0;
-
-    if (i < n)
-        sdata = sbp [ my_atoms [i].type ].mass;
-    __syncthreads ();
-
-    for(int z = 16; z >=1; z/=2)
-        sdata += shfl ( sdata, z);
-
-    if (threadIdx.x % 32 == 0)
-        my_sbp[threadIdx.x >> 5] = sdata;
-
-    __syncthreads ();
-
-    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
-        if(threadIdx.x < offset)
-            my_sbp[threadIdx.x] += my_sbp[threadIdx.x + offset];
-
-        __syncthreads();
-    }
-
-    if(threadIdx.x == 0)
-        block_results[blockIdx.x] = my_sbp[0];
-
-
-#else
-
-    extern __shared__ real sdata [];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real    x = 0;
-
-    if (i < n)
-        x = sbp [ my_atoms [i].type ].mass;
-
-    sdata[ threadIdx.x ] = x;
-    __syncthreads ();
-
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset)
-            sdata [threadIdx.x] += sdata [threadIdx.x + offset];
-
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0)
-        block_results[ blockIdx.x] = sdata [0];
-
-#endif
-}
-
-extern "C" void dev_compute_total_mass (reax_system *system, real *local_val)
-{
-    real *block_mass = (real *) scratch;
-    cuda_memset (block_mass, 0, sizeof (real) * (1 + BLOCKS_POW_2), "total_mass:tmp");
-
-    k_compute_total_mass <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, block_mass, system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
-        (block_mass, block_mass + BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    copy_host_device (local_val, block_mass + BLOCKS_POW_2, sizeof (real), 
-            cudaMemcpyDeviceToHost, "total_mass:tmp");
-}
-
-CUDA_GLOBAL void k_compute_kinetic_energy (single_body_parameters *sbp, reax_atom *my_atoms, 
-        real *block_results, int n)
-{
-
-#if defined(__SM_35__)
-
-    extern __shared__ real my_sbpdot[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real    sdata = 0;
-    rvec p;
-
-    if (i < n) {
-        sdata = sbp [ my_atoms [i].type ].mass;
-        rvec_Scale( p, sdata, my_atoms[ i ].v );
-        sdata = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
-    }
-
-    __syncthreads ();
-
-    for(int z = 16; z >=1; z/=2)
-        sdata += shfl ( sdata, z);
-
-    if (threadIdx.x % 32 == 0)
-        my_sbpdot[threadIdx.x >> 5] = sdata;
-
-    __syncthreads ();
-
-    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset)
-            my_sbpdot[threadIdx.x] += my_sbpdot[threadIdx.x + offset];
-
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0)
-        block_results[ blockIdx.x] = my_sbpdot[0];
-
-#else
-
-
-    extern __shared__ real sdata [];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real    m = 0;
-    rvec p;
-
-    if (i < n) {
-        m = sbp [ my_atoms [i].type ].mass;
-        rvec_Scale( p, m, my_atoms[ i ].v );
-        m = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
-    }
-
-    sdata[ threadIdx.x ] = m;
-    __syncthreads ();
-
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset)
-            sdata [threadIdx.x] += sdata [threadIdx.x + offset];
-
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0)
-        block_results[ blockIdx.x] = sdata [0];
-
-#endif
-}
-
-extern "C" void dev_compute_kinetic_energy (reax_system *system, simulation_data *data, real *local_val)
-{
-    real *block_energy = (real *) scratch;
-    cuda_memset (block_energy, 0, sizeof (real) * (BLOCKS_POW_2 + 1), "kinetic_energy:tmp");
-
-    k_compute_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
-        (block_energy, block_energy + BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    copy_host_device (local_val, block_energy + BLOCKS_POW_2,
-            //copy_host_device (local_val, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, 
-            sizeof (real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp");
-            //copy_device (block_energy + BLOCKS_POW_2, &((simulation_data *)data->d_simulation_data)->my_en.e_kin,
-            //        sizeof (real), "kinetic_energy");
-            }
-
-            extern "C" void dev_compute_momentum (reax_system *system, rvec xcm, 
-                rvec vcm, rvec amcm)
-            {
-            rvec *l_xcm, *l_vcm, *l_amcm;
-            rvec *r_scratch = (rvec *)scratch;
-
-#if defined( __SM_35__)
-            // xcm
-            cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
-            l_xcm = r_scratch;
-
-            center_of_mass_blocks_xcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>>
-            (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, system->n );
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-
-            k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>>
-                (l_xcm, l_xcm + BLOCKS_POW_2, BLOCKS_POW_2);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            copy_host_device (xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:xcm");
-
-            // vcm
-            cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
-            l_vcm = r_scratch;
-
-            center_of_mass_blocks_vcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>>
-                (system->reax_param.d_sbp, system->d_my_atoms, l_vcm, system->n );
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-
-            k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>>
-                (l_vcm, l_vcm + BLOCKS_POW_2, BLOCKS_POW_2);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            copy_host_device (vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm");
-
-            // amcm
-            cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
-            l_amcm = r_scratch;
-
-            center_of_mass_blocks_amcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>>
-                (system->reax_param.d_sbp, system->d_my_atoms, l_amcm, system->n );
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-
-            k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>>
-                (l_amcm, l_amcm + BLOCKS_POW_2, BLOCKS_POW_2);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            copy_host_device (amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:amcm");
-
-#else
-            cuda_memset ( scratch, 0, 3 * sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
-
-            l_xcm = r_scratch;
-            l_vcm = r_scratch + (BLOCKS_POW_2 + 1); 
-            l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); 
-
-            center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (sizeof (rvec) * BLOCK_SIZE) >>> 
-                (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, l_vcm, l_amcm, system->n);
-            cudaThreadSynchronize (); 
-            cudaCheckError (); 
-
-            center_of_mass <<<1, BLOCKS_POW_2, 3 * (sizeof (rvec) * BLOCKS_POW_2) >>> 
-                (l_xcm, l_vcm, l_amcm,
-                 l_xcm + BLOCKS_POW_2, 
-                 l_vcm + BLOCKS_POW_2, 
-                 l_amcm + BLOCKS_POW_2, 
-                 BLOCKS_POW_2);
-            cudaThreadSynchronize (); 
-            cudaCheckError ();
-
-            copy_host_device (xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:xcm" );
-            copy_host_device (vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm" );
-            copy_host_device (amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost,"momentum:amcm" );
-#endif
-            }
-
-extern "C" void dev_compute_inertial_tensor (reax_system *system, real *local_results, rvec my_xcm)
-{
-#if defined(__SM_35__)
-    real *partial_results = (real *) scratch;
-    cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp");
-
-    compute_center_mass_xx_xy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    compute_center_mass_xz_yy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    compute_center_mass_yz_zz <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
-        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, sizeof (real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results");
-
-#else
-
-    real *partial_results = (real *) scratch;
-    //real *local_results;
-
-    cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp");
-    //local_results = (real *) malloc (sizeof (real) * 6 *(BLOCKS_POW_2+ 1));
-
-    compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (sizeof (real) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
-        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-
-    copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, 
-            sizeof (real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results");
-#endif
-}
-
-extern "C" void dev_sync_simulation_data (simulation_data *data)
-{
-    Output_Sync_Simulation_Data (data, (simulation_data *)data->d_simulation_data );
-}
-/*
-   CUDA_GLOBAL void ker_kinetic_energy (reax_atom *my_atoms, 
-   single_body_parameters *sbp, 
-   int n, real *block_results)
-   {
-   extern __shared__ real sken[];
-   rvec p;
-   unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-   real x = 0;
-
-   if(i < n)
-   {
-   m = sbp[my_atoms[i].type].mass;
-   rvec_Scale( p, m, my_atoms[i].v );
-   x = 0.5 * rvec_Dot( p, my_atoms[i].v );
-   }
-   sken[threadIdx.x] = x;
-   __syncthreads();
-
-   for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-   {
-   if(threadIdx.x < offset)
-   {   
-   sken[threadIdx.x] += sken[threadIdx.x + offset];
-   }   
-
-   __syncthreads();
-   }
-
-   if(threadIdx.x == 0)
-   {
-   per_block_results[blockIdx.x] = sken[0];
-   }
-   }
-
-   void dev_compute_kinetic_energy (reax_system *system, simulation_data *data, real *p_ekin)
-   {
-   real *spad = (real *) scratch;
-   cuda_memset (spad, 0, sizeof (real) * 2 * system->n, "kinetic_energy");
-
-   ker_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-   (spad, spad + system->n,  system->n);
-   cudaThreadSynchronize (); 
-   cudaCheckError (); 
-
-   k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> 
-   (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, BLOCKS);
-   cudaThreadSynchronize (); 
-   cudaCheckError (); 
-
-   copy_host_device (p_ekin, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, 
-   sizeof (real), cudaMemcpyDeviceToHost, "kinetic_energy");
-   }
- */
diff --git a/PG-PuReMD/src/dev_system_props.h b/PG-PuReMD/src/dev_system_props.h
deleted file mode 100644
index 0a4b6f453f9e2123a9765e627891838ba6dc586f..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/dev_system_props.h
+++ /dev/null
@@ -1,23 +0,0 @@
-
-#ifndef __DEV_SYSTEM_PROPS_H__
-#define __DEV_SYSTEM_PROPS_H__
-
-#include "reax_types.h"
-
-#ifdef __cplusplus
-extern "C"  {
-#endif
-
-void dev_compute_total_mass (reax_system *, real *);
-void dev_compute_kinetic_energy (reax_system *, simulation_data *, real *);
-void dev_compute_momentum (reax_system *, rvec, rvec, rvec );
-void dev_compute_inertial_tensor (reax_system *, real *, rvec my_xcm);
-
-void dev_sync_simulation_data (simulation_data *);
-//void dev_compute_kinetic_energy (reax_system *, simulation_data *, real *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/dual_matvec.cu b/PG-PuReMD/src/dual_matvec.cu
deleted file mode 100644
index a674118f789002cb35e3177a0cf71bc74dfcf644..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/dual_matvec.cu
+++ /dev/null
@@ -1,140 +0,0 @@
-
-#include "matvec.h"
-#include "cuda_shuffle.h"
-
-//one thread per row
-CUDA_GLOBAL void k_dual_matvec(sparse_matrix H, rvec2 *vec, rvec2 *results, int rows)
-{
-    rvec2 results_row;
-    int col;
-    real val;
-
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= rows) return;
-
-    results_row [0] = results_row[1] = 0;
-
-    for (int c = H.start[i]; c < H.end[i]; c++)
-    {
-        col = H.entries [c].j;
-        val = H.entries[c].val;
-
-        results_row[0] += val * vec [col][0];
-        results_row[1] += val * vec [col][1];
-    }
-
-    results [i][0] = results_row[0];
-    results [i][1] = results_row[1];
-}
-
-//32 thread warp per matrix row.
-//invoked as follows
-// <<< system->N, 32 >>>
-//CUDA_GLOBAL void __launch_bounds__(384, 8) k_dual_matvec_csr(sparse_matrix H, rvec2 *vec, rvec2 *results, int num_rows)
-CUDA_GLOBAL void  k_dual_matvec_csr(sparse_matrix H, rvec2 *vec, rvec2 *results, int num_rows)
-{
-#if defined(__SM_35__)
-
-    rvec2 vals;
-    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
-    int lane = thread_id & (MATVEC_KER_THREADS_PER_ROW - 1);
-
-    int row_start;
-    int row_end;
-
-    // one warp per row
-    int row = warp_id;
-
-    vals[0] = 0;
-    vals[1] = 0;
-
-    if (row < num_rows) {
-        row_start = H.start[row];
-        row_end = H.end[row];
-
-        for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW) {
-            vals[0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
-            vals[1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
-        }
-    }
-
-    for (int s = MATVEC_KER_THREADS_PER_ROW >> 1; s >= 1; s /= 2){
-        vals[0] += shfl( vals[0], s);
-        vals[1] += shfl( vals[1], s);
-    }
-
-    if (lane == 0 && row < num_rows){
-        results[row][0] = vals[0];
-        results[row][1] = vals[1];
-    }
-
-#else
-
-
-    extern __shared__ rvec2 vals [];
-    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-    int warp_id = thread_id / 32;
-    int lane = thread_id & (32 - 1);
-
-    int row_start;
-    int row_end;
-
-    // one warp per row
-    //int row = warp_id;
-    int row = warp_id;
-    //if (row < num_rows)
-    {
-        vals[threadIdx.x][0] = 0;
-        vals[threadIdx.x][1] = 0;
-
-        if (row < num_rows) {
-            row_start = H.start[row];
-            row_end = H.end[row];
-
-            // compute running sum per thread
-            for(int jj = row_start + lane; jj < row_end; jj += 32) {
-                vals[threadIdx.x][0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
-                vals[threadIdx.x][1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
-            }
-        }
-
-        __syncthreads ();
-
-        // parallel reduction in shared memory
-        //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
-        if (lane < 16) {
-            vals[threadIdx.x][0] += vals[threadIdx.x + 16][0]; 
-            vals[threadIdx.x][1] += vals[threadIdx.x + 16][1]; 
-        }
-        __syncthreads();
-        if (lane < 8) {
-            vals[threadIdx.x][0] += vals[threadIdx.x + 8][0]; 
-            vals[threadIdx.x][1] += vals[threadIdx.x + 8][1]; 
-        }
-        __syncthreads ();
-        if (lane < 4) {
-            vals[threadIdx.x][0] += vals[threadIdx.x + 4][0]; 
-            vals[threadIdx.x][1] += vals[threadIdx.x + 4][1]; 
-        }
-        __syncthreads ();
-        if (lane < 2) {
-            vals[threadIdx.x][0] += vals[threadIdx.x + 2][0]; 
-            vals[threadIdx.x][1] += vals[threadIdx.x + 2][1]; 
-        }
-        __syncthreads ();
-        if (lane < 1) {
-            vals[threadIdx.x][0] += vals[threadIdx.x + 1][0]; 
-            vals[threadIdx.x][1] += vals[threadIdx.x + 1][1]; 
-        }
-        __syncthreads ();
-
-        // first thread writes the result
-        if (lane == 0 && row < num_rows) {
-            results[row][0] = vals[threadIdx.x][0];
-            results[row][1] = vals[threadIdx.x][1];
-        }
-    }
-
-#endif
-}
diff --git a/PG-PuReMD/src/dual_matvec.h b/PG-PuReMD/src/dual_matvec.h
deleted file mode 100644
index c5b939488ab018719bf3c05ac41d63ac1b3ab123..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/dual_matvec.h
+++ /dev/null
@@ -1,12 +0,0 @@
-
-
-#ifndef __DUAL_MATVEC__H_
-#define __DUAL_MATVEC__H_
-
-#include "reax_types.h"
-#include "reax_types.h"
-
-CUDA_GLOBAL void k_dual_matvec (sparse_matrix , rvec2 *, rvec2 *, int );
-CUDA_GLOBAL void k_dual_matvec_csr(sparse_matrix , rvec2 *, rvec2 *, int );
-
-#endif
diff --git a/PG-PuReMD/src/ffield.c b/PG-PuReMD/src/ffield.c
index a260e6281da24ddc138910030771813f3d5a3189..4302e57c6aa4c715f2ab1f5d6abea8f6b0f64106 100644
--- a/PG-PuReMD/src/ffield.c
+++ b/PG-PuReMD/src/ffield.c
@@ -20,43 +20,42 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
+
 #if defined(PURE_REAX)
-#include "ffield.h"
-#include "tool_box.h"
+  #include "ffield.h"
+  #include "tool_box.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_ffield.h"
-#include "reax_tool_box.h"
+  #include "reax_ffield.h"
+  #include "reax_tool_box.h"
 #endif
 
 
-char Read_Force_Field( char *ffield_file, reax_interaction *reax,
-                       control_params *control )
+int Read_Force_Field( char *ffield_file, reax_interaction *reax,
+        reax_system *system, control_params *control )
 {
-    FILE    *fp;
-    char    *s;
-    char   **tmp;
-    //SUDHIR
-    //char ****tor_flag;
+    FILE *fp;
+    char *s;
+    char **tmp;
     char *tor_flag;
-    int      c, i, j, k, l, m, n, o, p, cnt;
-    real     val;
-
-    //SUDHIR
+    int c, i, j, k, l, m, n, o, p, cnt;
+    real val;
     int __N;
     int index1, index2;
 
     /* open force field file */
     if ( (fp = fopen( ffield_file, "r" ) ) == NULL )
     {
-        fprintf( stderr, "error opening the force filed file! terminating...\n" );
+        fprintf( stderr, "[ERROR] p%d: cannot open force field file! terminating...\n",
+              system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, FILE_NOT_FOUND );
     }
 
-    s = (char*) malloc(sizeof(char) * MAX_LINE);
-    tmp = (char**) malloc(sizeof(char*)*MAX_TOKENS);
+    s = (char*) smalloc( sizeof(char) * MAX_LINE, "READ_FFIELD" );
+    tmp = (char**) smalloc( sizeof(char*)*MAX_TOKENS, "READ_FFIELD");
     for (i = 0; i < MAX_TOKENS; i++)
-        tmp[i] = (char*) malloc(sizeof(char) * MAX_TOKEN_LEN);
-
+    {
+        tmp[i] = (char*) smalloc( sizeof(char) * MAX_TOKEN_LEN, "READ_FFIELD" );
+    }
 
     /* reading first header comment */
     fgets( s, MAX_LINE, fp );
@@ -67,14 +66,15 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
 
     /* reading the number of global parameters */
     n = atoi(tmp[0]);
-    if (n < 1)
+    if ( n < 1 )
     {
-        fprintf( stderr, "WARNING: number of globals in ffield file is 0!\n" );
-        return 1;
+        fprintf( stderr, "[WARNING] p%d: number of globals in ffield file is 0!\n",
+              system->my_rank );
+        return SUCCESS;
     }
 
     reax->gp.n_global = n;
-    reax->gp.l = (real*) malloc(sizeof(real) * n);
+    reax->gp.l = (real*) smalloc( sizeof(real) * n, "READ_FFIELD" );
 
     /* see reax_types.h for mapping between l[i] and the lambdas used in ff */
     for (i = 0; i < n; i++)
@@ -86,7 +86,7 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         reax->gp.l[i] = val;
     }
 
-    control->bo_cut    = 0.01 * reax->gp.l[29];
+    control->bo_cut = 0.01 * reax->gp.l[29];
     control->nonb_low  = reax->gp.l[11];
     control->nonb_cut  = reax->gp.l[12];
 
@@ -101,81 +101,41 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
     fgets(s, MAX_LINE, fp);
 
     /* Allocating structures in reax_interaction */
-    /*
-    reax->sbp = (single_body_parameters*)
-      scalloc( reax->num_atom_types, sizeof(single_body_parameters), "sbp" );
-    reax->tbp = (two_body_parameters**)
-      scalloc( reax->num_atom_types, sizeof(two_body_parameters*), "tbp" );
-    reax->thbp= (three_body_header***)
-      scalloc( reax->num_atom_types, sizeof(three_body_header**), "thbp" );
-    reax->hbp = (hbond_parameters***)
-      scalloc( reax->num_atom_types, sizeof(hbond_parameters**), "hbp" );
-    reax->fbp = (four_body_header****)
-      scalloc( reax->num_atom_types, sizeof(four_body_header***), "fbp" );
-    tor_flag  = (char****)
-      scalloc( reax->num_atom_types, sizeof(char***), "tor_flag" );
-
-    for( i = 0; i < reax->num_atom_types; i++ ) {
-      reax->tbp[i] = (two_body_parameters*)
-        scalloc( reax->num_atom_types, sizeof(two_body_parameters), "tbp[i]" );
-      reax->thbp[i]= (three_body_header**)
-        scalloc( reax->num_atom_types, sizeof(three_body_header*), "thbp[i]" );
-      reax->hbp[i] = (hbond_parameters**)
-        scalloc( reax->num_atom_types, sizeof(hbond_parameters*), "hbp[i]" );
-      reax->fbp[i] = (four_body_header***)
-        scalloc( reax->num_atom_types, sizeof(four_body_header**), "fbp[i]" );
-      tor_flag[i]  = (char***)
-        scalloc( reax->num_atom_types, sizeof(char**), "tor_flag[i]" );
-
-      for( j = 0; j < reax->num_atom_types; j++ ) {
-        reax->thbp[i][j]= (three_body_header*)
-    scalloc( reax->num_atom_types, sizeof(three_body_header), "thbp[i,j]" );
-        reax->hbp[i][j] = (hbond_parameters*)
-    scalloc( reax->num_atom_types, sizeof(hbond_parameters), "hbp[i,j]" );
-        reax->fbp[i][j] = (four_body_header**)
-    scalloc( reax->num_atom_types, sizeof(four_body_header*), "fbp[i,j]" );
-        tor_flag[i][j]  = (char**)
-    scalloc( reax->num_atom_types, sizeof(char*), "tor_flag[i,j]" );
-
-        for (k=0; k < reax->num_atom_types; k++) {
-    reax->fbp[i][j][k] = (four_body_header*)
-      scalloc(reax->num_atom_types, sizeof(four_body_header), "fbp[i,j,k]");
-    tor_flag[i][j][k]  = (char*)
-      scalloc( reax->num_atom_types, sizeof(char), "tor_flag[i,j,k]" );
-        }
-      }
-    }
-    */
-
     __N = reax->num_atom_types;
 
     reax->sbp = (single_body_parameters*)
-                calloc( reax->num_atom_types, sizeof(single_body_parameters) );
+        scalloc( reax->num_atom_types, sizeof(single_body_parameters),
+                "Read_Force_Field::reax->sbp" );
 
     reax->tbp = (two_body_parameters*)
-                calloc( pow (reax->num_atom_types, 2), sizeof(two_body_parameters) );
+        scalloc( POW(reax->num_atom_types, 2), sizeof(two_body_parameters),
+              "Read_Force_Field::reax->tbp" );
 
     reax->thbp = (three_body_header*)
-                 calloc( pow (reax->num_atom_types, 3), sizeof(three_body_header) );
+        scalloc( POW(reax->num_atom_types, 3), sizeof(three_body_header),
+              "Read_Force_Field::reax->thbp" );
+
     reax->hbp = (hbond_parameters*)
-                calloc( pow (reax->num_atom_types, 3), sizeof(hbond_parameters) );
+        scalloc( POW(reax->num_atom_types, 3), sizeof(hbond_parameters),
+              "Read_Force_Field::reax->hbp" );
 
     reax->fbp = (four_body_header*)
-                calloc( pow (reax->num_atom_types, 4), sizeof(four_body_header) );
+        scalloc( POW(reax->num_atom_types, 4), sizeof(four_body_header),
+              "Read_Force_Field::reax->fbp" );
 
-    tor_flag  = (char*)
-                calloc( pow (reax->num_atom_types, 4), sizeof(char) );
+    tor_flag  = (char*) scalloc( POW(reax->num_atom_types, 4), sizeof(char),
+           "Read_Force_Field::tor_flag" );
 
-
-    // vdWaals type: 1: Shielded Morse, no inner-wall
-    //               2: inner wall, no shielding
-    //               3: inner wall+shielding
+    /* vdWaals type:
+     * 1: Shielded Morse, no inner-wall
+     * 2: inner wall, no shielding
+     * 3: inner wall+shielding */
     reax->gp.vdw_type = 0;
 
     /* reading single atom parameters */
     /* there are 4 lines of each single atom parameters in ff files. these
-       parameters later determine some of the pair and triplet parameters using
-       combination rules. */
+     * parameters later determine some of the pair and triplet parameters using
+     * combination rules. */
     for ( i = 0; i < reax->num_atom_types; i++ )
     {
         /* line one */
@@ -183,26 +143,31 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         c = Tokenize( s, &tmp );
 
         for ( j = 0; j < (int)(strlen(tmp[0])); ++j )
+        {
             reax->sbp[i].name[j] = toupper( tmp[0][j] );
+        }
 
-        //fprintf( stderr, "Atom Name in the force field : %s \n", reax->sbp[i].name);
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d: Atom Name in the force field : %s \n",
+                system->my_rank, reax->sbp[i].name );
+#endif
 
         val = atof(tmp[1]);
-        reax->sbp[i].r_s        = val;
+        reax->sbp[i].r_s = val;
         val = atof(tmp[2]);
-        reax->sbp[i].valency    = val;
+        reax->sbp[i].valency = val;
         val = atof(tmp[3]);
-        reax->sbp[i].mass       = val;
+        reax->sbp[i].mass = val;
         val = atof(tmp[4]);
-        reax->sbp[i].r_vdw      = val;
+        reax->sbp[i].r_vdw = val;
         val = atof(tmp[5]);
-        reax->sbp[i].epsilon    = val;
+        reax->sbp[i].epsilon = val;
         val = atof(tmp[6]);
-        reax->sbp[i].gamma      = val;
+        reax->sbp[i].gamma = val;
         val = atof(tmp[7]);
-        reax->sbp[i].r_pi       = val;
+        reax->sbp[i].r_pi = val;
         val = atof(tmp[8]);
-        reax->sbp[i].valency_e  = val;
+        reax->sbp[i].valency_e = val;
         reax->sbp[i].nlp_opt = 0.5 * (reax->sbp[i].valency_e - reax->sbp[i].valency);
 
         /* line two */
@@ -210,36 +175,36 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         c = Tokenize( s, &tmp );
 
         val = atof(tmp[0]);
-        reax->sbp[i].alpha      = val;
+        reax->sbp[i].alpha = val;
         val = atof(tmp[1]);
-        reax->sbp[i].gamma_w    = val;
+        reax->sbp[i].gamma_w = val;
         val = atof(tmp[2]);
         reax->sbp[i].valency_boc = val;
         val = atof(tmp[3]);
-        reax->sbp[i].p_ovun5    = val;
+        reax->sbp[i].p_ovun5 = val;
         val = atof(tmp[4]);
         val = atof(tmp[5]);
-        reax->sbp[i].chi        = val;
+        reax->sbp[i].chi = val;
         val = atof(tmp[6]);
-        reax->sbp[i].eta        = 2.0 * val;
+        reax->sbp[i].eta = 2.0 * val;
         val = atof(tmp[7]);
-        reax->sbp[i].p_hbond = (int) val;
+        reax->sbp[i].p_hbond = (int)val;
 
         /* line 3 */
         fgets( s, MAX_LINE, fp );
         c = Tokenize( s, &tmp );
 
         val = atof(tmp[0]);
-        reax->sbp[i].r_pi_pi    = val;
+        reax->sbp[i].r_pi_pi = val;
         val = atof(tmp[1]);
-        reax->sbp[i].p_lp2      = val;
+        reax->sbp[i].p_lp2 = val;
         val = atof(tmp[2]);
         val = atof(tmp[3]);
-        reax->sbp[i].b_o_131    = val;
+        reax->sbp[i].b_o_131 = val;
         val = atof(tmp[4]);
-        reax->sbp[i].b_o_132    = val;
+        reax->sbp[i].b_o_132 = val;
         val = atof(tmp[5]);
-        reax->sbp[i].b_o_133    = val;
+        reax->sbp[i].b_o_133 = val;
         val = atof(tmp[6]);
         val = atof(tmp[7]);
 
@@ -248,96 +213,104 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         c = Tokenize( s, &tmp );
 
         val = atof(tmp[0]);
-        reax->sbp[i].p_ovun2    = val;
+        reax->sbp[i].p_ovun2 = val;
         val = atof(tmp[1]);
-        reax->sbp[i].p_val3     = val;
+        reax->sbp[i].p_val3 = val;
         val = atof(tmp[2]);
         val = atof(tmp[3]);
         reax->sbp[i].valency_val = val;
         val = atof(tmp[4]);
-        reax->sbp[i].p_val5     = val;
+        reax->sbp[i].p_val5 = val;
         val = atof(tmp[5]);
-        reax->sbp[i].rcore2     = val;
+        reax->sbp[i].rcore2 = val;
         val = atof(tmp[6]);
-        reax->sbp[i].ecore2     = val;
+        reax->sbp[i].ecore2 = val;
         val = atof(tmp[7]);
-        reax->sbp[i].acore2     = val;
-
+        reax->sbp[i].acore2 = val;
 
-        if ( reax->sbp[i].rcore2 > 0.01 && reax->sbp[i].acore2 > 0.01 ) // Inner-wall
+        /* Inner-wall */
+        if ( reax->sbp[i].rcore2 > 0.01 && reax->sbp[i].acore2 > 0.01 )
         {
-            if ( reax->sbp[i].gamma_w > 0.5 ) // Shielding vdWaals
+            /* Shielding vdWaals */
+            if ( reax->sbp[i].gamma_w > 0.5 )
             {
                 if ( reax->gp.vdw_type != 0 && reax->gp.vdw_type != 3 )
-                    fprintf( stderr, "Warning: inconsistent vdWaals-parameters\n" \
-                             "Force field parameters for element %s\n"        \
-                             "indicate inner wall+shielding, but earlier\n"   \
-                             "atoms indicate different vdWaals-method.\n"     \
-                             "This may cause division-by-zero errors.\n"      \
-                             "Keeping vdWaals-setting for earlier atoms.\n",
-                             reax->sbp[i].name );
+                {
+                    fprintf( stderr, "[WARNING] p%d: inconsistent vdWaals-parameters\n"
+                            "Force field parameters for element %s\n"
+                            "indicate inner wall+shielding, but earlier\n"
+                            "atoms indicate different vdWaals-method.\n"
+                            "This may cause division-by-zero errors.\n"
+                            "Keeping vdWaals-setting for earlier atoms.\n",
+                            system->my_rank, reax->sbp[i].name );
+                }
                 else
                 {
                     reax->gp.vdw_type = 3;
 #if defined(DEBUG)
-                    fprintf( stderr, "vdWaals type for element %s: Shielding+inner-wall",
-                             reax->sbp[i].name );
+                    fprintf( stderr, "p%d: vdWaals type for element %s: Shielding+inner-wall",
+                            system->my_rank, reax->sbp[i].name );
 #endif
                 }
             }
-            else    // No shielding vdWaals parameters present
+            /* No shielding vdWaals parameters present */
+            else
             {
                 if ( reax->gp.vdw_type != 0 && reax->gp.vdw_type != 2 )
-                    fprintf( stderr, "Warning: inconsistent vdWaals-parameters\n" \
-                             "Force field parameters for element %s\n"        \
-                             "indicate inner wall without shielding, but earlier\n" \
-                             "atoms indicate different vdWaals-method.\n"     \
-                             "This may cause division-by-zero errors.\n"      \
-                             "Keeping vdWaals-setting for earlier atoms.\n",
-                             reax->sbp[i].name );
+                {
+                    fprintf( stderr, "[WARNING] p%d: inconsistent vdWaals-parameters\n",
+                            system->my_rank );
+                    fprintf( stderr, "    [INFO] Force field parameters for element %s\n", reax->sbp[i].name );
+                    fprintf( stderr, "    [INFO] indicate inner wall without shielding, but earlier\n" );
+                    fprintf( stderr, "    [INFO] atoms indicate different vdWaals-method.\n" );
+                    fprintf( stderr, "    [INFO] This may cause division-by-zero errors.\n" );
+                    fprintf( stderr, "    [INFO] Keeping vdWaals-setting for earlier atoms.\n" );
+                }
                 else
                 {
                     reax->gp.vdw_type = 2;
 #if defined(DEBUG)
-                    fprintf( stderr, "vdWaals type for element%s: No Shielding,inner-wall",
-                             reax->sbp[i].name );
+                    fprintf( stderr, "p%d: vdWaals type for element%s: No Shielding,inner-wall",
+                            system->my_rank, reax->sbp[i].name );
 #endif
                 }
             }
         }
-        else  // No Inner wall parameters present
+        /* No Inner wall parameters present */
+        else
         {
-            if ( reax->sbp[i].gamma_w > 0.5 ) // Shielding vdWaals
+            /* Shielding vdWaals */
+            if ( reax->sbp[i].gamma_w > 0.5 )
             {
                 if ( reax->gp.vdw_type != 0 && reax->gp.vdw_type != 1 )
-                    fprintf( stderr, "Warning: inconsistent vdWaals-parameters\n" \
-                             "Force field parameters for element %s\n"        \
-                             "indicate  shielding without inner wall, but earlier\n" \
-                             "atoms indicate different vdWaals-method.\n"     \
-                             "This may cause division-by-zero errors.\n"      \
-                             "Keeping vdWaals-setting for earlier atoms.\n",
-                             reax->sbp[i].name );
+                    fprintf( stderr, "[WARNING] p%d: inconsistent vdWaals-parameters\n" \
+                            "    [INFO] Force field parameters for element %s\n"        \
+                            "    [INFO] indicate  shielding without inner wall, but earlier\n" \
+                            "    [INFO] atoms indicate different vdWaals-method.\n"     \
+                            "    [INFO] This may cause division-by-zero errors.\n"      \
+                            "    [INFO] Keeping vdWaals-setting for earlier atoms.\n",
+                            system->my_rank, reax->sbp[i].name );
                 else
                 {
                     reax->gp.vdw_type = 1;
 #if defined(DEBUG)
-                    fprintf( stderr, "vdWaals type for element%s: Shielding,no inner-wall",
-                             reax->sbp[i].name );
+                    fprintf( stderr, "p%d, vdWaals type for element%s: Shielding,no inner-wall",
+                            system->my_rank, reax->sbp[i].name );
 #endif
                 }
             }
             else
             {
-                fprintf( stderr, "Error: inconsistent vdWaals-parameters\n"\
-                         "No shielding or inner-wall set for element %s\n",
-                         reax->sbp[i].name );
+                fprintf( stderr, "[ERROR] p%d: inconsistent vdWaals-parameters\n" \
+                         "    [INFO] No shielding or inner-wall set for element %s\n",
+                         system->my_rank, reax->sbp[i].name );
                 MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
             }
         }
     }
 
 #if defined(DEBUG)
-    fprintf( stderr, "vdWaals type: %d\n", reax->gp.vdw_type );
+    fprintf( stderr, "p%d: vdWaals type: %d\n", system->my_rank, reax->gp.vdw_type );
 #endif
 
     /* Equate vval3 to valf for first-row elements (25/10/2004) */
@@ -346,8 +319,8 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         if ( reax->sbp[i].mass < 21 &&
                 reax->sbp[i].valency_val != reax->sbp[i].valency_boc )
         {
-            fprintf( stderr, "Warning: changed valency_val to valency_boc for %s\n",
-                     reax->sbp[i].name );
+            fprintf( stderr, "[WARNING] p%d: changed valency_val to valency_boc for atom type %s\n",
+                    system->my_rank, reax->sbp[i].name );
             reax->sbp[i].valency_val = reax->sbp[i].valency_boc;
         }
     }
@@ -368,35 +341,33 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
 
         j = atoi(tmp[0]) - 1;
         k = atoi(tmp[1]) - 1;
-
         index1 = j * __N + k;
         index2 = k * __N + j;
 
         if (j < reax->num_atom_types && k < reax->num_atom_types)
         {
-
             val = atof(tmp[2]);
-            reax->tbp[ index1 ].De_s      = val;
-            reax->tbp[ index2 ].De_s      = val;
+            reax->tbp[ index1 ].De_s = val;
+            reax->tbp[ index2 ].De_s = val;
             val = atof(tmp[3]);
-            reax->tbp[ index1 ].De_p      = val;
-            reax->tbp[ index2 ].De_p      = val;
+            reax->tbp[ index1 ].De_p = val;
+            reax->tbp[ index2 ].De_p = val;
             val = atof(tmp[4]);
-            reax->tbp[ index1 ].De_pp     = val;
-            reax->tbp[ index2 ].De_pp     = val;
+            reax->tbp[ index1 ].De_pp = val;
+            reax->tbp[ index2 ].De_pp = val;
             val = atof(tmp[5]);
-            reax->tbp[ index1 ].p_be1     = val;
-            reax->tbp[ index2 ].p_be1     = val;
+            reax->tbp[ index1 ].p_be1 = val;
+            reax->tbp[ index2 ].p_be1 = val;
             val = atof(tmp[6]);
-            reax->tbp[ index1 ].p_bo5     = val;
-            reax->tbp[ index2 ].p_bo5     = val;
+            reax->tbp[ index1 ].p_bo5 = val;
+            reax->tbp[ index2 ].p_bo5 = val;
             val = atof(tmp[7]);
-            reax->tbp[ index1 ].v13cor    = val;
-            reax->tbp[ index2 ].v13cor    = val;
+            reax->tbp[ index1 ].v13cor = val;
+            reax->tbp[ index2 ].v13cor = val;
 
             val = atof(tmp[8]);
-            reax->tbp[ index1 ].p_bo6     = val;
-            reax->tbp[ index2 ].p_bo6     = val;
+            reax->tbp[ index1 ].p_bo6 = val;
+            reax->tbp[ index2 ].p_bo6 = val;
             val = atof(tmp[9]);
             reax->tbp[ index1 ].p_ovun1 = val;
             reax->tbp[ index2 ].p_ovun1 = val;
@@ -406,127 +377,109 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
             c = Tokenize(s, &tmp);
 
             val = atof(tmp[0]);
-            reax->tbp[ index1 ].p_be2     = val;
-            reax->tbp[ index2 ].p_be2     = val;
+            reax->tbp[ index1 ].p_be2 = val;
+            reax->tbp[ index2 ].p_be2 = val;
             val = atof(tmp[1]);
-            reax->tbp[ index1 ].p_bo3     = val;
-            reax->tbp[ index2 ].p_bo3     = val;
+            reax->tbp[ index1 ].p_bo3 = val;
+            reax->tbp[ index2 ].p_bo3 = val;
             val = atof(tmp[2]);
-            reax->tbp[ index1 ].p_bo4     = val;
-            reax->tbp[ index2 ].p_bo4     = val;
+            reax->tbp[ index1 ].p_bo4 = val;
+            reax->tbp[ index2 ].p_bo4 = val;
             val = atof(tmp[3]);
 
             val = atof(tmp[4]);
-            reax->tbp[ index1 ].p_bo1     = val;
-            reax->tbp[ index2 ].p_bo1     = val;
+            reax->tbp[ index1 ].p_bo1 = val;
+            reax->tbp[ index2 ].p_bo1 = val;
             val = atof(tmp[5]);
-            reax->tbp[ index1 ].p_bo2     = val;
-            reax->tbp[ index2 ].p_bo2     = val;
+            reax->tbp[ index1 ].p_bo2 = val;
+            reax->tbp[ index2 ].p_bo2 = val;
             val = atof(tmp[6]);
-            reax->tbp[ index1 ].ovc       = val;
-            reax->tbp[ index2 ].ovc       = val;
+            reax->tbp[ index1 ].ovc = val;
+            reax->tbp[ index2 ].ovc = val;
 
             val = atof(tmp[7]);
         }
     }
 
     /* calculating combination rules and filling up remaining fields. */
-
     for (i = 0; i < reax->num_atom_types; i++)
+    {
         for (j = i; j < reax->num_atom_types; j++)
         {
-
-            //SUDHIR
             index1 = i * __N + j;
             index2 = j * __N + i;
 
-            reax->tbp[index1].r_s = 0.5 *
-                                    (reax->sbp[i].r_s + reax->sbp[j].r_s);
-            reax->tbp[index2].r_s = 0.5 *
-                                    (reax->sbp[j].r_s + reax->sbp[i].r_s);
+            reax->tbp[index1].r_s =
+                0.5 * (reax->sbp[i].r_s + reax->sbp[j].r_s);
+            reax->tbp[index2].r_s =
+                0.5 * (reax->sbp[j].r_s + reax->sbp[i].r_s);
 
-            reax->tbp[index1].r_p = 0.5 *
-                                    (reax->sbp[i].r_pi + reax->sbp[j].r_pi);
-            reax->tbp[index2].r_p = 0.5 *
-                                    (reax->sbp[j].r_pi + reax->sbp[i].r_pi);
-
-            reax->tbp[index1].r_pp = 0.5 *
-                                     (reax->sbp[i].r_pi_pi + reax->sbp[j].r_pi_pi);
-            reax->tbp[index2].r_pp = 0.5 *
-                                     (reax->sbp[j].r_pi_pi + reax->sbp[i].r_pi_pi);
+            reax->tbp[index1].r_p =
+                0.5 * (reax->sbp[i].r_pi + reax->sbp[j].r_pi);
+            reax->tbp[index2].r_p =
+                0.5 * (reax->sbp[j].r_pi + reax->sbp[i].r_pi);
 
+            reax->tbp[index1].r_pp =
+                0.5 * (reax->sbp[i].r_pi_pi + reax->sbp[j].r_pi_pi);
+            reax->tbp[index2].r_pp =
+                0.5 * (reax->sbp[j].r_pi_pi + reax->sbp[i].r_pi_pi);
 
             reax->tbp[index1].p_boc3 =
-                sqrt(reax->sbp[i].b_o_132 *
-                     reax->sbp[j].b_o_132);
+                SQRT(reax->sbp[i].b_o_132 * reax->sbp[j].b_o_132);
             reax->tbp[index2].p_boc3 =
-                sqrt(reax->sbp[j].b_o_132 *
-                     reax->sbp[i].b_o_132);
+                SQRT(reax->sbp[j].b_o_132 * reax->sbp[i].b_o_132);
 
             reax->tbp[index1].p_boc4 =
-                sqrt(reax->sbp[i].b_o_131 *
-                     reax->sbp[j].b_o_131);
+                SQRT(reax->sbp[i].b_o_131 * reax->sbp[j].b_o_131);
             reax->tbp[index2].p_boc4 =
-                sqrt(reax->sbp[j].b_o_131 *
-                     reax->sbp[i].b_o_131);
+                SQRT(reax->sbp[j].b_o_131 * reax->sbp[i].b_o_131);
 
             reax->tbp[index1].p_boc5 =
-                sqrt(reax->sbp[i].b_o_133 *
-                     reax->sbp[j].b_o_133);
+                SQRT(reax->sbp[i].b_o_133 * reax->sbp[j].b_o_133);
             reax->tbp[index2].p_boc5 =
-                sqrt(reax->sbp[j].b_o_133 *
-                     reax->sbp[i].b_o_133);
-
+                SQRT(reax->sbp[j].b_o_133 * reax->sbp[i].b_o_133);
 
             reax->tbp[index1].D =
-                sqrt(reax->sbp[i].epsilon *
-                     reax->sbp[j].epsilon);
+                SQRT(reax->sbp[i].epsilon * reax->sbp[j].epsilon);
 
             reax->tbp[index2].D =
-                sqrt(reax->sbp[j].epsilon *
-                     reax->sbp[i].epsilon);
+                SQRT(reax->sbp[j].epsilon * reax->sbp[i].epsilon);
 
             reax->tbp[index1].alpha =
-                sqrt(reax->sbp[i].alpha *
-                     reax->sbp[j].alpha);
+                SQRT(reax->sbp[i].alpha * reax->sbp[j].alpha);
 
             reax->tbp[index2].alpha =
-                sqrt(reax->sbp[j].alpha *
-                     reax->sbp[i].alpha);
+                SQRT(reax->sbp[j].alpha * reax->sbp[i].alpha);
 
             reax->tbp[index1].r_vdW =
-                2.0 * sqrt(reax->sbp[i].r_vdw * reax->sbp[j].r_vdw);
+                2.0 * SQRT(reax->sbp[i].r_vdw * reax->sbp[j].r_vdw);
 
             reax->tbp[index2].r_vdW =
-                2.0 * sqrt(reax->sbp[j].r_vdw * reax->sbp[i].r_vdw);
+                2.0 * SQRT(reax->sbp[j].r_vdw * reax->sbp[i].r_vdw);
 
             reax->tbp[index1].gamma_w =
-                sqrt(reax->sbp[i].gamma_w *
-                     reax->sbp[j].gamma_w);
+                SQRT(reax->sbp[i].gamma_w * reax->sbp[j].gamma_w);
 
             reax->tbp[index2].gamma_w =
-                sqrt(reax->sbp[j].gamma_w *
-                     reax->sbp[i].gamma_w);
+                SQRT(reax->sbp[j].gamma_w * reax->sbp[i].gamma_w);
 
             reax->tbp[index1].gamma =
-                POW(reax->sbp[i].gamma *
-                    reax->sbp[j].gamma, -1.5);
+                POW(reax->sbp[i].gamma * reax->sbp[j].gamma, -1.5);
 
             reax->tbp[index2].gamma =
-                POW(reax->sbp[j].gamma *
-                    reax->sbp[i].gamma, -1.5);
+                POW(reax->sbp[j].gamma * reax->sbp[i].gamma, -1.5);
 
-            // additions for additional vdWaals interaction types - inner core
+            /* additions for additional vdWaals interaction types - inner core */
             reax->tbp[index1].rcore = reax->tbp[index2].rcore =
-                                          sqrt( reax->sbp[i].rcore2 * reax->sbp[j].rcore2 );
+                SQRT( reax->sbp[i].rcore2 * reax->sbp[j].rcore2 );
 
             reax->tbp[index1].ecore = reax->tbp[index2].ecore =
-                                          sqrt( reax->sbp[i].ecore2 * reax->sbp[j].ecore2 );
+                SQRT( reax->sbp[i].ecore2 * reax->sbp[j].ecore2 );
 
             reax->tbp[index1].acore = reax->tbp[index2].acore =
-                                          sqrt( reax->sbp[i].acore2 * reax->sbp[j].acore2 );
+                SQRT( reax->sbp[i].acore2 * reax->sbp[j].acore2 );
         }
-
+    }
 
     /* next line is number of two body offdiagonal combinations and comments */
     /* these are two body offdiagonal terms that are different from the
@@ -543,7 +496,6 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         j = atoi(tmp[0]) - 1;
         k = atoi(tmp[1]) - 1;
 
-        //SUDHIR
         index1 = j * __N + k;
         index2 = k * __N + j;
 
@@ -593,17 +545,19 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         }
     }
 
-
     /* 3-body parameters -
        supports multi-well potentials (upto MAX_3BODY_PARAM in mytypes.h) */
     /* clear entries first */
     for ( i = 0; i < reax->num_atom_types; ++i )
+    {
         for ( j = 0; j < reax->num_atom_types; ++j )
+        {
             for ( k = 0; k < reax->num_atom_types; ++k )
-                //reax->thbp[i][j][k].cnt = 0;
-                //SUDHIR
+            {
                 reax->thbp[i * __N * __N + j * __N + k].cnt = 0;
-
+            }
+        }
+    }
 
     /* next line is number of 3-body params and some comments */
     fgets( s, MAX_LINE, fp );
@@ -618,8 +572,6 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         j = atoi(tmp[0]) - 1;
         k = atoi(tmp[1]) - 1;
         m = atoi(tmp[2]) - 1;
-
-        //SUDHIR
         index1 = j * __N * __N + k * __N + m;
         index2 = m * __N * __N + k * __N + j;
 
@@ -660,28 +612,30 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         }
     }
 
-
     /* 4-body parameters are entered in compact form. i.e. 0-X-Y-0
-       correspond to any type of pair of atoms in 1 and 4
-       position. However, explicit X-Y-Z-W takes precedence over the
-       default description.
-       supports multi-well potentials (upto MAX_4BODY_PARAM in mytypes.h)
-       IMPORTANT: for now, directions on how to read multi-entries from ffield
-       is not clear */
+     * corresponds to any type of atom pair in positions 1 and 4.
+     * However, an explicit X-Y-Z-W entry takes precedence over the
+     * default description.
+     * Supports multi-well potentials (up to MAX_4BODY_PARAM in mytypes.h).
+     * IMPORTANT: for now, the directions on how to read multiple entries
+     * from ffield are not clear. */
 
     /* clear all entries first */
     for ( i = 0; i < reax->num_atom_types; ++i )
+    {
         for ( j = 0; j < reax->num_atom_types; ++j )
+        {
             for ( k = 0; k < reax->num_atom_types; ++k )
+            {
                 for ( m = 0; m < reax->num_atom_types; ++m )
                 {
-                    //reax->fbp[i][j][k][m].cnt = 0;
-                    //tor_flag[i][j][k][m] = 0;
-                    //SUDHIR
                     reax->fbp[i * __N * __N * __N + j * __N * __N + k * __N + m].cnt = 0;
                     tor_flag[i * __N * __N * __N + j * __N * __N + k * __N + m] = 0;
 
                 }
+            }
+        }
+    }
 
     /* next line is number of 4-body params and some comments */
     fgets( s, MAX_LINE, fp );
@@ -697,16 +651,14 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         k = atoi(tmp[1]) - 1;
         m = atoi(tmp[2]) - 1;
         n = atoi(tmp[3]) - 1;
-
-        //SUDHIR
         index1 = j * __N * __N * __N + k * __N * __N + m * __N + n;
         index2 = n * __N * __N * __N + m * __N * __N + k * __N + j;
 
-
-        if (j >= 0 && n >= 0)   // this means the entry is not in compact form
+        /* this means the entry is not in compact form */
+        if (j >= 0 && n >= 0)
         {
-            if (j < reax->num_atom_types && k < reax->num_atom_types &&
-                    m < reax->num_atom_types && n < reax->num_atom_types)
+            if ( j < reax->num_atom_types && k < reax->num_atom_types &&
+                    m < reax->num_atom_types && n < reax->num_atom_types )
             {
                 /* these flags ensure that this entry take precedence
                    over the compact form entries */
@@ -715,9 +667,6 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
 
                 reax->fbp[index1].cnt = 1;
                 reax->fbp[index2].cnt = 1;
-                /* cnt = reax->fbp[j][k][m][n].cnt;
-                   reax->fbp[j][k][m][n].cnt++;
-                   reax->fbp[n][m][k][j].cnt++; */
 
                 val = atof(tmp[4]);
                 reax->fbp[index1].prm[0].V1 = val;
@@ -740,22 +689,20 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
                 reax->fbp[index2].prm[0].p_cot1 = val;
             }
         }
-        else   /* This means the entry is of the form 0-X-Y-0 */
+        /* This means the entry is of the form 0-X-Y-0 */
+        else
         {
             if ( k < reax->num_atom_types && m < reax->num_atom_types )
+            {
                 for ( p = 0; p < reax->num_atom_types; p++ )
+                {
                     for ( o = 0; o < reax->num_atom_types; o++ )
                     {
-
-                        //SUDHIR
                         index1 = p * __N * __N * __N + k * __N * __N + m * __N + o;
                         index2 = o * __N * __N * __N + m * __N * __N + k * __N + p;
 
                         reax->fbp[index1].cnt = 1;
                         reax->fbp[index2].cnt = 1;
-                        /* cnt = reax->fbp[p][k][m][o].cnt;
-                           reax->fbp[p][k][m][o].cnt++;
-                           reax->fbp[o][m][k][p].cnt++; */
 
                         if (tor_flag[index1] == 0)
                         {
@@ -775,11 +722,11 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
                             reax->fbp[index2].prm[0].p_cot1 = atof(tmp[8]);
                         }
                     }
+                }
+            }
         }
     }
 
-
-
     /* next line is number of hydrogen bond params and some comments */
     fgets( s, MAX_LINE, fp );
     c = Tokenize( s, &tmp );
@@ -793,10 +740,8 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         j = atoi(tmp[0]) - 1;
         k = atoi(tmp[1]) - 1;
         m = atoi(tmp[2]) - 1;
-        //SUDHIR
         index1 = j * __N * __N + k * __N + m;
 
-
         if ( j < reax->num_atom_types && m < reax->num_atom_types )
         {
             val = atof(tmp[3]);
@@ -813,33 +758,17 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         }
     }
 
-
     /* deallocate helper storage */
     for ( i = 0; i < MAX_TOKENS; i++ )
-        free( tmp[i] );
-    free( tmp );
-    free( s );
-
-
-    /* deallocate tor_flag */
-    /*
-    for( i = 0; i < reax->num_atom_types; i++ ) {
-      for( j = 0; j < reax->num_atom_types; j++ ) {
-        for( k = 0; k < reax->num_atom_types; k++ )
-    free( tor_flag[i][j][k] );
-
-        free( tor_flag[i][j] );
-      }
-
-      free( tor_flag[i] );
+    {
+        sfree( tmp[i], "READ_FFIELD" );
     }
-    */
-    free( tor_flag );
-
-
+    sfree( tmp, "READ_FFIELD" );
+    sfree( s, "READ_FFIELD" );
+    sfree( tor_flag, "READ_FFIELD" );
 
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "force field read\n" );
+    fprintf( stderr, "p%d: force field read\n", system->my_rank );
 #endif
 
     return SUCCESS;
diff --git a/PG-PuReMD/src/ffield.h b/PG-PuReMD/src/ffield.h
index 9aa2a27f69eaee581b2f46d5cbf3db68873a3771..902e8572b5ebadc2800d2af06d0b2a1e99086228 100644
--- a/PG-PuReMD/src/ffield.h
+++ b/PG-PuReMD/src/ffield.h
@@ -24,6 +24,16 @@
 
 #include "reax_types.h"
 
-char Read_Force_Field( char*, reax_interaction*, control_params* );
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int Read_Force_Field( char*, reax_interaction*, reax_system *, control_params* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c
index 52092aac8d3059e90e776bde27a4b2f38c0254da..d3e295cf392f45f400596f041f1995f93ed2e02f 100644
--- a/PG-PuReMD/src/forces.c
+++ b/PG-PuReMD/src/forces.c
@@ -21,59 +21,47 @@
 
 #include "reax_types.h"
 
-#include "index_utils.h"
-#ifdef HAVE_CUDA
-#include "cuda_forces.h"
-
-#include "cuda_linear_solvers.h"
-#include "cuda_neighbors.h"
-//#include "cuda_bond_orders.h"
-#include "validation.h"
-#endif
-
 #if defined(PURE_REAX)
-#include "forces.h"
-#include "bond_orders.h"
-#include "bonds.h"
-#include "basic_comm.h"
-#include "hydrogen_bonds.h"
-#include "io_tools.h"
-#include "list.h"
-#include "lookup.h"
-#include "multi_body.h"
-#include "nonbonded.h"
-#include "qEq.h"
-#include "tool_box.h"
-#include "torsion_angles.h"
-#include "valence_angles.h"
-#include "vector.h"
+  #include "forces.h"
+  #include "bond_orders.h"
+  #include "bonds.h"
+  #include "basic_comm.h"
+  #include "hydrogen_bonds.h"
+  #include "io_tools.h"
+  #include "list.h"
+  #include "lookup.h"
+  #include "multi_body.h"
+  #include "nonbonded.h"
+  #include "charges.h"
+  #include "tool_box.h"
+  #include "torsion_angles.h"
+  #include "valence_angles.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_forces.h"
-#include "reax_bond_orders.h"
-#include "reax_bonds.h"
-#include "reax_basic_comm.h"
-#include "reax_hydrogen_bonds.h"
-#include "reax_io_tools.h"
-#include "reax_list.h"
-#include "reax_lookup.h"
-#include "reax_multi_body.h"
-#include "reax_nonbonded.h"
-#include "reax_tool_box.h"
-#include "reax_torsion_angles.h"
-#include "reax_valence_angles.h"
-#include "reax_vector.h"
+  #include "reax_forces.h"
+  #include "reax_bond_orders.h"
+  #include "reax_bonds.h"
+  #include "reax_basic_comm.h"
+  #include "reax_hydrogen_bonds.h"
+  #include "reax_io_tools.h"
+  #include "reax_list.h"
+  #include "reax_lookup.h"
+  #include "reax_multi_body.h"
+  #include "reax_nonbonded.h"
+  #include "reax_tool_box.h"
+  #include "reax_torsion_angles.h"
+  #include "reax_valence_angles.h"
+  #include "reax_vector.h"
 #endif
 
-
-#ifdef HAVE_CUDA
-void Cuda_Total_Forces (reax_system *, control_params *, simulation_data *, storage *);
-void Cuda_Total_Forces_PURE (reax_system *, storage *);
-#endif
+#include "index_utils.h"
 
 
 interaction_function Interaction_Functions[NUM_INTRS];
 
 
+/* placeholder for unused interactions in interaction list
+ * Interaction_Functions, which is initialized in Init_Force_Functions */
 void Dummy_Interaction( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace, reax_list **lists,
         output_controls *out_control )
@@ -86,11 +74,17 @@ void Init_Force_Functions( control_params *control )
     Interaction_Functions[0] = BO;
     Interaction_Functions[1] = Bonds; //Dummy_Interaction;
     Interaction_Functions[2] = Atom_Energy; //Dummy_Interaction;
+    Interaction_Functions[2] = Atom_Energy; //Dummy_Interaction;
     Interaction_Functions[3] = Valence_Angles; //Dummy_Interaction;
     Interaction_Functions[4] = Torsion_Angles; //Dummy_Interaction;
-    if ( control->hbond_cut > 0 )
+    if ( control->hbond_cut > 0.0 )
+    {
         Interaction_Functions[5] = Hydrogen_Bonds;
-    else Interaction_Functions[5] = Dummy_Interaction;
+    }
+    else
+    {
+        Interaction_Functions[5] = Dummy_Interaction;
+    }
     Interaction_Functions[6] = Dummy_Interaction; //empty
     Interaction_Functions[7] = Dummy_Interaction; //empty
     Interaction_Functions[8] = Dummy_Interaction; //empty
@@ -99,8 +93,8 @@ void Init_Force_Functions( control_params *control )
 
 
 void Compute_Bonded_Forces( reax_system *system, control_params *control,
-                            simulation_data *data, storage *workspace,
-                            reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
     int i;
 
@@ -110,32 +104,27 @@ void Compute_Bonded_Forces( reax_system *system, control_params *control,
 #endif
 
     /* Implement all force calls as function pointers */
-//  for( i = 0; i < NUM_INTRS; i++ ) {
-//#if defined(DEBUG)
-//    fprintf( stderr, "p%d: starting f%d\n", system->my_rank, i );
-//    MPI_Barrier( MPI_COMM_WORLD );
-//#endif
-//    (Interaction_Functions[i])( system, control, data, workspace,
-//              lists, out_control );
-//#if defined(DEBUG)
-//    fprintf( stderr, "p%d: f%d done\n", system->my_rank, i );
-//    MPI_Barrier( MPI_COMM_WORLD );
-//#endif
-//  }
-
-    (Interaction_Functions[0])( system, control, data, workspace, lists, out_control );
-    (Interaction_Functions[1])( system, control, data, workspace, lists, out_control );
-    (Interaction_Functions[2])( system, control, data, workspace, lists, out_control );
-    (Interaction_Functions[3])( system, control, data, workspace, lists, out_control );
-    (Interaction_Functions[4])( system, control, data, workspace, lists, out_control );
-    (Interaction_Functions[5])( system, control, data, workspace, lists, out_control );
+    for( i = 0; i < NUM_INTRS; i++ )
+    {
+#if defined(DEBUG)
+        fprintf( stderr, "p%d: starting f%d\n", system->my_rank, i );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        (Interaction_Functions[i])( system, control, data, workspace, lists, out_control );
+
+#if defined(DEBUG)
+        fprintf( stderr, "p%d: f%d done\n", system->my_rank, i );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+    }
 }
 
 
 void Compute_NonBonded_Forces( reax_system *system, control_params *control,
-                               simulation_data *data, storage *workspace,
-                               reax_list **lists, output_controls *out_control,
-                               mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
     /* Mark beginning of a new timestep in nonbonded energy files */
 #if defined(TEST_ENERGY)
@@ -144,11 +133,13 @@ void Compute_NonBonded_Forces( reax_system *system, control_params *control,
 
     /* van der Waals and Coulomb interactions */
     if ( control->tabulate == 0 )
-        vdW_Coulomb_Energy( system, control, data, workspace,
-                            lists, out_control );
+    {
+        vdW_Coulomb_Energy( system, control, data, workspace, lists, out_control );
+    }
     else
-        Tabulated_vdW_Coulomb_Energy( system, control, data, workspace,
-                                      lists, out_control );
+    {
+        Tabulated_vdW_Coulomb_Energy( system, control, data, workspace, lists, out_control );
+    }
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d: nonbonded forces done\n", system->my_rank );
@@ -157,26 +148,33 @@ void Compute_NonBonded_Forces( reax_system *system, control_params *control,
 }
 
 
-
 /* this version of Compute_Total_Force computes forces from
    coefficients accumulated by all interaction functions.
    Saves enormous time & space! */
 void Compute_Total_Force( reax_system *system, control_params *control,
-                          simulation_data *data, storage *workspace,
-                          reax_list **lists, mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace, reax_list **lists,
+        mpi_datatypes *mpi_data )
 {
     int i, pj;
     reax_list *bonds = (*lists) + BONDS;
 
     for ( i = 0; i < system->N; ++i )
+    {
         for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+        {
             if ( i < bonds->select.bond_list[pj].nbr )
             {
                 if ( control->virial == 0 )
+                {
                     Add_dBond_to_Forces( i, pj, workspace, lists );
+                }
                 else
+                {
                     Add_dBond_to_Forces_NPT( i, pj, data, workspace, lists );
+                }
             }
+        }
+    }
 
     //Print_Total_Force( system, data, workspace );
 #if defined(PURE_REAX)
@@ -185,9 +183,11 @@ void Compute_Total_Force( reax_system *system, control_params *control,
        final values of force on each atom needs to be computed by adding up
        all partially-final pieces */
     Coll( system, mpi_data, workspace->f, mpi_data->mpi_rvec,
-          sizeof(rvec) / sizeof(void), rvec_unpacker );
+            sizeof(rvec) / sizeof(void), rvec_unpacker );
     for ( i = 0; i < system->n; ++i )
+    {
         rvec_Copy( system->my_atoms[i].f, workspace->f[i] );
+    }
 
 #if defined(TEST_FORCES)
     Coll( system, mpi_data, workspace->f_ele, mpi_data->mpi_rvec, rvec_unpacker);
@@ -208,44 +208,10 @@ void Compute_Total_Force( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Compute_Total_Force( reax_system *system, control_params *control,
-                               simulation_data *data, storage *workspace,
-                               reax_list **lists, mpi_datatypes *mpi_data )
-{
-    rvec *f = (rvec *) host_scratch;
-    memset (f, 0, sizeof (rvec) * system->N );
-
-    Cuda_Total_Forces (system, control, data, workspace);
-
-#if defined(PURE_REAX)
-    /* now all forces are computed to their partially-final values
-       based on the neighbors information each processor has had.
-       final values of force on each atom needs to be computed by adding up
-       all partially-final pieces */
-
-    //MVAPICH2
-    get_from_device (f, dev_workspace->f, sizeof (rvec) * system->N , "total_force:f:get");
-
-    Coll( system, mpi_data, f, mpi_data->mpi_rvec,
-          sizeof(rvec) / sizeof(void), rvec_unpacker );
-
-    put_on_device (f, dev_workspace->f, sizeof (rvec) * system->N, "total_force:f:put" );
-
-    Cuda_Total_Forces_PURE (system, dev_workspace);
-#endif
-
-}
-#endif
-
-
-
 // Essentially no-cuda copies of cuda kernels, to be used only in the mpi-not-gpu version
 ////////////////////////
 // HBOND ISSUE
-void mpi_not_gpu_update_bonds (reax_atom *my_atoms,
-                               reax_list bonds,
-                               int n)
+void mpi_not_gpu_update_bonds( reax_atom *my_atoms, reax_list bonds, int n )
 {
 //    int i = blockIdx.x * blockDim.x + threadIdx.x;
     //  if (i >= n) return;
@@ -258,9 +224,7 @@ void mpi_not_gpu_update_bonds (reax_atom *my_atoms,
 }
 
 
-void mpi_not_gpu_update_hbonds (reax_atom *my_atoms,
-                                reax_list hbonds,
-                                int n)
+void mpi_not_gpu_update_hbonds( reax_atom *my_atoms, reax_list hbonds, int n )
 {
     int Hindex;
     int i;
@@ -274,9 +238,10 @@ void mpi_not_gpu_update_hbonds (reax_atom *my_atoms,
     }
 }
 
+
 // Essentially a copy of cuda_validate_lists, but with all cuda-dependent kernels turned into serial versions
-int MPI_Not_GPU_Validate_Lists (reax_system *system, storage *workspace, reax_list **lists, control_params *control,
-                                int step, int n, int N, int numH )
+int MPI_Not_GPU_Validate_Lists( reax_system *system, storage *workspace,
+        reax_list **lists, control_params *control, int step, int n, int N, int numH )
 {
     int blocks;
     int i, comp, Hindex;
@@ -289,12 +254,6 @@ int MPI_Not_GPU_Validate_Lists (reax_system *system, storage *workspace, reax_li
     int max_sp_entries, num_hbonds, num_bonds;
     int total_sp_entries;
 
-
-
-
-
-
-
     //blocks = system->n / DEF_BLOCK_SIZE +
     //    ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
@@ -331,8 +290,6 @@ int MPI_Not_GPU_Validate_Lists (reax_system *system, storage *workspace, reax_li
     //memcpy(index, workspace->H.start, system->N * sizeof (int));
     //memcpy(end_index, workspace->H.end, system->N * sizeof (int));
 
-
-
     // don't need these, everything is already at host
     //copy_host_device (index, dev_workspace->H.start, system->N * sizeof (int),
     //        cudaMemcpyDeviceToHost, "sparse_matrix:start" );
@@ -348,38 +305,37 @@ int MPI_Not_GPU_Validate_Lists (reax_system *system, storage *workspace, reax_li
         //    comp = dev_workspace->H.m;
 
         total_sp_entries += end_index [i] - index[i];
-        if (end_index [i] - index[i] > system->max_sparse_entries)
-        {
-            fprintf( stderr, "step%d-sparsemat-chk failed: i=%d start(i)=%d end(i)=%d \n",
-                     step, i, index[i], end_index[i] );
-            return FAILURE;
-        }
-        else if (end_index[i] >= workspace->H.m)
-        {
-            //SUDHIR_FIX_SPARSE_MATRIX
-            //TODO move this carver
-            //TODO move this carver
-            //TODO move this carver
-            fprintf (stderr, "p:%d - step%d-sparsemat-chk failed (exceed limits): i=%d start(i)=%d end(i)=%d \n",
-                     system->my_rank, step, i, index[i], end_index[i]);
-            //TODO move this carver
-            //TODO move this carver
-            //TODO move this carver
-            return FAILURE;
-        }
-        else
-        {
-            if (max_sp_entries <= end_index[i] - index [i])
-                max_sp_entries = end_index[i] - index [i];
-        }
+//        if (end_index [i] - index[i] > system->max_sparse_entries)
+//        {
+//            fprintf( stderr, "step%d-sparsemat-chk failed: i=%d start(i)=%d end(i)=%d \n",
+//                     step, i, index[i], end_index[i] );
+//            return FAILURE;
+//        }
+//        else if (end_index[i] >= workspace->H.m)
+//        {
+//            //SUDHIR_FIX_SPARSE_MATRIX
+//            //TODO move this carver
+//            fprintf (stderr, "p:%d - step%d-sparsemat-chk failed (exceed limits): i=%d start(i)=%d end(i)=%d \n",
+//                     system->my_rank, step, i, index[i], end_index[i]);
+//            //TODO move this carver
+//            return FAILURE;
+//        }
+//        else
+//        {
+//            if (max_sp_entries <= end_index[i] - index [i])
+//                max_sp_entries = end_index[i] - index [i];
+//        }
     }
     //if (max_sp_entries <= end_index[i] - index [i])
     //    max_sp_entries = end_index[i] - index [i];
 
     //update the current step max_sp_entries;
     realloc->Htop = max_sp_entries;
+
+#if defined(DEBUG)
     fprintf (stderr, "p:%d - MPI-Not-GPU Reallocate: Total H matrix entries: %d, cap: %d, used: %d \n",
              system->my_rank, workspace->H.n, workspace->H.m, total_sp_entries);
+#endif
 
     if (total_sp_entries >= workspace->H.m)
     {
@@ -389,7 +345,6 @@ int MPI_Not_GPU_Validate_Lists (reax_system *system, storage *workspace, reax_li
         return FAILURE;
     }
 
-
     //validate Bond list
     if (N > 0)
     {
@@ -441,10 +396,13 @@ int MPI_Not_GPU_Validate_Lists (reax_system *system, storage *workspace, reax_li
         int max_bonds = 0;
         for (i = 0; i < N; i++)
         {
-            if (end_index[i] - index[i] >= system->max_bonds)
+            if (end_index[i] - index[i] >= system->max_bonds[i])
             {
+#if defined(DEBUG)
                 fprintf( stderr, "MPI-Not-GPU step%d-bondchk failed: i=%d start(i)=%d end(i)=%d max_bonds=%d\n",
-                         step, i, index[i], end_index[i], system->max_bonds);
+                        step, i, index[i], end_index[i], system->max_bonds[i]);
+#endif
+
                 return FAILURE;
             }
             if (end_index[i] - index[i] >= max_bonds)
@@ -520,6 +478,7 @@ int MPI_Not_GPU_Validate_Lists (reax_system *system, storage *workspace, reax_li
     return SUCCESS;
 }
 
+
 /*
 void Validate_Lists( reax_system *system, storage *workspace, reax_list **lists,
                      int step, int n, int N, int numH )
@@ -591,7 +550,7 @@ void Validate_Lists( reax_system *system, storage *workspace, reax_list **lists,
 
 
 void Validate_Lists( reax_system *system, storage *workspace, reax_list **lists,
-                     int step, int n, int N, int numH, MPI_Comm comm )
+        int step, int n, int N, int numH, MPI_Comm comm )
 {
     int i, comp, Hindex;
     reax_list *bonds, *hbonds;
@@ -624,7 +583,6 @@ void Validate_Lists( reax_system *system, storage *workspace, reax_list **lists,
         }
     }
 
-
     /* hbonds list */
     if ( numH > 0 )
     {
@@ -677,16 +635,11 @@ void Validate_Lists( reax_system *system, storage *workspace, reax_list **lists,
                         }
 
             */
-
-
-
         }
     }
 }
 
 
-
-
 #if defined(OLD_VALIDATE)
 void Validate_Lists( storage *workspace, reax_list **lists,
                      int step, int n, int N, int numH )
@@ -735,13 +688,17 @@ void Validate_Lists( storage *workspace, reax_list **lists,
         flag = -1;
         hbonds = *lists + HBONDS;
         for ( i = 0; i < numH - 1; ++i )
+        {
             if ( Num_Entries(i, hbonds) >=
                     (Start_Index(i + 1, hbonds) - Start_Index(i, hbonds)) * 0.90/*DANGER_ZONE*/ )
             {
                 workspace->realloc.hbonds = 1;
                 if ( End_Index(i, hbonds) > Start_Index(i + 1, hbonds) )
+                {
                     flag = i;
+                }
             }
+        }
 
         if ( flag > -1 )
         {
@@ -766,7 +723,7 @@ void Validate_Lists( storage *workspace, reax_list **lists,
 #endif
 
 
-inline real Compute_H( real r, real gamma, real *ctap )
+static inline real Compute_H( real r, real gamma, real *ctap )
 {
     real taper, dr3gamij_1, dr3gamij_3;
 
@@ -779,12 +736,13 @@ inline real Compute_H( real r, real gamma, real *ctap )
     taper = taper * r + ctap[0];
 
     dr3gamij_1 = ( r * r * r + gamma );
-    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+    dr3gamij_3 = POW( dr3gamij_1 , 1.0 / 3.0 );
+
     return taper * EV_to_KCALpMOL / dr3gamij_3;
 }
 
 
-inline real Compute_tabH( real r_ij, int ti, int tj, int num_atom_types )
+static inline real Compute_tabH( real r_ij, int ti, int tj, int num_atom_types )
 {
     int r, tmin, tmax;
     real val, dif, base;
@@ -798,7 +756,10 @@ inline real Compute_tabH( real r_ij, int ti, int tj, int num_atom_types )
 
     /* cubic spline interpolation */
     r = (int)(r_ij * t->inv_dx);
-    if ( r == 0 )  ++r;
+    if ( r == 0 )
+    {
+        ++r;
+    }
     base = (real)(r + 1) * t->dx;
     dif = r_ij - base;
     val = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) * dif +
@@ -809,9 +770,9 @@ inline real Compute_tabH( real r_ij, int ti, int tj, int num_atom_types )
 }
 
 
-void Init_Forces( reax_system *system, control_params *control,
-                  simulation_data *data, storage *workspace,
-                  reax_list **lists, output_controls *out_control )
+int Init_Forces( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
     int i, j, pj;
     int start_i, end_i;
@@ -831,14 +792,14 @@ void Init_Forces( reax_system *system, control_params *control,
     bonds = *lists + BONDS;
     hbonds = *lists + HBONDS;
 
-    //Print_List(*lists + BONDS);
-
-
     for ( i = 0; i < system->n; ++i )
+    {
         workspace->bond_mark[i] = 0;
+    }
     for ( i = system->n; i < system->N; ++i )
     {
-        workspace->bond_mark[i] = 1000; // put ghost atoms to an infinite distance
+        /* put ghost atoms to an infinite distance */
+        workspace->bond_mark[i] = 1000;
         //workspace->done_after[i] = Start_Index( i, far_nbrs );
     }
 
@@ -870,7 +831,7 @@ void Init_Forces( reax_system *system, control_params *control,
             cutoff = control->bond_cut;
         }
 
-        ihb = -1;
+        ihb = NON_H_BONDING_ATOM;
         ihb_top = -1;
         if ( local )
         {
@@ -879,12 +840,17 @@ void Init_Forces( reax_system *system, control_params *control,
             H->entries[Htop].val = sbp_i->eta;
             ++Htop;
 
-            if ( control->hbond_cut > 0 )
+            if ( control->hbond_cut > 0.0 )
             {
                 ihb = sbp_i->p_hbond;
-                if ( ihb == 1 )
+                if ( ihb == H_ATOM )
+                {
                     ihb_top = End_Index( atom_i->Hindex, hbonds );
-                else ihb_top = -1;
+                }
+                else
+                {
+                    ihb_top = -1;
+                }
             }
         }
 
@@ -894,16 +860,23 @@ void Init_Forces( reax_system *system, control_params *control,
             nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
             j = nbr_pj->nbr;
             atom_j = &(system->my_atoms[j]);
+
             //fprintf( stderr, "%d%d i=%d x_i: %f %f %f,j=%d x_j: %f %f %f, d=%f\n",
             //     MIN(atom_i->orig_id, atom_j->orig_id),
             //     MAX(atom_i->orig_id, atom_j->orig_id),
             //     i, atom_i->x[0], atom_i->x[1], atom_i->x[2],
             //     j, atom_j->x[0], atom_j->x[1], atom_j->x[2], nbr_pj->d );
+
             if ( renbr )
             {
-                if (nbr_pj->d <= cutoff)
-                    flag = 1;
-                else flag = 0;
+                if ( nbr_pj->d <= cutoff )
+                {
+                    flag = TRUE;
+                }
+                else
+                {
+                    flag = FALSE;
+                }
             }
             else
             {
@@ -913,23 +886,21 @@ void Init_Forces( reax_system *system, control_params *control,
                 nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
                 if ( nbr_pj->d <= SQR(cutoff) )
                 {
-                    nbr_pj->d = sqrt(nbr_pj->d);
-                    flag = 1;
+                    nbr_pj->d = SQRT( nbr_pj->d );
+                    flag = TRUE;
                 }
                 else
                 {
-                    flag = 0;
+                    flag = FALSE;
                 }
             }
 
-            if ( flag )
+            if ( flag == TRUE )
             {
                 type_j = atom_j->type;
                 r_ij = nbr_pj->d;
                 sbp_j = &(system->reax_param.sbp[type_j]);
-                //SUDHIR
-                //twbp = &(system->reax_param.tbp[type_i][type_j]);
-                twbp = &(system->reax_param.tbp[ index_tbp (type_i, type_j, system->reax_param.num_atom_types)]);
+                twbp = &(system->reax_param.tbp[ index_tbp(type_i, type_j, system->reax_param.num_atom_types)]);
 
                 if ( local )
                 {
@@ -937,25 +908,32 @@ void Init_Forces( reax_system *system, control_params *control,
                     if ( j < system->n || atom_i->orig_id < atom_j->orig_id ) //tryQEq||1
                     {
                         H->entries[Htop].j = j;
+
                         //fprintf( stdout, "%d%d %d %d\n",
                         //     MIN(atom_i->orig_id, atom_j->orig_id),
                         //     MAX(atom_i->orig_id, atom_j->orig_id),
                         //     MIN(atom_i->orig_id, atom_j->orig_id),
                         //     MAX(atom_i->orig_id, atom_j->orig_id) );
+
                         if ( control->tabulate == 0 )
+                        {
                             H->entries[Htop].val = Compute_H(r_ij, twbp->gamma, workspace->Tap);
+                        }
                         else
+                        {
                             H->entries[Htop].val = Compute_tabH(r_ij, type_i, type_j, system->reax_param.num_atom_types);
+                        }
                         ++Htop;
                     }
 
                     /* hydrogen bond lists */
-                    if ( control->hbond_cut > 0 && (ihb == 1 || ihb == 2) &&
+                    if ( control->hbond_cut > 0 && (ihb == H_ATOM || ihb == H_BONDING_ATOM) &&
                             nbr_pj->d <= control->hbond_cut )
                     {
                         // fprintf( stderr, "%d %d\n", atom1, atom2 );
+
                         jhb = sbp_j->p_hbond;
-                        if ( ihb == 1 && jhb == 2 )
+                        if ( ihb == H_ATOM && jhb == H_BONDING_ATOM )
                         {
                             hbonds->select.hbond_list[ihb_top].nbr = j;
                             hbonds->select.hbond_list[ihb_top].scl = 1;
@@ -963,7 +941,7 @@ void Init_Forces( reax_system *system, control_params *control,
                             ++ihb_top;
                             ++num_hbonds;
                         }
-                        else if ( j < system->n && ihb == 2 && jhb == 1 )
+                        else if ( j < system->n && ihb == H_BONDING_ATOM && jhb == H_ATOM )
                         {
                             jhb_top = End_Index( atom_j->Hindex, hbonds );
                             hbonds->select.hbond_list[jhb_top].nbr = i;
@@ -979,19 +957,22 @@ void Init_Forces( reax_system *system, control_params *control,
                 if ( //(workspace->bond_mark[i] < 3 || workspace->bond_mark[j] < 3) &&
                     nbr_pj->d <= control->bond_cut &&
                     BOp( workspace, bonds, control->bo_cut,
-                         i , btop_i, nbr_pj, sbp_i, sbp_j, twbp ) )
+                        i, btop_i, nbr_pj, sbp_i, sbp_j, twbp ) == TRUE )
                 {
                     num_bonds += 2;
                     ++btop_i;
 
                     if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
+                    {
                         workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
+                    }
                     else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
                     {
                         workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
                         //if( workspace->bond_mark[i] == 1000 )
                         //  workspace->done_after[i] = pj;
                     }
+
                     //fprintf( stdout, "%d%d - %d(%d) %d(%d)\n",
                     //   i , j, i, workspace->bond_mark[i], j, workspace->bond_mark[j] );
                 }
@@ -1004,9 +985,12 @@ void Init_Forces( reax_system *system, control_params *control,
         if ( local )
         {
             //printf("Htop: %d \n", Htop);
+
             H->end[i] = Htop;
-            if ( ihb == 1 )
+            if ( ihb == H_ATOM )
+            {
                 Set_End_Index( atom_i->Hindex, ihb_top, hbonds );
+            }
         }
     }
 
@@ -1067,37 +1051,16 @@ void Init_Forces( reax_system *system, control_params *control,
              system->my_rank, data->step, Htop, num_bonds, num_hbonds );
     MPI_Barrier( MPI_COMM_WORLD );
 #endif
-#if defined( DEBUG )
-    // Print_Bonds( system, bonds, "debugbonds.out" );
-    //  Print_Bond_List2( system, bonds, "pbonds.out" );
-    // Print_Sparse_Matrix( system, H );
-    /*    for ( i = 0; i < H->n; ++i )
-            for ( j = H->start[i]; j < H->end[i]; ++j )
-                fprintf( stderr, "%d %d %.15e\n",
-                         MIN(system->my_atoms[i].orig_id,
-                             system->my_atoms[H->entries[j].j].orig_id),
-                         MAX(system->my_atoms[i].orig_id,
-                             system->my_atoms[H->entries[j].j].orig_id),
-                         H->entries[j].val );*/
-#endif
-    //Print_List(*lists + BONDS);
-
-
-//reax_system *system, storage *workspace, reax_list **lists,
-    //                   int step, int n, int N, int numH )
-
-    /*
-        Validate_Lists( system, workspace, lists, control,
-                        data->step, system->n, system->N, system->numH );*/
-
-    MPI_Not_GPU_Validate_Lists( system, workspace, lists, control,
-                                data->step, system->n, system->N, system->numH );
 
+//    return Validate_Lists( system, workspace, lists, control,
+//            data->step, system->n, system->N, system->numH );
 
+    return MPI_Not_GPU_Validate_Lists( system, workspace, lists, control,
+            data->step, system->n, system->N, system->numH );
 }
 
 
-void Init_Forces_noQEq( reax_system *system, control_params *control,
+int Init_Forces_No_Charges( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace, reax_list **lists,
         output_controls *out_control )
 {
@@ -1119,7 +1082,9 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
     hbonds = *lists + HBONDS;
 
     for ( i = 0; i < system->n; ++i )
+    {
         workspace->bond_mark[i] = 0;
+    }
     for ( i = system->n; i < system->N; ++i )
     {
         workspace->bond_mark[i] = 1000; // put ghost atoms to an infinite distance
@@ -1151,14 +1116,19 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
             cutoff = control->bond_cut;
         }
 
-        ihb = -1;
+        ihb = NON_H_BONDING_ATOM;
         ihb_top = -1;
         if ( local && control->hbond_cut > 0 )
         {
             ihb = sbp_i->p_hbond;
-            if ( ihb == 1 )
+            if ( ihb == H_ATOM )
+            {
                 ihb_top = End_Index( atom_i->Hindex, hbonds );
-            else ihb_top = -1;
+            }
+            else
+            {
+                ihb_top = -1;
+            }
         }
 
         /* update i-j distance - check if j is within cutoff */
@@ -1171,8 +1141,13 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
             if ( renbr )
             {
                 if ( nbr_pj->d <= cutoff )
+                {
                     flag = 1;
-                else flag = 0;
+                }
+                else
+                {
+                    flag = 0;
+                }
             }
             else
             {
@@ -1182,7 +1157,7 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
                 nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
                 if ( nbr_pj->d <= SQR(cutoff) )
                 {
-                    nbr_pj->d = sqrt(nbr_pj->d);
+                    nbr_pj->d = SQRT(nbr_pj->d);
                     flag = 1;
                 }
                 else
@@ -1203,12 +1178,12 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
                 if ( local )
                 {
                     /* hydrogen bond lists */
-                    if ( control->hbond_cut > 0 && (ihb == 1 || ihb == 2) &&
+                    if ( control->hbond_cut > 0 && (ihb == H_ATOM || ihb == H_BONDING_ATOM) &&
                             nbr_pj->d <= control->hbond_cut )
                     {
                         // fprintf( stderr, "%d %d\n", atom1, atom2 );
                         jhb = sbp_j->p_hbond;
-                        if ( ihb == 1 && jhb == 2 )
+                        if ( ihb == H_ATOM && jhb == H_BONDING_ATOM )
                         {
                             hbonds->select.hbond_list[ihb_top].nbr = j;
                             hbonds->select.hbond_list[ihb_top].scl = 1;
@@ -1216,7 +1191,7 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
                             ++ihb_top;
                             ++num_hbonds;
                         }
-                        else if ( j < system->n && ihb == 2 && jhb == 1 )
+                        else if ( j < system->n && ihb == H_BONDING_ATOM && jhb == H_ATOM )
                         {
                             jhb_top = End_Index( atom_j->Hindex, hbonds );
                             hbonds->select.hbond_list[jhb_top].nbr = i;
@@ -1253,7 +1228,7 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
         }
 
         Set_End_Index( i, btop_i, bonds );
-        if ( local && ihb == 1 )
+        if ( local && ihb == H_ATOM )
         {
             Set_End_Index( atom_i->Hindex, ihb_top, hbonds );
         }
@@ -1284,12 +1259,13 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
     Print_Bond_List2( system, bonds, "pbonds.out" );
 #endif
 
-    MPI_Not_GPU_Validate_Lists( system, workspace, lists, control,
+    return MPI_Not_GPU_Validate_Lists( system, workspace, lists, control,
             data->step, system->n, system->N, system->numH );
 }
 
-void Host_Estimate_Sparse_Matrix(reax_atom *my_atoms, control_params *control,
-                                  reax_list p_far_nbrs, int n, int N, int renbr, int *indices)
+
+void Host_Estimate_Sparse_Matrix( reax_atom *my_atoms, control_params *control,
+        reax_list p_far_nbrs, int n, int N, int renbr, int *indices )
 {
     int i, j, pj;
     int start_i, end_i;
@@ -1341,12 +1317,10 @@ void Host_Estimate_Sparse_Matrix(reax_atom *my_atoms, control_params *control,
                 }
                 nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
                 //TODO
-                //TODO
-                //TODO
                 //if( nbr_pj->d <= (cutoff) ) {
                 if ( nbr_pj->d <= SQR(cutoff) )
                 {
-                    nbr_pj->d = sqrt(nbr_pj->d);
+                    nbr_pj->d = SQRT(nbr_pj->d);
                     flag = 1;
                 }
                 else
@@ -1382,11 +1356,17 @@ void Host_Estimate_Sparse_Matrix(reax_atom *my_atoms, control_params *control,
 
                 //this is the working condition
                 if (i < j && i < n && ( j < n || atom_i->orig_id < atom_j->orig_id))
+                {
                     indices [i]++;
+                }
                 else if (i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id)
+                {
                     indices [i] ++;
+                }
                 else if (i > j && i < n && ( j < n || atom_j->orig_id < atom_i->orig_id ))
+                {
                     indices [i] ++;
+                }
             }
         }
     }
@@ -1395,14 +1375,14 @@ void Host_Estimate_Sparse_Matrix(reax_atom *my_atoms, control_params *control,
 
 #ifdef HAVE_CUDA
 void Estimate_Storages( reax_system *system, control_params *control,
-                        reax_list **lists, int *Htop,
-                        int *hb_top, int *bond_top, int *num_3body )
+        reax_list **lists, int *Htop, int *hb_top, int *bond_top, int *num_3body )
 {
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
     int ihb, jhb;
     int local;
+    int hbond_count, bond_count;
     real cutoff;
     real r_ij, r2;
     real C12, C34, C56;
@@ -1430,16 +1410,16 @@ void Estimate_Storages( reax_system *system, control_params *control,
 
         if ( i < system->n )
         {
-            local = 1;
+            local = TRUE;
             cutoff = control->nonb_cut;
             ++(*Htop);
             ihb = sbp_i->p_hbond;
         }
         else
         {
-            local = 0;
+            local = FALSE;
             cutoff = control->bond_cut;
-            ihb = -1;
+            ihb = NON_H_BONDING_ATOM;
         }
 
         for ( pj = start_i; pj < end_i; ++pj )
@@ -1457,19 +1437,21 @@ void Estimate_Storages( reax_system *system, control_params *control,
                 //twbp = &(system->reax_param.tbp[type_i][type_j]);
                 twbp = &(system->reax_param.tbp[index_tbp (type_i, type_j, system->reax_param.num_atom_types)]);
 
-                if ( local )
+                if ( local == TRUE )
                 {
                     if ( j < system->n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
                         ++(*Htop);
 
 
-                    if ( control->hbond_cut > 0.1 && (ihb == 1 || ihb == 2) &&
+                    if ( control->hbond_cut > 0.1 && (ihb == H_ATOM || ihb == H_BONDING_ATOM) &&
                             nbr_pj->d <= control->hbond_cut )
                     {
                         jhb = sbp_j->p_hbond;
-                        if ( ihb == 1 && jhb == 2 )
+                        if ( ihb == H_ATOM && jhb == H_BONDING_ATOM )
+                        {
                             ++hb_top[i];
-                        else if ( j < system->n && ihb == 2 && jhb == 1 )
+                        }
+                        else if ( j < system->n && ihb == H_BONDING_ATOM && jhb == H_ATOM )
                         {
                             ++hb_top[j];
 
@@ -1516,19 +1498,18 @@ void Estimate_Storages( reax_system *system, control_params *control,
         }
     }
 
-    fprintf (stderr, "HOST SPARSE MATRIX ENTRIES: %d \n",  *Htop );
-    *Htop = MAX( *Htop * SAFE_ZONE, MIN_CAP * MIN_HENTRIES );
+    fprintf( stderr, "HOST SPARSE MATRIX ENTRIES: %d \n",  *Htop );
+    *Htop = MAX( *Htop * SAFE_ZONE, MIN_CAP * MIN_CM_ENTRIES );
 
-
-    int hbond_count = 0;
+    hbond_count = 0;
     for ( i = 0; i < system->n; ++i )
     {
         hbond_count += hb_top[i];
         hb_top[i] = MAX( hb_top[i] * SAFER_ZONE, MIN_HBONDS );
     }
-    fprintf (stderr, "HOST HBOND COUNT: %d \n", hbond_count);
+    fprintf( stderr, "HOST HBOND COUNT: %d \n", hbond_count );
 
-    int bond_count = 0;
+    bond_count = 0;
     for ( i = 0; i < system->N; ++i )
     {
         bond_count += bond_top[i];
@@ -1549,8 +1530,8 @@ void Estimate_Storages( reax_system *system, control_params *control,
 
 #else
 void Estimate_Storages( reax_system *system, control_params *control,
-                        reax_list **lists, int *Htop, int *hb_top,
-                        int *bond_top, int *num_3body)
+        reax_list **lists, int *Htop, int *hb_top, int *bond_top,
+        int *num_3body )
 {
 
     int i, j, pj;
@@ -1584,16 +1565,16 @@ void Estimate_Storages( reax_system *system, control_params *control,
 
         if ( i < system->n )
         {
-            local = 1;
+            local = TRUE;
             cutoff = control->nonb_cut;
             ++(*Htop);
             ihb = sbp_i->p_hbond;
         }
         else
         {
-            local = 0;
+            local = FALSE;
             cutoff = control->bond_cut;
-            ihb = -1;
+            ihb = NON_H_BONDING_ATOM;
         }
 
         for ( pj = start_i; pj < end_i; ++pj )
@@ -1609,22 +1590,26 @@ void Estimate_Storages( reax_system *system, control_params *control,
                 r_ij = nbr_pj->d;
                 sbp_j = &(system->reax_param.sbp[type_j]);
                 //twbp = &(system->reax_param.tbp[type_i][type_j]);
-                twbp = &(system->reax_param.tbp[index_tbp (type_i, type_j, system->reax_param.num_atom_types)]);
+                twbp = &(system->reax_param.tbp[index_tbp(type_i, type_j, system->reax_param.num_atom_types)]);
 
-                if ( local )
+                if ( local == TRUE )
                 {
                     if ( j < system->n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
                         ++(*Htop);
 
                     /* hydrogen bond lists */
-                    if ( control->hbond_cut > 0.1 && (ihb == 1 || ihb == 2) &&
+                    if ( control->hbond_cut > 0.1 && (ihb == H_ATOM || ihb == H_BONDING_ATOM) &&
                             nbr_pj->d <= control->hbond_cut )
                     {
                         jhb = sbp_j->p_hbond;
-                        if ( ihb == 1 && jhb == 2 )
+                        if ( ihb == H_ATOM && jhb == H_BONDING_ATOM )
+                        {
                             ++hb_top[i];
-                        else if ( j < system->n && ihb == 2 && jhb == 1 )
+                        }
+                        else if ( j < system->n && ihb == H_BONDING_ATOM && jhb == H_ATOM )
+                        {
                             ++hb_top[j];
+                        }
                     }
                 }
 
@@ -1636,21 +1621,21 @@ void Estimate_Storages( reax_system *system, control_params *control,
                     if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
                     {
                         C12 = twbp->p_bo1 * pow( r_ij / twbp->r_s, twbp->p_bo2 );
-                        BO_s = (1.0 + control->bo_cut) * exp( C12 );
+                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
                     }
                     else BO_s = C12 = 0.0;
 
                     if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
                     {
                         C34 = twbp->p_bo3 * pow( r_ij / twbp->r_p, twbp->p_bo4 );
-                        BO_pi = exp( C34 );
+                        BO_pi = EXP( C34 );
                     }
                     else BO_pi = C34 = 0.0;
 
                     if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
                     {
                         C56 = twbp->p_bo5 * pow( r_ij / twbp->r_pp, twbp->p_bo6 );
-                        BO_pi2 = exp( C56 );
+                        BO_pi2 = EXP( C56 );
                     }
                     else BO_pi2 = C56 = 0.0;
 
@@ -1667,13 +1652,15 @@ void Estimate_Storages( reax_system *system, control_params *control,
         }
     }
 
-    *Htop = (int)(MAX( *Htop * SAFE_ZONE, MIN_CAP * MIN_HENTRIES ));
+    *Htop = (int)(MAX( *Htop * SAFE_ZONE, MIN_CAP * MIN_CM_ENTRIES ));
 
     // Set max sparse entries, needed for first iteration of validate_list
-    system->max_sparse_entries = *Htop * SAFE_ZONE;
+    system->total_cm_entries = *Htop * SAFE_ZONE;
 
     for ( i = 0; i < system->n; ++i )
+    {
         hb_top[i] = (int)(MAX( hb_top[i] * SAFER_ZONE, MIN_HBONDS ));
+    }
 
     for ( i = 0; i < system->N; ++i )
     {
@@ -1685,123 +1672,19 @@ void Estimate_Storages( reax_system *system, control_params *control,
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d @ estimate storages: Htop = %d, num_3body = %d\n",
-             system->my_rank, *Htop, *num_3body );
+            system->my_rank, *Htop, *num_3body );
     MPI_Barrier( MPI_COMM_WORLD );
 #endif
 }
 #endif
 
-void Compute_Forces( reax_system *system, control_params *control,
-                     simulation_data *data, storage *workspace,
-                     reax_list **lists, output_controls *out_control,
-                     mpi_datatypes *mpi_data )
-{
-    int qeq_flag;
-#if defined(LOG_PERFORMANCE)
-    real t_start = 0;
-
-    //MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-        t_start = Get_Time( );
-#endif
-
-    /********* init forces ************/
-    if ( control->qeq_freq && (data->step - data->prev_steps) % control->qeq_freq == 0 )
-        qeq_flag = 1;
-    else qeq_flag = 0;
-
-    if ( qeq_flag )
-        Init_Forces( system, control, data, workspace, lists, out_control );
-    else
-        Init_Forces_noQEq( system, control, data, workspace, lists, out_control );
-
-#if defined(LOG_PERFORMANCE)
-    //MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &(data->timing.init_forces) );
-#endif
-
-
-    /********* bonded interactions ************/
-    Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
-
-#if defined(LOG_PERFORMANCE)
-    //MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &(data->timing.bonded) );
-#endif
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d: completed bonded\n",
-             system->my_rank, data->step );
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-
-
-    /**************** qeq ************************/
-#if defined(PURE_REAX)
-    if ( qeq_flag )
-        QEq( system, control, data, workspace, out_control, mpi_data );
-
-#if defined(LOG_PERFORMANCE)
-    //MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &(data->timing.qEq) );
-#endif
-#if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-#endif //PURE_REAX
-
-
 
-
-    /********* nonbonded interactions ************/
-    Compute_NonBonded_Forces( system, control, data, workspace,
-                              lists, out_control, mpi_data );
-
-#if defined(LOG_PERFORMANCE)
-    //MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &(data->timing.nonb) );
-#endif
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n",
-             system->my_rank, data->step );
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-
-    /*********** total force ***************/
-    Compute_Total_Force( system, control, data, workspace, lists, mpi_data );
-
-#if defined(LOG_PERFORMANCE)
-    //MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &(data->timing.bonded) );
-#endif
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d: total forces computed\n",
-             system->my_rank, data->step );
-    //Print_Total_Force( system, data, workspace );
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-#if defined(TEST_FORCES)
-    Print_Force_Files( system, control, data, workspace,
-                       lists, out_control, mpi_data );
-#endif
-}
-
-
-#ifdef HAVE_CUDA
-void Cuda_Compute_Forces( reax_system *system, control_params *control,
-                          simulation_data *data, storage *workspace, reax_list **lists,
-                          output_controls *out_control, mpi_datatypes *mpi_data )
+int Compute_Forces( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
-    int qeq_flag, retVal = SUCCESS;
-
+    int charge_flag, ret;
 #if defined(LOG_PERFORMANCE)
     real t_start = 0;
 
@@ -1813,30 +1696,24 @@ void Cuda_Compute_Forces( reax_system *system, control_params *control,
 #endif
 
     /********* init forces ************/
-    if ( control->qeq_freq && (data->step - data->prev_steps) % control->qeq_freq == 0 )
+    if ( control->charge_freq && (data->step - data->prev_steps) % control->charge_freq == 0 )
     {
-        qeq_flag = 1;
+        charge_flag = TRUE;
     }
     else
     {
-        qeq_flag = 0;
+        charge_flag = FALSE;
     }
 
-    if ( qeq_flag )
+    if ( charge_flag == TRUE )
     {
-        retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );
+        ret = Init_Forces( system, control, data, workspace, lists, out_control );
     }
     else
     {
-        retVal = Cuda_Init_Forces_noQEq( system, control, data, workspace, lists, out_control );
+        ret = Init_Forces_No_Charges( system, control, data, workspace, lists, out_control );
     }
 
-    if ( retVal == FAILURE )
-    {
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
-    //validate_sparse_matrix (system, workspace);
-
 #if defined(LOG_PERFORMANCE)
     //MPI_Barrier( MPI_COMM_WORLD );
     if ( system->my_rank == MASTER_NODE )
@@ -1845,82 +1722,85 @@ void Cuda_Compute_Forces( reax_system *system, control_params *control,
     }
 #endif
 
-
-    /********* bonded interactions ************/
-    retVal = Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
-    if (retVal == FAILURE)
+    if ( ret == SUCCESS )
     {
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
+        /********* bonded interactions ************/
+        Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
 
 #if defined(LOG_PERFORMANCE)
-    //MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &(data->timing.bonded) );
-    }
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.bonded) );
+        }
 #endif
-
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d: completed bonded\n",
-             system->my_rank, data->step );
-    MPI_Barrier( MPI_COMM_WORLD );
+        fprintf( stderr, "p%d @ step%d: completed bonded\n",
+                 system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
-    /**************** qeq ************************/
+        /**************** charges ************************/
 #if defined(PURE_REAX)
-    if ( qeq_flag )
-    {
-        Cuda_QEq( system, control, data, workspace, out_control, mpi_data );
-    }
+        if ( charge_flag == TRUE )
+        {
+            QEq( system, control, data, workspace, out_control, mpi_data );
+        }
 
 #if defined(LOG_PERFORMANCE)
-    //MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &(data->timing.qEq) );
-    }
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.cm) );
+        }
 #endif
-
 #if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
-    MPI_Barrier( MPI_COMM_WORLD );
+        fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
+        MPI_Barrier( MPI_COMM_WORLD );
 #endif
 #endif //PURE_REAX
-
-
-    /********* nonbonded interactions ************/
-    Cuda_Compute_NonBonded_Forces( system, control, data, workspace,
-                                   lists, out_control, mpi_data );
-
+
+        /********* nonbonded interactions ************/
+        Compute_NonBonded_Forces( system, control, data, workspace,
+                lists, out_control, mpi_data );
+
 #if defined(LOG_PERFORMANCE)
-    //MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &(data->timing.nonb) );
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.nonb) );
+        }
 #endif
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n",
-             system->my_rank, data->step );
-    MPI_Barrier( MPI_COMM_WORLD );
+        fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n",
+                 system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
 #endif
-
-    /*********** total force ***************/
-    Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data );
-
+
+        /*********** total force ***************/
+        Compute_Total_Force( system, control, data, workspace, lists, mpi_data );
+
 #if defined(LOG_PERFORMANCE)
-    //MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &(data->timing.bonded) );
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.bonded) );
+        }
 #endif
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d: total forces computed\n",
-             system->my_rank, data->step );
-    //Print_Total_Force( system, data, workspace );
-    MPI_Barrier( MPI_COMM_WORLD );
+        fprintf( stderr, "p%d @ step%d: total forces computed\n",
+                 system->my_rank, data->step );
+        //Print_Total_Force( system, data, workspace );
+        MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
-}
+#if defined(TEST_FORCES)
+        Print_Force_Files( system, control, data, workspace, lists, out_control, mpi_data );
 #endif
+    }
+
+    return ret;
+}
 
 
 int validate_device( reax_system *system, simulation_data *data,
@@ -1928,9 +1808,7 @@ int validate_device( reax_system *system, simulation_data *data,
 {
     int retval = FAILURE;
 
-#ifdef __CUDA_DEBUG__
-
-
+#if defined(__CUDA_DEBUG__)
     //retval |= validate_neighbors (system, lists);
     //retval |= validate_sym_dbond_indices (system, workspace, lists);
     //retval |= validate_hbonds (system, workspace, lists);
@@ -1944,7 +1822,7 @@ int validate_device( reax_system *system, simulation_data *data,
 
     if (!retval)
     {
-        fprintf (stderr, "Results *DOES NOT* mattch between device and host \n");
+        fprintf( stderr, "Result *DOES NOT* match between device and host\n" );
     }
 #endif
 
diff --git a/PG-PuReMD/src/forces.h b/PG-PuReMD/src/forces.h
index 4cf538274963034b118f82243412f643f61daad1..0579f092de4a046e6e91d4e41635de22616b4ea8 100644
--- a/PG-PuReMD/src/forces.h
+++ b/PG-PuReMD/src/forces.h
@@ -24,17 +24,27 @@
 
 #include "reax_types.h"
 
+
 extern interaction_function Interaction_Functions[NUM_INTRS];
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Init_Force_Functions( control_params* );
-void Compute_Forces( reax_system*, control_params*, simulation_data*,
-                     storage*, reax_list**, output_controls*, mpi_datatypes* );
+
+int Compute_Forces( reax_system*, control_params*, simulation_data*,
+        storage*, reax_list**, output_controls*, mpi_datatypes* );
+
 void Estimate_Storages( reax_system*, control_params*, reax_list**,
-                        int*, int*, int*, int* );
+        int*, int*, int*, int* );
 
-void Cuda_Compute_Forces( reax_system*, control_params*, simulation_data*,
-                          storage*, reax_list**, output_controls*, mpi_datatypes* );
+int validate_device( reax_system *, simulation_data *, storage *, reax_list ** );
+
+#ifdef __cplusplus
+}
+#endif
 
-int validate_device (reax_system *, simulation_data *, storage *, reax_list **);
 
 #endif
diff --git a/PG-PuReMD/src/geo_tools.c b/PG-PuReMD/src/geo_tools.c
index 0f257f88616a7a6358aa5c154e6abd54fc5e5b95..dff292e74019a18c78daa34519bd15b5246d0224 100644
--- a/PG-PuReMD/src/geo_tools.c
+++ b/PG-PuReMD/src/geo_tools.c
@@ -19,7 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "geo_tools.h"
+
 #include "allocate.h"
 #include "box.h"
 #include "tool_box.h"
@@ -127,7 +130,7 @@ char Read_Geo( char* geo_file, reax_system* system, control_params *control,
                 element[j] = toupper( element[j] );
             //CHAD FIX
             atom->type = Get_Atom_Type( &(system->reax_param), element );
-            strcpy( atom->name, name );
+            snprintf( atom->name, MAX_ATOM_NAME_LEN, "%s", name ); /* bounded copy, always NUL-terminated (strncpy is not) */
             rvec_Copy( atom->x, x );
             rvec_MakeZero( atom->v );
             rvec_MakeZero( atom->f );
@@ -254,22 +257,21 @@ void Count_PDB_Atoms( FILE *geo, reax_system *system )
 
 
 char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
-               simulation_data *data, storage *workspace,
-               mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace, mpi_datatypes *mpi_data )
 {
 
-    FILE  *pdb;
+    FILE *pdb;
     char **tmp;
-    char  *s, *s1;
-    char   descriptor[9], serial[9];
-    char   atom_name[9], res_name[9], res_seq[9];
-    char   s_x[9], s_y[9], s_z[9];
-    char   occupancy[9], temp_factor[9];
-    char   seg_id[9], element[9], charge[9];
-    char   alt_loc, chain_id, icode;
-    char  *endptr = NULL;
-    int    i, c, c1, pdb_serial, top;
-    rvec   x;
+    char *s, *s1;
+    char descriptor[9], serial[9];
+    char atom_name[9], res_name[9], res_seq[9];
+    char s_x[9], s_y[9], s_z[9];
+    char occupancy[9], temp_factor[9];
+    char seg_id[9], element[9], charge[9];
+    char alt_loc, chain_id, icode;
+    char *endptr = NULL;
+    int i, c, c1, pdb_serial, top;
+    rvec x;
     reax_atom *atom;
 
 
@@ -281,12 +283,7 @@ char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
     }
 
     /* allocate memory for tokenizing pdb lines */
-    if ( Allocate_Tokenizer_Space( &s, &s1, &tmp ) == FAILURE )
-    {
-        fprintf( stderr, "Allocate_Tokenizer_Space: not enough memory!" );
-        fprintf( stderr, "terminating...\n" );
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
+    Allocate_Tokenizer_Space( &s, &s1, &tmp );
 
     /* read box information */
     if ( Read_Box_Info( system, pdb, PDB ) == FAILURE )
@@ -398,8 +395,8 @@ char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
 
             /* if the point is inside my_box, add it to my lists */
             Make_Point( strtod( &s_x[0], &endptr ),
-                        strtod( &s_y[0], &endptr ),
-                        strtod( &s_z[0], &endptr ), &x );
+                    strtod( &s_y[0], &endptr ),
+                    strtod( &s_z[0], &endptr ), &x );
 
             Fit_to_Periodic_Box( &(system->big_box), &x );
 
@@ -412,7 +409,7 @@ char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
 
                 Trim_Spaces( element );
                 atom->type = Get_Atom_Type( &(system->reax_param), element );
-                strcpy( atom->name, atom_name );
+                snprintf( atom->name, MAX_ATOM_NAME_LEN, "%s", atom_name ); /* bounded copy, always NUL-terminated (strncpy is not) */
 
                 rvec_Copy( atom->x, x );
                 rvec_MakeZero( atom->v );
@@ -530,15 +527,15 @@ char Write_PDB(reax_system* system, reax_list* bonds, simulation_data *data,
     if (me == MASTER_NODE)
     {
         /* Writing Box information */
-        gamma = acos( (system->big_box.box[0][0] * system->big_box.box[1][0] +
+        gamma = ACOS( (system->big_box.box[0][0] * system->big_box.box[1][0] +
                        system->big_box.box[0][1] * system->big_box.box[1][1] +
                        system->big_box.box[0][2] * system->big_box.box[1][2]) /
                       (system->big_box.box_norms[0] * system->big_box.box_norms[1]) );
-        beta  = acos( (system->big_box.box[0][0] * system->big_box.box[2][0] +
+        beta  = ACOS( (system->big_box.box[0][0] * system->big_box.box[2][0] +
                        system->big_box.box[0][1] * system->big_box.box[2][1] +
                        system->big_box.box[0][2] * system->big_box.box[2][2]) /
                       (system->big_box.box_norms[0] * system->big_box.box_norms[2]) );
-        alpha = acos( (system->big_box.box[2][0] * system->big_box.box[1][0] +
+        alpha = ACOS( (system->big_box.box[2][0] * system->big_box.box[1][0] +
                        system->big_box.box[2][1] * system->big_box.box[1][1] +
                        system->big_box.box[2][2] * system->big_box.box[1][2]) /
                       (system->big_box.box_norms[2] * system->big_box.box_norms[1]) );
@@ -616,8 +613,8 @@ char Write_PDB(reax_system* system, reax_list* bonds, simulation_data *data,
     }
     */
 
-    free(buffer);
-    free(line);
+    sfree( buffer, "Write_PDB::buffer" );
+    sfree( line, "Write_PDB::line" );
 
     return SUCCESS;
 }
diff --git a/PG-PuReMD/src/geo_tools.h b/PG-PuReMD/src/geo_tools.h
index 8078685689afa1d6edbe7b4534dd3bc65c45d1c5..628e8f74e6d6f484cfb76a544dbef4dc5c6aeb8e 100644
--- a/PG-PuReMD/src/geo_tools.h
+++ b/PG-PuReMD/src/geo_tools.h
@@ -29,10 +29,6 @@
 // CUSTOM ATOM: serial element name x y z
 #define CUSTOM_ATOM_FORMAT " %d %s %s %lf %lf %lf"
 
-char Read_Geo( char*, reax_system*, control_params*,
-               simulation_data*, storage*, mpi_datatypes* );
-
-
 /*PDB format :
 http://www.rcsb.org/pdb/file_formats/pdb/pdbguide2.2/guide2.2_frame.html
 
@@ -114,10 +110,23 @@ COLUMNS       DATA TYPE       FIELD         DEFINITION
 #define PDB_ATOM_FORMAT_O_LENGTH 81
 #define PDB_CRYST1_FORMAT_O "%6s%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f%11s%4d\n"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+char Read_Geo( char*, reax_system*, control_params*,
+        simulation_data*, storage*, mpi_datatypes* );
+
 char Read_PDB( char*, reax_system*, control_params*,
-               simulation_data*, storage*, mpi_datatypes* );
+        simulation_data*, storage*, mpi_datatypes* );
 
 char Write_PDB( reax_system*, reax_list*, simulation_data*,
-                control_params*, mpi_datatypes*, output_controls* );
+        control_params*, mpi_datatypes*, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/grid.c b/PG-PuReMD/src/grid.c
index 2a423be3b5187b893b6fd99152181dd05da93c11..7f3ef23144dc9b7dd64929b1270dbd348bef79a5 100644
--- a/PG-PuReMD/src/grid.c
+++ b/PG-PuReMD/src/grid.c
@@ -19,15 +19,17 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "grid.h"
+
 #include "allocate.h"
+#include "index_utils.h"
 #include "io_tools.h"
 #include "reset_tools.h"
 #include "tool_box.h"
 #include "vector.h"
 
-#include "index_utils.h"
-
 
 /* determines the exchange boundaries with nbrs in terms of gcells */
 void Mark_GCells( reax_system* system, grid *g, ivec procs, MPI_Comm comm )
@@ -40,28 +42,34 @@ void Mark_GCells( reax_system* system, grid *g, ivec procs, MPI_Comm comm )
 
     /* clear all gcell type info */
     for ( x = 0; x < g->ncells[0]; x++ )
+    {
         for ( y = 0; y < g->ncells[1]; y++ )
+        {
             for ( z = 0; z < g->ncells[2]; z++ )
-                //SUDHIR
-                //g->cells[x][y][z].type = 0;
-                g->cells[ index_grid_3d (x, y, z, g) ].type = 0;
+            {
+                g->cells[ index_grid_3d(x, y, z, g) ].type = 0;
+            }
+        }
+    }
 
     /* mark native cells */
     for ( x = g->native_str[0]; x < g->native_end[0]; x++ )
+    {
         for ( y = g->native_str[1]; y < g->native_end[1]; y++ )
+        {
             for ( z = g->native_str[2]; z < g->native_end[2]; z++ )
             {
-                //SUDHIR
-                //g->cells[x][y][z].type = NATIVE;
-                //ivec_MakeZero( g->cells[x][y][z].rel_box );
-                g->cells[ index_grid_3d (x, y, z, g) ].type = NATIVE;
-                //ivec_MakeZero( g->cells[ index_grid_3d(x, y, z, g) ].rel_box );
+                g->cells[ index_grid_3d(x, y, z, g) ].type = NATIVE;
                 ivec_MakeZero( g->rel_box[ index_grid_3d(x, y, z, g) ]);
             }
+        }
+    }
 
     /* loop over neighbors */
     for ( r[0] = -1; r[0] <= 1; ++r[0])
+    {
         for ( r[1] = -1; r[1] <= 1; ++r[1] )
+        {
             for ( r[2] = -1; r[2] <= 1; ++r[2] )
             {
                 /* determine the width of exchange with nbr_pr */
@@ -73,9 +81,18 @@ void Mark_GCells( reax_system* system, grid *g, ivec procs, MPI_Comm comm )
                 for ( d = 0; d < 3; ++d )
                 {
                     /* determine the periodicity of this neighbor */
-                    if ( nbr_coord[d] < 0 ) prdc[d] = -1;
-                    else if ( nbr_coord[d] >= procs[d] ) prdc[d] = +1;
-                    else prdc[d] = 0;
+                    if ( nbr_coord[d] < 0 )
+                    {
+                        prdc[d] = -1;
+                    }
+                    else if ( nbr_coord[d] >= procs[d] )
+                    {
+                        prdc[d] = +1;
+                    }
+                    else
+                    {
+                        prdc[d] = 0;
+                    }
 
                     /* determine gcells to be sent & recv'd */
                     if ( r[d] == -1 )
@@ -92,7 +109,7 @@ void Mark_GCells( reax_system* system, grid *g, ivec procs, MPI_Comm comm )
                         str_recv[d] = g->native_str[d];
                         end_recv[d] = g->native_end[d];
                     }
-                    else   // r[d] == +1
+                    else
                     {
                         str_send[d] = g->native_end[d] - send_span[d];
                         end_send[d] = g->native_end[d];
@@ -102,13 +119,18 @@ void Mark_GCells( reax_system* system, grid *g, ivec procs, MPI_Comm comm )
                 }
 
                 for ( x = str_recv[0]; x < end_recv[0]; ++x )
+                {
                     for ( y = str_recv[1]; y < end_recv[1]; ++y )
+                    {
                         for ( z = str_recv[2]; z < end_recv[2]; ++z )
-                            //SUDHIR
-                            //ivec_Copy( g->cells[x][y][z].rel_box, prdc );
-                            //ivec_Copy( g->cells[ index_grid_3d(x, y, z, g) ].rel_box, prdc );
+                        {
                             ivec_Copy( g->rel_box[ index_grid_3d(x, y, z, g) ], prdc );
+                        }
+                    }
+                }
             }
+        }
+    }
 }
 
 
@@ -116,22 +138,24 @@ void Mark_GCells( reax_system* system, grid *g, ivec procs, MPI_Comm comm )
    periodic boundary conditions are taken into consideration as well. */
 void Find_Closest_Point( grid *g, ivec c1, ivec c2, rvec closest_point )
 {
-    int  i, d;
+    int i, d;
 
     for ( i = 0; i < 3; i++ )
     {
         d = c2[i] - c1[i];
 
         if ( d > 0 )
-            //SUDHIR
-            //closest_point[i] = g->cells[c2[0]][c2[1]][c2[2]].min[i];
+        {
             closest_point[i] = g->cells[ index_grid_3d(c2[0], c2[1], c2[2], g) ].min[i];
+        }
         else if ( d == 0 )
+        {
             closest_point[i] = NEG_INF - 1.;
+        }
         else
-            //SUDHIR
-            //closest_point[i] = g->cells[c2[0]][c2[1]][c2[2]].max[i];
+        {
             closest_point[i] = g->cells[ index_grid_3d(c2[0], c2[1], c2[2], g) ].max[i];
+        }
     }
 }
 
@@ -143,9 +167,6 @@ void Find_Neighbor_GridCells( grid *g, control_params *control )
     ivec ci, cj, cmin, cmax, span;
     grid_cell *gc;
 
-    //TODO
-    //TODO
-    //TODO
     //TODO
     //fprintf (stderr, " !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
     //fprintf (stderr, " CHANGED TO WORK NEIGHBOR LISTS \n");
@@ -153,65 +174,61 @@ void Find_Neighbor_GridCells( grid *g, control_params *control )
     //fprintf (stderr, " !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
     //fprintf (stderr, " vlist_cut: %f \n", control->vlist_cut);
     //fprintf (stderr, " bond_cut: %f \n", control->bond_cut);
-    //TODO
-    //TODO
-    //TODO
-    //TODO
 
     /* pick up a cell in the grid */
     for ( ci[0] = 0; ci[0] < g->ncells[0]; ci[0]++ )
+    {
         for ( ci[1] = 0; ci[1] < g->ncells[1]; ci[1]++ )
+        {
             for ( ci[2] = 0; ci[2] < g->ncells[2]; ci[2]++ )
             {
-                //SUDHIR
-                //gc = &(g->cells[ci[0]][ci[1]][ci[2]]);
                 gc = &(g->cells[ index_grid_3d(ci[0], ci[1], ci[2], g) ]);
                 top = 0;
                 //fprintf( stderr, "grid1: %d %d %d:\n", ci[0], ci[1], ci[2] );
 
-                //TODO
-                /////////////////////////////////////////////////////////////////
-                //if( gc->type == NATIVE )
-                // gc->cutoff = control->vlist_cut;
-                //else gc->cutoff = control->bond_cut;
-                //gc->cutoff = control->vlist_cut;
-                if (gc->type == NATIVE)
-                    g->cutoff [index_grid_3d (ci[0], ci[1], ci[2], g)] = control->vlist_cut;
+                if ( gc->type == NATIVE )
+                {
+                    g->cutoff[index_grid_3d(ci[0], ci[1], ci[2], g)] = control->vlist_cut;
+                }
                 else
-                    g->cutoff [index_grid_3d (ci[0], ci[1], ci[2], g)] = control->bond_cut;
-
-                /////////////////////////////////////////////////////////////////
-                //TODO
-
+                {
+                    g->cutoff[index_grid_3d(ci[0], ci[1], ci[2], g)] = control->bond_cut;
+                }
 
                 for ( d = 0; d < 3; ++d )
                 {
                     //span[d] = (int)ceil( gc->cutoff / g->cell_len[d] );
-                    span[d] = (int)ceil( control->vlist_cut / g->cell_len[d] );
-                    cmin[d] = MAX(ci[d] - span[d], 0 );
-                    cmax[d] = MIN(ci[d] + span[d] + 1, g->ncells[d] );
+                    span[d] = (int)CEIL( control->vlist_cut / g->cell_len[d] );
+                    cmin[d] = MAX( ci[d] - span[d], 0 );
+                    cmax[d] = MIN( ci[d] + span[d] + 1, g->ncells[d] );
                 }
 
                 /* loop over neighboring gcells */
                 for ( cj[0] = cmin[0]; cj[0] < cmax[0]; ++cj[0] )
+                {
                     for ( cj[1] = cmin[1]; cj[1] < cmax[1]; ++cj[1] )
+                    {
                         for ( cj[2] = cmin[2]; cj[2] < cmax[2]; ++cj[2] )
                         {
                             //fprintf( stderr, "\tgrid2: %d %d %d (%d - %d) - ", cj[0], cj[1], cj[2], top, g->max_nbrs );
                             //SUDHIR
                             //gc->nbrs[top] = &(g->cells[cj[0]][cj[1]][cj[2]]);
                             //gc->nbrs[top] = &(g->cells[ index_grid_3d(cj[0],cj[1],cj[2],g) ]);
-                            ivec_Copy( g->nbrs_x[index_grid_nbrs (ci[0], ci[1], ci[2], top, g)], cj );
-                            //fprintf (stderr, " index: %d - %d \n", index_grid_nbrs (ci[0], ci[1], ci[2], top, g), g->total * g->max_nbrs);
+                            ivec_Copy( g->nbrs_x[index_grid_nbrs(ci[0], ci[1], ci[2], top, g)], cj );
+                            //fprintf( stderr, " index: %d - %d \n", index_grid_nbrs (ci[0], ci[1], ci[2], top, g), g->total * g->max_nbrs );
                             Find_Closest_Point( /*ext_box,*/ g, ci, cj, g->nbrs_cp[index_grid_nbrs (ci[0], ci[1], ci[2], top, g)] );
                             //fprintf( stderr, "cp: %f %f %f\n",
                             //       gc->nbrs_cp[top][0], gc->nbrs_cp[top][1],
                             //       gc->nbrs_cp[top][2] );
                             ++top;
                         }
+                    }
+                }
                 //gc->nbrs[top] = NULL;
                 //fprintf( stderr, "top=%d\n", top );
             }
+        }
+    }
 }
 
 
@@ -220,26 +237,36 @@ void Reorder_GridCells( grid *g )
     int i, j, k, x, y, z, top;
     ivec dblock, nblocks;
 
-    dblock[0] = 1; //3; //4; //(int)(ceil( sqrt(g->ncells[0]) ));
-    dblock[1] = 1; //3; //4; //(int)(ceil( sqrt(g->ncells[1]) ));
-    dblock[2] = 1; //3; //4; //(int)(ceil( sqrt(g->ncells[2]) ));
-    nblocks[0] = (int)(ceil( (real)g->ncells[0] / dblock[0] ));
-    nblocks[1] = (int)(ceil( (real)g->ncells[1] / dblock[1] ));
-    nblocks[2] = (int)(ceil( (real)g->ncells[2] / dblock[2] ));
+    dblock[0] = 1; //3; //4; //(int)(CEIL( SQRT(g->ncells[0]) ));
+    dblock[1] = 1; //3; //4; //(int)(CEIL( SQRT(g->ncells[1]) ));
+    dblock[2] = 1; //3; //4; //(int)(CEIL( SQRT(g->ncells[2]) ));
+    nblocks[0] = (int)(CEIL( (real)g->ncells[0] / dblock[0] ));
+    nblocks[1] = (int)(CEIL( (real)g->ncells[1] / dblock[1] ));
+    nblocks[2] = (int)(CEIL( (real)g->ncells[2] / dblock[2] ));
 
     top = 0;
     for ( i = 0; i < nblocks[0]; ++i )
+    {
         for ( j = 0; j < nblocks[1]; ++j )
+        {
             for ( k = 0; k < nblocks[2]; ++k )
-                for ( x = i * dblock[0]; x < MIN((i + 1)*dblock[0], g->ncells[0]); ++x )
-                    for ( y = j * dblock[1]; y < MIN((j + 1)*dblock[1], g->ncells[1]); ++y )
-                        for ( z = k * dblock[2]; z < MIN((k + 1)*dblock[2], g->ncells[2]); ++z )
+            {
+                for ( x = i * dblock[0]; x < MIN((i + 1) * dblock[0], g->ncells[0]); ++x )
+                {
+                    for ( y = j * dblock[1]; y < MIN((j + 1) * dblock[1], g->ncells[1]); ++y )
+                    {
+                        for ( z = k * dblock[2]; z < MIN((k + 1) * dblock[2], g->ncells[2]); ++z )
                         {
                             g->order[top][0] = x;
                             g->order[top][1] = y;
                             g->order[top][2] = z;
                             ++top;
                         }
+                    }
+                }
+            }
+        }
+    }
 
 #if defined(DEBUG)
     fprintf( stderr, "reorder_gcells: total_gcells=%d top=%d\n", g->total, top );
@@ -247,23 +274,26 @@ void Reorder_GridCells( grid *g )
     fprintf( stderr, "nblocks: %d %d %d\n", nblocks[0], nblocks[1], nblocks[2] );
     fprintf( stderr, "reordered gcells:\n" );
     for ( i = 0; i < top; ++i )
+    {
         fprintf( stderr, "order%d: %d %d %d\n",
                  i, g->order[i][0], g->order[i][1], g->order[i][2] );
+    }
 #endif
 }
 
 
 void Setup_New_Grid( reax_system* system, control_params* control,
-                     MPI_Comm comm )
+        MPI_Comm comm )
 {
-    int              d, i, j, k;
-    grid            *g;
-    simulation_box  *my_box, *my_ext_box;
+    int d, i, j, k;
+    grid *g;
+    simulation_box *my_box, *my_ext_box;
     boundary_cutoff *bc;
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: setup new grid\n", system->my_rank );
 #endif
+
     g = &( system->my_grid );
     my_box = &( system->my_box );
     my_ext_box = &( system->my_ext_box );
@@ -273,8 +303,11 @@ void Setup_New_Grid( reax_system* system, control_params* control,
     for ( d = 0; d < 3; ++d )
     {
         /* estimate the number of native cells */
-        g->native_cells[d] = (int)(my_box->box_norms[d] / (control->vlist_cut / 2));
-        if ( g->native_cells[d] == 0 ) g->native_cells[d] = 1;
+        g->native_cells[d] = (int)(my_box->box_norms[d] / (control->vlist_cut / 2.0));
+        if ( g->native_cells[d] == 0 )
+        {
+            g->native_cells[d] = 1;
+        }
     }
 
     /* cell lengths */
@@ -284,24 +317,28 @@ void Setup_New_Grid( reax_system* system, control_params* control,
     for ( d = 0; d < 3; ++d )
     {
         /* # of surrounding grid cells to look into for nonbonded & bonded nbrs */
-        g->vlist_span[d] = (int)ceil( control->vlist_cut / g->cell_len[d] );
-        g->nonb_span[d] = (int)ceil( control->nonb_cut / g->cell_len[d] );
-        g->bond_span[d] = (int)ceil( control->bond_cut / g->cell_len[d] );
+        g->vlist_span[d] = (int)CEIL( control->vlist_cut / g->cell_len[d] );
+        g->nonb_span[d] = (int)CEIL( control->nonb_cut / g->cell_len[d] );
+        g->bond_span[d] = (int)CEIL( control->bond_cut / g->cell_len[d] );
         /* span of the ghost region in terms of gcells */
-        g->ghost_span[d] = (int)ceil(system->bndry_cuts.ghost_cutoff /
-                                     g->cell_len[d]);
-        g->ghost_nonb_span[d] = (int)ceil(system->bndry_cuts.ghost_nonb /
-                                          g->cell_len[d]);
-        g->ghost_hbond_span[d] = (int)ceil( system->bndry_cuts.ghost_hbond /
-                                            g->cell_len[d] );
-        g->ghost_bond_span[d] = (int)ceil( system->bndry_cuts.ghost_bond /
-                                           g->cell_len[d] );
+        g->ghost_span[d] = (int)CEIL(
+                system->bndry_cuts.ghost_cutoff / g->cell_len[d] );
+        g->ghost_nonb_span[d] = (int)CEIL(
+                system->bndry_cuts.ghost_nonb / g->cell_len[d] );
+        g->ghost_hbond_span[d] = (int)CEIL(
+                system->bndry_cuts.ghost_hbond / g->cell_len[d] );
+        g->ghost_bond_span[d] = (int)CEIL(
+                system->bndry_cuts.ghost_bond / g->cell_len[d] );
     }
 
     /* total number of grid cells */
     ivec_ScaledSum( g->ncells, 1, g->native_cells, 2, g->ghost_span );
     g->total = g->ncells[0] * g->ncells[1] * g->ncells[2];
-    //fprintf (stderr, " dimensions (%d, %d, %d) \n", g->ncells[0], g->ncells[1], g->ncells[2]);
+
+#if defined(DEBUG)
+    fprintf( stderr, " dimensions (%d, %d, %d) \n",
+            g->ncells[0], g->ncells[1], g->ncells[2] );
+#endif
 
     /* native cell start & ends */
     ivec_Copy( g->native_str, g->ghost_span );
@@ -310,26 +347,19 @@ void Setup_New_Grid( reax_system* system, control_params* control,
     /* upper bound on the number of gcells to be exchanged with a single nbr */
     system->gcell_cap =
         MAX3( g->native_cells[0] * g->native_cells[1] * g->ghost_span[2],
-              g->native_cells[0] * g->native_cells[2] * g->ghost_span[1],
-              g->native_cells[1] * g->native_cells[2] * g->ghost_span[0] ) + 1;
+                g->native_cells[0] * g->native_cells[2] * g->ghost_span[1],
+                g->native_cells[1] * g->native_cells[2] * g->ghost_span[0] ) + 1;
 
     /* allocate grid space */
     Allocate_Grid( system, comm );
 
     /* compute min and max coords for each grid cell */
     for ( i = 0; i < g->ncells[0]; i++ )
+    {
         for ( j = 0; j < g->ncells[1]; j++ )
+        {
             for ( k = 0; k < g->ncells[2]; k++ )
             {
-                /*
-                g->cells[i][j][k].min[0] = my_ext_box->min[0] + i * g->cell_len[0];
-                g->cells[i][j][k].min[1] = my_ext_box->min[1] + j * g->cell_len[1];
-                g->cells[i][j][k].min[2] = my_ext_box->min[2] + k * g->cell_len[2];
-
-                g->cells[i][j][k].max[0] = my_ext_box->min[0] + (i+1)*g->cell_len[0];
-                g->cells[i][j][k].max[1] = my_ext_box->min[1] + (j+1)*g->cell_len[1];
-                g->cells[i][j][k].max[2] = my_ext_box->min[2] + (k+1)*g->cell_len[2];
-                */
                 g->cells[ index_grid_3d(i, j, k, g) ].min[0] = my_ext_box->min[0] + i * g->cell_len[0];
                 g->cells[ index_grid_3d(i, j, k, g) ].min[1] = my_ext_box->min[1] + j * g->cell_len[1];
                 g->cells[ index_grid_3d(i, j, k, g) ].min[2] = my_ext_box->min[2] + k * g->cell_len[2];
@@ -338,6 +368,8 @@ void Setup_New_Grid( reax_system* system, control_params* control,
                 g->cells[ index_grid_3d(i, j, k, g) ].max[1] = my_ext_box->min[1] + (j + 1) * g->cell_len[1];
                 g->cells[ index_grid_3d(i, j, k, g) ].max[2] = my_ext_box->min[2] + (k + 1) * g->cell_len[2];
             }
+        }
+    }
 
     /* determine the exchange boundaries with nbrs in terms of gcells */
     Mark_GCells( system, g, control->procs_by_dim, comm );
@@ -349,10 +381,9 @@ void Setup_New_Grid( reax_system* system, control_params* control,
 }
 
 
-
 void Update_Grid( reax_system* system, control_params* control, MPI_Comm comm )
 {
-    int  d, i, j, k, itr;
+    int d, i, j, k, itr;
     ivec ci, native_cells, nonb_span, bond_span;
     ivec ghost_span, ghost_nonb_span, ghost_bond_span, ghost_hbond_span;;
     rvec cell_len, inv_len;
@@ -372,7 +403,10 @@ void Update_Grid( reax_system* system, control_params* control, MPI_Comm comm )
     {
         /* estimate the number of native cells */
         native_cells[d] = (int)(my_box->box_norms[d] / (control->vlist_cut / 2));
-        if ( native_cells[d] == 0 ) native_cells[d] = 1;
+        if ( native_cells[d] == 0 )
+        {
+            native_cells[d] = 1;
+        }
     }
 
     /* cell lengths */
@@ -382,20 +416,22 @@ void Update_Grid( reax_system* system, control_params* control, MPI_Comm comm )
     for ( d = 0; d < 3; ++d )
     {
         /* # of surrounding grid cells to look into for nonbonded & bonded nbrs */
-        nonb_span[d] = (int)ceil( control->nonb_cut / cell_len[d] );
-        bond_span[d] = (int)ceil( control->bond_cut / cell_len[d] );
+        nonb_span[d] = (int)CEIL( control->nonb_cut / cell_len[d] );
+        bond_span[d] = (int)CEIL( control->bond_cut / cell_len[d] );
         /* span of the ghost region in terms of gcells */
-        ghost_span[d] = (int)ceil(system->bndry_cuts.ghost_cutoff / cell_len[d]);
-        ghost_nonb_span[d] = (int)ceil(system->bndry_cuts.ghost_nonb / cell_len[d]);
-        ghost_hbond_span[d] = (int)ceil( system->bndry_cuts.ghost_hbond /
-                                         cell_len[d] );
-        ghost_bond_span[d] = (int)ceil( system->bndry_cuts.ghost_bond /
-                                        cell_len[d] );
+        ghost_span[d] = (int)CEIL(system->bndry_cuts.ghost_cutoff /
+                cell_len[d]);
+        ghost_nonb_span[d] = (int)CEIL(system->bndry_cuts.ghost_nonb /
+                cell_len[d]);
+        ghost_hbond_span[d] = (int)CEIL( system->bndry_cuts.ghost_hbond /
+                cell_len[d] );
+        ghost_bond_span[d] = (int)CEIL( system->bndry_cuts.ghost_bond /
+                cell_len[d] );
     }
 
-
+    /* gcells are unchanged */
     if ( ivec_isEqual( native_cells, g->native_cells ) &&
-            ivec_isEqual( ghost_span, g->ghost_span ) )   // gcells are unchanged
+            ivec_isEqual( ghost_span, g->ghost_span ) )
     {
         /* update cell lengths */
         rvec_Copy( g->cell_len, cell_len );
@@ -403,80 +439,87 @@ void Update_Grid( reax_system* system, control_params* control, MPI_Comm comm )
 
         /* compute min and max coords for each grid cell */
         for ( i = 0; i < g->ncells[0]; i++ )
+        {
             for ( j = 0; j < g->ncells[1]; j++ )
+            {
                 for ( k = 0; k < g->ncells[2]; k++ )
                 {
-                    /*
-                      g->cells[i][j][k].min[0] = my_ext_box->min[0] + i * g->cell_len[0];
-                      g->cells[i][j][k].min[1] = my_ext_box->min[1] + j * g->cell_len[1];
-                      g->cells[i][j][k].min[2] = my_ext_box->min[2] + k * g->cell_len[2];
-
-                      g->cells[i][j][k].max[0] = my_ext_box->min[0]+(i+1)*g->cell_len[0];
-                      g->cells[i][j][k].max[1] = my_ext_box->min[1]+(j+1)*g->cell_len[1];
-                      g->cells[i][j][k].max[2] = my_ext_box->min[2]+(k+1)*g->cell_len[2];
-                      */
-                    g->cells[ index_grid_3d(i, j, k, g) ].min[0] = my_ext_box->min[0] + i * g->cell_len[0];
-                    g->cells[ index_grid_3d(i, j, k, g) ].min[1] = my_ext_box->min[1] + j * g->cell_len[1];
-                    g->cells[ index_grid_3d(i, j, k, g) ].min[2] = my_ext_box->min[2] + k * g->cell_len[2];
-
-                    g->cells[ index_grid_3d(i, j, k, g) ].max[0] = my_ext_box->min[0] + (i + 1) * g->cell_len[0];
-                    g->cells[ index_grid_3d(i, j, k, g) ].max[1] = my_ext_box->min[1] + (j + 1) * g->cell_len[1];
-                    g->cells[ index_grid_3d(i, j, k, g) ].max[2] = my_ext_box->min[2] + (k + 1) * g->cell_len[2];
+                    g->cells[ index_grid_3d(i, j, k, g) ].min[0] =
+                        my_ext_box->min[0] + i * g->cell_len[0];
+                    g->cells[ index_grid_3d(i, j, k, g) ].min[1] =
+                        my_ext_box->min[1] + j * g->cell_len[1];
+                    g->cells[ index_grid_3d(i, j, k, g) ].min[2] =
+                        my_ext_box->min[2] + k * g->cell_len[2];
+
+                    g->cells[ index_grid_3d(i, j, k, g) ].max[0] =
+                        my_ext_box->min[0] + (i + 1) * g->cell_len[0];
+                    g->cells[ index_grid_3d(i, j, k, g) ].max[1] =
+                        my_ext_box->min[1] + (j + 1) * g->cell_len[1];
+                    g->cells[ index_grid_3d(i, j, k, g) ].max[2] =
+                        my_ext_box->min[2] + (k + 1) * g->cell_len[2];
                 }
+            }
+        }
 
         /* pick up a cell in the grid */
         for ( ci[0] = 0; ci[0] < g->ncells[0]; ci[0]++ )
+        {
             for ( ci[1] = 0; ci[1] < g->ncells[1]; ci[1]++ )
+            {
                 for ( ci[2] = 0; ci[2] < g->ncells[2]; ci[2]++ )
                 {
-                    //SUDHIR
-                    //gc = &(g->cells[ci[0]][ci[1]][ci[2]]);
-                    gc = &(g->cells[ index_grid_3d (ci[0], ci[1], ci[2], g) ]);
+                    gc = &(g->cells[ index_grid_3d(ci[0], ci[1], ci[2], g) ]);
 
                     itr = 0;
                     //while( g->nbrs[itr] != NULL ) {
                     while ( g->nbrs_x[itr][0] >= 0 )
                     {
-                        //Find_Closest_Point( g, ci, gc->nbrs_x[itr], gc->nbrs_cp[itr] );
-                        Find_Closest_Point( g, ci, g->nbrs_x[index_grid_nbrs (ci[0], ci[1], ci[2], itr, g)],
-                                            g->nbrs_cp[index_grid_nbrs (ci[0], ci[1], ci[2], itr, g)] );
+                        Find_Closest_Point( g, ci,
+                                g->nbrs_x[index_grid_nbrs(ci[0], ci[1], ci[2], itr, g)],
+                                g->nbrs_cp[index_grid_nbrs(ci[0], ci[1], ci[2], itr, g)] );
                         ++itr;
                     }
                 }
+            }
+        }
     }
-    else   // the grid has changed!
+    /* the grid has changed! */
+    else
     {
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: whole grid is being updated\n", system->my_rank );
 #endif
+
         Deallocate_Grid( g );
         Setup_New_Grid( system, control, comm );
     }
 }
 
 
-/* bin my atoms into grid cells */
+/* Bin my (native) atoms into grid cells */
 void Bin_My_Atoms( reax_system *system, reallocate_data *realloc )
 {
-    int  i, j, k, l, d, max_atoms;
+    int i, j, k, l, d, max_atoms;
     ivec c;
     simulation_box *big_box, *my_box, *my_ext_box;
-    grid  *g;
+    grid *g;
     grid_cell *gc;
     reax_atom *atoms;
 
-    big_box    = &(system->big_box);
+    big_box = &(system->big_box);
     my_ext_box = &(system->my_ext_box);
-    my_box     = &(system->my_box);
-    g          = &(system->my_grid);
-    atoms      = system->my_atoms;
+    my_box = &(system->my_box);
+    g = &(system->my_grid);
+    atoms = system->my_atoms;
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d bin_my_atoms: entered\n", system->my_rank );
 #endif
+
     Reset_Grid( g );
 
     for ( l = 0; l < system->n; l++ )
+    {
         // outgoing atoms are marked with orig_id = -1
         if ( atoms[l].orig_id >= 0 )
         {
@@ -489,33 +532,38 @@ void Bin_My_Atoms( reax_system *system, reallocate_data *realloc )
                 if ( atoms[l].x[d] < my_box->min[d] || atoms[l].x[d] > my_box->max[d] )
                 {
                     fprintf( stderr, "p%d: local atom%d [%f %f %f] is out of my box!\n",
-                             system->my_rank, l,
-                             atoms[l].x[0], atoms[l].x[1], atoms[l].x[2] );
+                            system->my_rank, l,
+                            atoms[l].x[0], atoms[l].x[1], atoms[l].x[2] );
                     fprintf( stderr, "p%d: my_box=[%f-%f, %f-%f, %f-%f]\n",
-                             system->my_rank, my_box->min[0], my_box->max[0],
-                             my_box->min[1], my_box->max[1],
-                             my_box->min[2], my_box->max[2] );
-                    MPI_Abort( MPI_COMM_WORLD, -1 );
+                            system->my_rank, my_box->min[0], my_box->max[0],
+                            my_box->min[1], my_box->max[1],
+                            my_box->min[2], my_box->max[2] );
+                    MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
                 }
 
                 c[d] = (int)((atoms[l].x[d] - my_ext_box->min[d]) * g->inv_len[d]);
                 if ( c[d] >= g->native_end[d] )
+                {
                     c[d] = g->native_end[d] - 1;
+                }
                 else if ( c[d] < g->native_str[d] )
+                {
                     c[d] = g->native_str[d];
+                }
             }
+
 #if defined(DEBUG)
             fprintf( stderr, "p%d bin_my_atoms: l:%d - atom%d @ %.5f %.5f %.5f"\
-                     "--> cell: %d %d %d\n",
-                     system->my_rank, l, atoms[l].orig_id,
-                     atoms[l].x[0], atoms[l].x[1], atoms[l].x[2],
-                     c[0], c[1], c[2] );
+                    "--> cell: %d %d %d\n",
+                    system->my_rank, l, atoms[l].orig_id,
+                    atoms[l].x[0], atoms[l].x[1], atoms[l].x[2],
+                    c[0], c[1], c[2] );
 #endif
-            //SUDHIR
-            //gc = &( g->cells[c[0]][c[1]][c[2]] );
+
             gc = &( g->cells[ index_grid_3d(c[0], c[1], c[2], g) ] );
             gc->atoms[ gc->top++ ] = l;
         }
+    }
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d bin_my_atoms: sorted atoms\n", system->my_rank );
@@ -523,26 +571,31 @@ void Bin_My_Atoms( reax_system *system, reallocate_data *realloc )
 
     max_atoms = 0;
     for ( i = g->native_str[0]; i < g->native_end[0]; i++ )
+    {
         for ( j = g->native_str[1]; j < g->native_end[1]; j++ )
+        {
             for ( k = g->native_str[2]; k < g->native_end[2]; k++ )
             {
-                //SUDHIR
-                //gc = &(g->cells[i][j][k]);
                 gc = &(g->cells[ index_grid_3d(i, j, k, g) ]);
                 if ( max_atoms < gc->top )
+                {
                     max_atoms = gc->top;
+                }
+
 #if defined(DEBUG)
                 fprintf( stderr, "p%d gc[%d,%d,%d]->top=%d\n",
                          system->my_rank, i, j, k, gc->top );
 #endif
             }
+        }
+    }
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d max_atoms=%d, g->max_atoms=%d\n",
-             system->my_rank, max_atoms, g->max_atoms );
+            system->my_rank, max_atoms, g->max_atoms );
 #endif
+
     /* check if current gcell->max_atoms is safe */
-    //fprintf (stderr, "*********** grid max_atoms: %d \n", g->max_atoms );
     if ( max_atoms >= g->max_atoms * DANGER_ZONE )
     {
         realloc->gcell_atoms = MAX( max_atoms * SAFE_ZONE, MIN_GCELL_POPL );
@@ -560,40 +613,32 @@ void Bin_My_Atoms( reax_system *system, reallocate_data *realloc )
 }
 
 
-
-/* reorder atoms falling into the same gcell together in the atom list */
+/* Reorder atoms falling into the same gcell together in the atom list */
 void Reorder_My_Atoms( reax_system *system, storage *workspace )
 {
-    int        i, l, x, y, z;
-    int        top, old_id;
-    grid      *g;
+    int i, l, x, y, z;
+    int top, old_id;
+    grid *g;
     grid_cell *gc;
     reax_atom *old_atom, *new_atoms;
 
     /* allocate storage space for est_N */
-    new_atoms = (reax_atom*) malloc( system->total_cap * sizeof(reax_atom) );
+    new_atoms = (reax_atom*) smalloc( system->total_cap * sizeof(reax_atom), "new_atoms" );
     top = 0;
     g = &( system->my_grid );
 
-    //for( i = 0; i < g->ncells[0]; i++ )
-    //for( j = 0; j < g->ncells[1]; j++ )
-    //  for( k = 0; k < g->ncells[2]; k++ ) {
     for ( i = 0; i < g->total; ++i )
     {
         x = g->order[i][0];
         y = g->order[i][1];
         z = g->order[i][2];
-        //SUDHIR
-        //gc = &( g->cells[x][y][z] );
-        gc = &( g->cells[ index_grid_3d (x, y, z, g) ] );
+        gc = &( g->cells[ index_grid_3d(x, y, z, g) ] );
         g->str[index_grid_3d(x, y, z, g)] = top;
 
         for ( l = 0; l < gc->top; ++l )
         {
-            old_id   = gc->atoms[l];
+            old_id = gc->atoms[l];
             old_atom = &( system->my_atoms[old_id] );
-            //fprintf( stderr, "%d <-- %d\n", top, old_id );
-            //reax_atom_Copy( &(new_atoms[top]), old_atom );
             memcpy( new_atoms + top, old_atom, sizeof(reax_atom) );
             new_atoms[top].imprt_id = -1;
             ++top;
@@ -602,16 +647,55 @@ void Reorder_My_Atoms( reax_system *system, storage *workspace )
     }
 
     /* deallocate old storage */
-    free( system->my_atoms );
+    sfree( system->my_atoms, "system->my_atoms" );
+
     /* start using clustered storages */
     system->my_atoms = new_atoms;
     system->n = top;
     system->N = system->n;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d: g->total = %d\n", 
+            system->my_rank, g->total );
+    fprintf( stderr, "p%d: g->ncells[0] = %d, g->ncells[1] = %d, g->ncells[2] = %d\n", 
+            system->my_rank, g->ncells[0], g->ncells[1], g->ncells[2] );
+    fflush( stderr );
+    for ( i = 0; i < g->total; ++i )
+    {
+        x = g->order[i][0];
+        y = g->order[i][1];
+        z = g->order[i][2];
+        gc = &( g->cells[ index_grid_3d(x, y, z, g) ] );
+
+        fprintf( stderr, "p%d: x = %6d, y = %6d, z = %6d\n",
+                system->my_rank, x, y, z );
+        fprintf( stderr, "p%d: index_grid_3d = %d\n",
+                system->my_rank, index_grid_3d(x, y, z, g) );
+        fprintf( stderr, "p%d: i = %6d, g->start[%6d] = %6d, g->end[%6d] = %6d\n",
+                system->my_rank, i, i, g->str[index_grid_3d(x, y, z, g)],
+                i, g->end[index_grid_3d(x, y, z, g)] );
+        fflush( stderr );
+        for ( l = g->str[index_grid_3d(x, y, z, g)]; l < g->end[index_grid_3d(x, y, z, g)]; ++l )
+        {
+            fprintf( stderr, "p%d: atom %6d: x = %10.4f, y = %10.4f, z = %10.4f\n",
+                    system->my_rank, system->my_atoms[l].orig_id,
+                    system->my_atoms[l].x[0],
+                    system->my_atoms[l].x[1],
+                    system->my_atoms[l].x[2] );
+            fflush( stderr );
+        }
+    }
+
+    fprintf( stderr, "p%d: DONE REORDERING BINNED ATOMS\n",
+            system->my_rank );
+    fflush( stderr );
+#endif
 }
 
 
+/* Determine the grid cell which a boundary atom falls within */
 void Get_Boundary_GCell( grid *g, rvec base, rvec x, grid_cell **gc,
-                         rvec *cur_min, rvec *cur_max, ivec gcell_cood )
+        rvec *cur_min, rvec *cur_max, ivec gcell_cood )
 {
     int d;
     ivec c;
@@ -620,50 +704,62 @@ void Get_Boundary_GCell( grid *g, rvec base, rvec x, grid_cell **gc,
     for ( d = 0; d < 3; ++d )
     {
         c[d] = (int)((x[d] - base[d]) * g->inv_len[d]);
-        if ( c[d] < 0 ) c[d] = 0;
+        if ( c[d] < 0 )
+        {
+            c[d] = 0;
+        }
         //else if( c[d] == g->native_str[d] ) --c[d];
         //else if( c[d] == g->native_end[d] - 1 ) ++c[d];
-        else if ( c[d] >= g->ncells[d] ) c[d] = g->ncells[d] - 1;
+        else if ( c[d] >= g->ncells[d] )
+        {
+            c[d] = g->ncells[d] - 1;
+        }
     }
+
 #if defined(DEBUG)
     fprintf( stderr, "get_bndry_gc: base=[%f %f %f] x=[%f %f %f] c=[%d %d %d]\n",
-             base[0], base[1], base[2], x[0], x[1], x[2], c[0], c[1], c[2] );
+            base[0], base[1], base[2], x[0], x[1], x[2], c[0], c[1], c[2] );
 #endif
 
-    ivec_Copy (gcell_cood, c);
+    ivec_Copy( gcell_cood, c );
 
-    //SUDHIR
-    //*gc = &( g->cells[c[0]][c[1]][c[2]] );
     *gc = &( g->cells[ index_grid_3d(c[0], c[1], c[2], g) ] );
-    rvec_ScaledSum( *cur_min, 1, (*gc)->min, -1, loosen );
+    rvec_ScaledSum( *cur_min, 1.0, (*gc)->min, -1.0, loosen );
     rvec_Sum( *cur_max, (*gc)->max, loosen );
+
 #if defined(DEBUG)
     fprintf( stderr, "get_bndry_gc: gcmin=[%f %f %f] gcmax=[%f %f %f]\n",
-             (*gc)->min[0], (*gc)->min[1], (*gc)->min[2],
-             (*gc)->max[0], (*gc)->max[1], (*gc)->max[2] );
+            (*gc)->min[0], (*gc)->min[1], (*gc)->min[2],
+            (*gc)->max[0], (*gc)->max[1], (*gc)->max[2] );
     fprintf( stderr, "get_bndry_gc: curmin=[%f %f %f] curmax=[%f %f %f]\n",
-             (*cur_min)[0], (*cur_min)[1], (*cur_min)[2],
-             (*cur_max)[0], (*cur_max)[1], (*cur_max)[2] );
+            (*cur_min)[0], (*cur_min)[1], (*cur_min)[2],
+            (*cur_max)[0], (*cur_max)[1], (*cur_max)[2] );
 #endif
 }
 
 
+/* Check if the current atom position falls within the
+ * boundaries of a grid cell */
 int is_Within_GCell( rvec x, rvec cur_min, rvec cur_max )
 {
     int d;
 
     for ( d = 0; d < 3; ++d )
+    {
         if ( x[d] < cur_min[d] || x[d] > cur_max[d] )
-            return 0;
+        {
+            return FALSE;
+        }
+    }
 
-    return 1;
+    return TRUE;
 }
 
 
-/* bin my atoms into grid cells */
+/* Bin my boundary atoms into grid cells */
 void Bin_Boundary_Atoms( reax_system *system )
 {
-    int  i, start, end;
+    int i, start, end;
     rvec base, cur_min, cur_max;
     grid *g;
     grid_cell *gc;
@@ -672,50 +768,55 @@ void Bin_Boundary_Atoms( reax_system *system )
     ivec gcell_cood;
 
 #if defined(DEBUG)
-    fprintf( stderr, "p%d bin_boundary_atoms: entered with start: %d, end: %d\n", system->my_rank, system->n, system->N );
+    fprintf( stderr, "p%d bin_boundary_atoms: entered with start: %d, end: %d\n",
+            system->my_rank, system->n, system->N );
 #endif
+
     g = &(system->my_grid);
     atoms = system->my_atoms;
     start = system->n;
     end = system->N;
     if ( start == end )
+    {
         return;
+    }
 
     ext_box = &(system->my_ext_box);
     memcpy( base, ext_box->min, sizeof(rvec) );
 
     Get_Boundary_GCell( g, base, atoms[start].x, &gc, &cur_min, &cur_max, gcell_cood );
-    g->str[index_grid_3d (gcell_cood[0], gcell_cood[1], gcell_cood[2], g)] = start;
+    g->str[index_grid_3d( gcell_cood[0], gcell_cood[1], gcell_cood[2], g )] = start;
     gc->top = 1;
+
     /* error check */
-    if ( !is_Within_GCell( atoms[start].x, ext_box->min, ext_box->max ) )
+    if ( is_Within_GCell( atoms[start].x, ext_box->min, ext_box->max ) == FALSE )
     {
-        fprintf( stderr, "p%d: (start):ghost atom%d [%f %f %f] is out of my box!\n",
+        fprintf( stderr, "p%d: (start):ghost atom%d [%f %f %f] is out of my (grid cell) box!\n",
                  system->my_rank, start,
                  atoms[start].x[0], atoms[start].x[1], atoms[start].x[2] );
-        //MPI_Abort( MPI_COMM_WORLD, -1 );
+        MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
     }
 
     for ( i = start + 1; i < end; i++ )
     {
         /* error check */
-        //if(atoms[i].x[0]<ext_box->min[0] || atoms[i].x[0]>ext_box->max[0] ||
-        // atoms[i].x[1]<ext_box->min[1] || atoms[i].x[1]>ext_box->max[1] ||
-        // atoms[i].x[2]<ext_box->min[2] || atoms[i].x[2]>ext_box->max[2] ){
-        if ( !is_Within_GCell( atoms[i].x, ext_box->min, ext_box->max ) )
+        if ( is_Within_GCell( atoms[i].x, ext_box->min, ext_box->max ) == FALSE )
         {
-            fprintf( stderr, "p%d: (middle )ghost atom%d [%f %f %f] is out of my box!\n",
-                     system->my_rank, i,
-                     atoms[i].x[0], atoms[i].x[1], atoms[i].x[2] );
-            //MPI_Abort( MPI_COMM_WORLD, -1 );
+            fprintf( stderr, "p%d: (middle) ghost atom%d [%f %f %f] is out of my (grid cell) box!\n",
+                    system->my_rank, i,
+                    atoms[i].x[0], atoms[i].x[1], atoms[i].x[2] );
+            MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
         }
 
-        if ( is_Within_GCell( atoms[i].x, cur_min, cur_max ) )
+        if ( is_Within_GCell( atoms[i].x, cur_min, cur_max ) == TRUE )
+        {
             ++gc->top;
+        }
         else
         {
-            g->end[index_grid_3d (gcell_cood[0], gcell_cood[1], gcell_cood[2], g)] = i;
+            g->end[index_grid_3d( gcell_cood[0], gcell_cood[1], gcell_cood[2], g )] = i;
             Get_Boundary_GCell( g, base, atoms[i].x, &gc, &cur_min, &cur_max, gcell_cood );
+
             /* sanity check! */
             if ( gc->top != 0 )
             {
@@ -725,19 +826,18 @@ void Bin_Boundary_Atoms( reax_system *system )
                          atoms[i].x[0], atoms[i].x[1], atoms[i].x[2],
                          gc->min[0], gc->min[1], gc->min[2],
                          gc->max[0], gc->max[1], gc->max[2] );
-                //MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+                MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
             }
-            g->str[index_grid_3d (gcell_cood[0], gcell_cood[1], gcell_cood[2], g)] = i;
+
+            g->str[index_grid_3d( gcell_cood[0], gcell_cood[1], gcell_cood[2], g )] = i;
             gc->top = 1;
         }
     }
 
     /* mark last gcell's end position */
-    g->end[index_grid_3d (gcell_cood[0], gcell_cood[1], gcell_cood[2], g)] = i;
+    g->end[index_grid_3d( gcell_cood[0], gcell_cood[1], gcell_cood[2], g )] = i;
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d bin_boundary_atoms: done\n", system->my_rank );
 #endif
-
-    //MPI_Abort( MPI_COMM_WORLD, -1 );
 }
diff --git a/PG-PuReMD/src/grid.h b/PG-PuReMD/src/grid.h
index ad51e699182a9b49e1de6b488605ec4e48179cd8..cb124da7b902a1d62864f2a801782875644cb73b 100644
--- a/PG-PuReMD/src/grid.h
+++ b/PG-PuReMD/src/grid.h
@@ -24,10 +24,24 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Setup_New_Grid( reax_system*, control_params*, MPI_Comm );
+
 void Update_Grid( reax_system*, control_params*, MPI_Comm );
+
 void Bin_My_Atoms( reax_system*, reallocate_data* );
+
 void Reorder_My_Atoms( reax_system*, storage* );
+
 void Bin_Boundary_Atoms( reax_system* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/hydrogen_bonds.c b/PG-PuReMD/src/hydrogen_bonds.c
index 493379ce7d4c2f903ef1bb51c1914765bf1c9a67..dfd7abac747a8c7e78d48d47a63aa9600af1d379 100644
--- a/PG-PuReMD/src/hydrogen_bonds.c
+++ b/PG-PuReMD/src/hydrogen_bonds.c
@@ -20,41 +20,41 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
-#include "hydrogen_bonds.h"
-#include "bond_orders.h"
-#include "list.h"
-#include "valence_angles.h"
-#include "vector.h"
+  #include "hydrogen_bonds.h"
+  #include "bond_orders.h"
+  #include "list.h"
+  #include "valence_angles.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_hydrogen_bonds.h"
-#include "reax_bond_orders.h"
-#include "reax_list.h"
-#include "reax_valence_angles.h"
-#include "reax_vector.h"
+  #include "reax_hydrogen_bonds.h"
+  #include "reax_bond_orders.h"
+  #include "reax_list.h"
+  #include "reax_valence_angles.h"
+  #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
+
 // DANIEL
 // This function is taken straight from PuReMD, with minimal changes to accomodate the new datastructures
 // Attempting to fix ehb being way off in MPI_Not_GPU
-
 void Hydrogen_Bonds( reax_system *system, control_params *control,
-                     simulation_data *data, storage *workspace,
-                     reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
-    int  i, j, k, pi, pk;
-    int  type_i, type_j, type_k;
-    int  start_j, end_j, hb_start_j, hb_end_j;
-    int  hblist[MAX_BONDS];
-    int  itr, top;
-    int  num_hb_intrs = 0;
+    int i, j, k, pi, pk;
+    int type_i, type_j, type_k;
+    int start_j, end_j, hb_start_j, hb_end_j;
+    int hblist[MAX_BONDS];
+    int itr, top;
     ivec rel_jk;
     real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
     real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
     rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
     rvec dvec_jk, force, ext_press;
-    // rtensor temp_rtensor, total_rtensor;
     hbond_parameters *hbp;
     bond_order_data *bo_ij;
     bond_data *pbond_ij;
@@ -62,6 +62,9 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
     reax_list *bonds, *hbonds;
     bond_data *bond_list;
     hbond_data *hbond_list;
+#if defined(DEBUG)
+    int num_hb_intrs = 0;
+#endif
     
     bonds = (*lists) + BONDS;
     bond_list = bonds->select.bond_list;
@@ -69,22 +72,24 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
     hbond_list = hbonds->select.hbond_list;
 
     /* loops below discover the Hydrogen bonds between i-j-k triplets.
-       here j is H atom and there has to be some bond between i and j.
-       Hydrogen bond is between j and k.
-       so in this function i->X, j->H, k->Z when we map
-       variables onto the ones in the handout.*/
+     * Here j is the H atom, and there has to be some bond between i and j.
+     * The hydrogen bond itself is between j and k.
+     * So in this function i->X, j->H, k->Z when we map
+     * variables onto the ones in the handout. */
     for ( j = 0; j < system->n; ++j )
+    {
         /* j has to be of type H */
-        if ( system->reax_param.sbp[system->my_atoms[j].type].p_hbond == 1 )
+        if ( system->reax_param.sbp[system->my_atoms[j].type].p_hbond == H_ATOM )
         {
-            /*set j's variables */
-            type_j     = system->my_atoms[j].type;
-            start_j    = Start_Index(j, bonds);
-            end_j      = End_Index(j, bonds);
+            /* set j's variables */
+            type_j = system->my_atoms[j].type;
+            start_j = Start_Index( j, bonds );
+            end_j = End_Index( j, bonds );
             hb_start_j = Start_Index( system->my_atoms[j].Hindex, hbonds );
-            hb_end_j   = End_Index( system->my_atoms[j].Hindex, hbonds );
+            hb_end_j = End_Index( system->my_atoms[j].Hindex, hbonds );
 
             top = 0;
+            /* search bonded atoms to atom j (i.e., hydrogen atom) for potential hydrogen bonding */
             for ( pi = start_j; pi < end_j; ++pi )
             {
                 pbond_ij = &( bond_list[pi] );
@@ -92,14 +97,17 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
                 bo_ij = &(pbond_ij->bo_data);
                 type_i = system->my_atoms[i].type;
 
-                if ( system->reax_param.sbp[type_i].p_hbond == 2 &&
+                if ( system->reax_param.sbp[type_i].p_hbond == H_BONDING_ATOM &&
                         bo_ij->BO >= HB_THRESHOLD )
+                {
                     hblist[top++] = pi;
+                }
             }
 
             // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n",
             //          j, top, hb_start_j, hb_end_j );
 
+            /* for each hbond of atom j */
             for ( pk = hb_start_j; pk < hb_end_j; ++pk )
             {
                 /* set k's varibles */
@@ -109,11 +117,12 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
                 r_jk = nbr_jk->d;
                 rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
 
+                /* find matching hbond to atom k */
                 for ( itr = 0; itr < top; ++itr )
                 {
                     pi = hblist[itr];
-		// DANIEL
-                //    pbond_ij = &( bonds->bond_list[pi] );
+                    //DANIEL
+                    //pbond_ij = &( bonds->bond_list[pi] );
                     pbond_ij = &( bonds->select.bond_list[pi] );
                     i = pbond_ij->nbr;
 
@@ -122,35 +131,34 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
                         bo_ij = &(pbond_ij->bo_data);
                         type_i = system->my_atoms[i].type;
                         r_ij = pbond_ij->d;
-                        //hbp = &(system->reax_param.hbp[ type_i ][ type_j ][ type_k ]);
-			// SUDHIR
-			hbp = &(system->reax_param.hbp[ index_hbp (type_i, type_j, type_k, system->reax_param.num_atom_types) ]);
+			hbp = &(system->reax_param.hbp[ index_hbp(type_i, type_j, type_k, system->reax_param.num_atom_types) ]);
 
+#if defined(DEBUG)
                         ++num_hb_intrs;
+#endif
 
                         Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                         &theta, &cos_theta );
+                                &theta, &cos_theta );
                         /* the derivative of cos(theta) */
                         Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                              &dcos_theta_di, &dcos_theta_dj,
-                                              &dcos_theta_dk );
+                                &dcos_theta_di, &dcos_theta_dj, &dcos_theta_dk );
 
-                        /* hyrogen bond energy*/
-                        sin_theta2 = sin( theta / 2.0 );
+                        /* hydrogen bond energy */
+                        sin_theta2 = SIN( theta / 2.0 );
                         sin_xhz4 = SQR(sin_theta2);
                         sin_xhz4 *= sin_xhz4;
                         cos_xhz1 = ( 1.0 - cos_theta );
-                        exp_hb2 = exp( -hbp->p_hb2 * bo_ij->BO );
-                        exp_hb3 = exp( -hbp->p_hb3 * ( hbp->r0_hb / r_jk +
-                                                       r_jk / hbp->r0_hb - 2.0 ) );
+                        exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
+                        exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk +
+                                    r_jk / hbp->r0_hb - 2.0 ) );
 
-                        data->my_en.e_hb += e_hb =
-                                                hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+                        e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+                        data->my_en.e_hb += e_hb;
 
                         CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
                         CEhb2 = -hbp->p_hb1 / 2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
                         CEhb3 = -hbp->p_hb3 *
-                                (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb;
+                            (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb;
 
                         /*fprintf( stdout,
                           "%6d%6d%6d%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n",
@@ -175,7 +183,7 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
                         else
                         {
                             /* for pressure coupling, terms that are not related to bond order
-                            derivatives are added directly into pressure vector/tensor */
+                             * derivatives are added directly into pressure vector/tensor */
                             rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
                             rvec_Add( workspace->f[i], force );
                             rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
@@ -212,6 +220,7 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
                                  system->my_atoms[k].orig_id,
                                  r_jk, theta, bo_ij->BO, e_hb, data->my_en.e_hb );
 #endif
+
 #ifdef TEST_FORCES
                         Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb ); //dbo term
                         // dcos terms
@@ -226,31 +235,31 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
                 }
             }
         }
+    }
 
 #if defined(DEBUG)
     fprintf( stderr, "Number of hydrogen bonds: %d\n", num_hb_intrs );
     fprintf( stderr, "Hydrogen Bond Energy: %g\n", data->my_en.e_hb );
     fprintf( stderr, "hydbonds: ext_press (%24.15e %24.15e %24.15e)\n",
-             data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+            data->ext_press[0], data->ext_press[1], data->ext_press[2] );
 #endif
 }
                                                                                                                               
+
 void Old_Hydrogen_Bonds( reax_system *system, control_params *control,
-                     simulation_data *data, storage *workspace,
-                     reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control )
 {
     int  i, j, k, pi, pk;
     int  type_i, type_j, type_k;
     int  start_j, end_j, hb_start_j, hb_end_j;
     int  hblist[MAX_BONDS];
     int  itr, top;
-    int  num_hb_intrs = 0;
     ivec rel_jk;
     real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
     real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
     rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
     rvec dvec_jk, force, ext_press;
-    // rtensor temp_rtensor, total_rtensor;
     hbond_parameters *hbp;
     bond_order_data *bo_ij;
     bond_data *pbond_ij;
@@ -258,6 +267,9 @@ void Old_Hydrogen_Bonds( reax_system *system, control_params *control,
     reax_list *bonds, *hbonds;
     bond_data *bond_list;
     hbond_data *hbond_list;
+#if defined(DEBUG)
+    int num_hb_intrs = 0;
+#endif
 
     bonds = (*lists) + BONDS;
     bond_list = bonds->select.bond_list;
@@ -265,20 +277,21 @@ void Old_Hydrogen_Bonds( reax_system *system, control_params *control,
     hbond_list = hbonds->select.hbond_list;
 
     /* loops below discover the Hydrogen bonds between i-j-k triplets.
-       here j is H atom and there has to be some bond between i and j.
-       Hydrogen bond is between j and k.
-       so in this function i->X, j->H, k->Z when we map
-       variables onto the ones in the handout.*/
+     * Here j is the H atom, and there has to be some bond between i and j.
+     * The hydrogen bond is between j and k,
+     * so in this function i->X, j->H, k->Z when we map
+     * variables onto the ones in the handout. */
     for ( j = 0; j < system->n; ++j )
+    {
         /* j has to be of type H */
-        if ( system->reax_param.sbp[system->my_atoms[j].type].p_hbond == 1 )
+        if ( system->reax_param.sbp[system->my_atoms[j].type].p_hbond == H_ATOM )
         {
             /*set j's variables */
-            type_j     = system->my_atoms[j].type;
-            start_j    = Start_Index(j, bonds);
-            end_j      = End_Index(j, bonds);
+            type_j = system->my_atoms[j].type;
+            start_j = Start_Index(j, bonds);
+            end_j = End_Index(j, bonds);
             hb_start_j = Start_Index( system->my_atoms[j].Hindex, hbonds );
-            hb_end_j   = End_Index( system->my_atoms[j].Hindex, hbonds );
+            hb_end_j = End_Index( system->my_atoms[j].Hindex, hbonds );
 
             top = 0;
             for ( pi = start_j; pi < end_j; ++pi )
@@ -288,9 +301,11 @@ void Old_Hydrogen_Bonds( reax_system *system, control_params *control,
                 bo_ij = &(pbond_ij->bo_data);
                 type_i = system->my_atoms[i].type;
 
-                if ( system->reax_param.sbp[type_i].p_hbond == 2 &&
+                if ( system->reax_param.sbp[type_i].p_hbond == H_BONDING_ATOM &&
                         bo_ij->BO >= HB_THRESHOLD )
+                {
                     hblist[top++] = pi;
+                }
             }
 
             // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n",
@@ -318,33 +333,34 @@ void Old_Hydrogen_Bonds( reax_system *system, control_params *control,
                         r_ij = pbond_ij->d;
                         //SUDHIR
                         //hbp = &(system->reax_param.hbp[ type_i ][ type_j ][ type_k ]);
-                        hbp = &(system->reax_param.hbp[ index_hbp (type_i, type_j, type_k, system->reax_param.num_atom_types) ]);
+                        hbp = &(system->reax_param.hbp[ index_hbp(type_i, type_j, type_k, system->reax_param.num_atom_types) ]);
+
+#if defined(DEBUG)
                         ++num_hb_intrs;
+#endif
 
                         Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                         &theta, &cos_theta );
+                                &theta, &cos_theta );
                         /* the derivative of cos(theta) */
                         Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                              &dcos_theta_di, &dcos_theta_dj,
-                                              &dcos_theta_dk );
+                                &dcos_theta_di, &dcos_theta_dj, &dcos_theta_dk );
 
-                        /* hyrogen bond energy*/
+                        /* hydrogen bond energy */
                         sin_theta2 = SIN( theta / 2.0 );
                         sin_xhz4 = SQR(sin_theta2);
                         sin_xhz4 *= sin_xhz4;
                         cos_xhz1 = ( 1.0 - cos_theta );
                         exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
                         exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk +
-                                                       r_jk / hbp->r0_hb - 2.0 ) );
+                                    r_jk / hbp->r0_hb - 2.0 ) );
 
-                        //data->my_en.e_hb +=
                         e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-                        data->my_en.e_hb += 1;
+                        data->my_en.e_hb += e_hb;
 
                         CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
                         CEhb2 = -hbp->p_hb1 / 2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
                         CEhb3 = -hbp->p_hb3 *
-                                (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb;
+                            (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb;
 
                         /*fprintf( stdout,
                           "%6d%6d%6d%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n",
@@ -406,6 +422,7 @@ void Old_Hydrogen_Bonds( reax_system *system, control_params *control,
                                  system->my_atoms[k].orig_id,
                                  r_jk, theta, bo_ij->BO, e_hb, data->my_en.e_hb );
 #endif
+
 #ifdef TEST_FORCES
                         Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb ); //dbo term
                         // dcos terms
@@ -420,11 +437,12 @@ void Old_Hydrogen_Bonds( reax_system *system, control_params *control,
                 }
             }
         }
+    }
 
 #if defined(DEBUG)
     fprintf( stderr, "Number of hydrogen bonds: %d\n", num_hb_intrs );
     fprintf( stderr, "Hydrogen Bond Energy: %g\n", data->my_en.e_hb );
     fprintf( stderr, "hydbonds: ext_press (%24.15e %24.15e %24.15e)\n",
-             data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+            data->ext_press[0], data->ext_press[1], data->ext_press[2] );
 #endif
 }
diff --git a/PG-PuReMD/src/hydrogen_bonds.h b/PG-PuReMD/src/hydrogen_bonds.h
index 346f00453f10ccf7173db07040a15461162e1781..e4f58e104d6646fddc35ef51c8c169c172f07eca 100644
--- a/PG-PuReMD/src/hydrogen_bonds.h
+++ b/PG-PuReMD/src/hydrogen_bonds.h
@@ -24,7 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Hydrogen_Bonds( reax_system*, control_params*, simulation_data*,
-                     storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/index_utils.h b/PG-PuReMD/src/index_utils.h
index 8c6b09618cbce7f0491bff09a5cb55cb9b9c4ba6..337e184490d87badc537281a7b332faccb1b92a5 100644
--- a/PG-PuReMD/src/index_utils.h
+++ b/PG-PuReMD/src/index_utils.h
@@ -4,67 +4,65 @@
 #include "reax_types.h"
 
 
-extern inline CUDA_HOST_DEVICE int index_grid_3d (int i, int j, int k, grid *g)
+/* Indexing routine for grid cells */
+static inline CUDA_HOST_DEVICE int index_grid_3d( int i, int j, int k, grid *g )
 {
-    return  (i * g->ncells[1] * g->ncells[2]) +
-            (j * g->ncells[2]) +
-            k;
+    return (i * g->ncells[1] * g->ncells[2]) + (j * g->ncells[2]) + k;
 }
 
-extern inline CUDA_HOST_DEVICE int index_grid_nbrs (int i, int j, int k, int l, grid *g)
+/* Indexing routine for neighbors of binned atoms within grid cells */
+static inline CUDA_HOST_DEVICE int index_grid_nbrs( int i, int j, int k, int l, grid *g )
 {
-    return  (i * g->ncells[1] * g->ncells[2] * g->max_nbrs) +
-            (j * g->ncells[2] * g->max_nbrs) +
-            (k * g->max_nbrs) +
-            l;
+    return (i * g->ncells[1] * g->ncells[2] * g->max_nbrs) +
+        (j * g->ncells[2] * g->max_nbrs) + (k * g->max_nbrs) + l;
 }
 
-extern inline CUDA_HOST_DEVICE int index_grid_atoms (int i, int j, int k, int l, grid *g)
+/* Indexing routine for binned atoms within grid cells */
+static inline CUDA_HOST_DEVICE int index_grid_atoms( int i, int j, int k, int l, grid *g )
 {
-    return  (i * g->ncells[1] * g->ncells[2] * g->max_atoms) +
-            (j * g->ncells[2] * g->max_atoms) +
-            (k * g->max_atoms) +
-            l;
+    return (i * g->ncells[1] * g->ncells[2] * g->max_atoms) +
+        (j * g->ncells[2] * g->max_atoms) + (k * g->max_atoms) + l;
 }
 
-extern inline CUDA_HOST_DEVICE int index_wkspace_sys (int i, int j, int N)
+/* Indexing routine for workspace system structures */
+static inline CUDA_HOST_DEVICE int index_wkspace_sys( int i, int j, int N )
 {
     return (i * N) + j;
 }
 
-extern inline CUDA_HOST_DEVICE int index_wkspace_res (int i, int j )
+/* Indexing routine for workspace res structures */
+static inline CUDA_HOST_DEVICE int index_wkspace_res( int i, int j )
 {
     return (i * (RESTART + 1)) + j;
 }
 
-extern inline CUDA_HOST_DEVICE int index_tbp (int i, int j, int num_atom_types)
+/* Indexing routine for two body parameters */
+static inline CUDA_HOST_DEVICE int index_tbp( int i, int j, int num_atom_types )
 {
     return (i * num_atom_types) + j;
 }
 
-extern inline CUDA_HOST_DEVICE int index_thbp (int i, int j, int k, int num_atom_types)
+/* Indexing routine for three body parameters */
+static inline CUDA_HOST_DEVICE int index_thbp( int i, int j, int k, int num_atom_types )
 {
-    return  (i * num_atom_types * num_atom_types ) +
-            (j * num_atom_types ) +
-            k;
+    return (i * num_atom_types * num_atom_types) + (j * num_atom_types) + k;
 }
 
-extern inline CUDA_HOST_DEVICE int index_hbp (int i, int j, int k, int num_atom_types)
+/* Indexing routine for hydrogen bonding parameters */
+static inline CUDA_HOST_DEVICE int index_hbp( int i, int j, int k, int num_atom_types )
 {
-    return  (i * num_atom_types * num_atom_types ) +
-            (j * num_atom_types ) +
-            k;
+    return (i * num_atom_types * num_atom_types) + (j * num_atom_types) + k;
 }
 
-extern inline CUDA_HOST_DEVICE int index_fbp (int i, int j, int k, int l, int num_atom_types)
+/* Indexing routine for four body parameters */
+static inline CUDA_HOST_DEVICE int index_fbp( int i, int j, int k, int l, int num_atom_types )
 {
-    return  (i * num_atom_types * num_atom_types * num_atom_types ) +
-            (j * num_atom_types * num_atom_types ) +
-            (k * num_atom_types ) +
-            l;
+    return (i * num_atom_types * num_atom_types * num_atom_types) +
+        (j * num_atom_types * num_atom_types) + (k * num_atom_types) + l;
 }
 
-extern inline CUDA_HOST_DEVICE int index_lr (int i, int j, int num_atom_types )
+/* Indexing routine for LR table (force tabulation) */
+static inline CUDA_HOST_DEVICE int index_lr( int i, int j, int num_atom_types )
 {
     return (i * num_atom_types) + j;
 }
diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c
index f2df3ef7edc17532aa989847fd69841829149212..d071f0dd3e5175ca55bc00d36a266344b41bd059 100644
--- a/PG-PuReMD/src/init_md.c
+++ b/PG-PuReMD/src/init_md.c
@@ -21,52 +21,46 @@
 
 #include "reax_types.h"
 
-#ifdef HAVE_CUDA
-#include "dev_alloc.h"
-#include "dev_list.h"
-#include "cuda_copy.h"
-#include "validation.h"
-#endif
+#include <stddef.h>
 
 #if defined(PURE_REAX)
-#include "init_md.h"
-#include "allocate.h"
-#include "box.h"
-#include "comm_tools.h"
-#include "forces.h"
-#include "grid.h"
-#include "integrate.h"
-#include "io_tools.h"
-#include "list.h"
-#include "lookup.h"
-#include "neighbors.h"
-#include "random.h"
-#include "reset_tools.h"
-#include "system_props.h"
-#include "tool_box.h"
-#include "vector.h"
+  #include "init_md.h"
+  #include "allocate.h"
+  #include "box.h"
+  #include "comm_tools.h"
+  #include "forces.h"
+  #include "grid.h"
+  #include "integrate.h"
+  #include "io_tools.h"
+  #include "list.h"
+  #include "lookup.h"
+  #include "neighbors.h"
+  #include "random.h"
+  #include "reset_tools.h"
+  #include "system_props.h"
+  #include "tool_box.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_init_md.h"
-#include "reax_allocate.h"
-#include "reax_forces.h"
-#include "reax_io_tools.h"
-#include "reax_list.h"
-#include "reax_lookup.h"
-#include "reax_reset_tools.h"
-#include "reax_system_props.h"
-#include "reax_tool_box.h"
-#include "reax_vector.h"
+  #include "reax_init_md.h"
+  #include "reax_allocate.h"
+  #include "reax_forces.h"
+  #include "reax_io_tools.h"
+  #include "reax_list.h"
+  #include "reax_lookup.h"
+  #include "reax_reset_tools.h"
+  #include "reax_system_props.h"
+  #include "reax_tool_box.h"
+  #include "reax_vector.h"
 #endif
 
 
 #if defined(PURE_REAX)
 /************************ initialize system ************************/
 int Reposition_Atoms( reax_system *system, control_params *control,
-                      simulation_data *data, mpi_datatypes *mpi_data,
-                      char *msg )
+        simulation_data *data, mpi_datatypes *mpi_data, char *msg )
 {
-    int   i;
-    rvec  dx;
+    int i;
+    rvec dx;
 
     /* reposition atoms */
     if ( control->reposition_atoms == 0 )  //fit atoms to periodic box
@@ -102,15 +96,16 @@ void Generate_Initial_Velocities( reax_system *system, real T )
     int i;
     real m, scale, norm;
 
-
     if ( T <= 0.1 )
     {
         for ( i = 0; i < system->n; i++ )
+        {
             rvec_MakeZero( system->my_atoms[i].v );
+        }
     }
     else
     {
-        Randomize();
+        Randomize( );
 
         for ( i = 0; i < system->n; i++ )
         {
@@ -122,56 +117,68 @@ void Generate_Initial_Velocities( reax_system *system, real T )
 
             rvec_Scale( system->my_atoms[i].v, 1. / scale, system->my_atoms[i].v );
 
-            // fprintf( stderr, "v = %f %f %f\n",
-            // system->my_atoms[i].v[0],
-            // system->my_atoms[i].v[1],
-            // system->my_atoms[i].v[2] );
-
-            // fprintf( stderr, "scale = %f\n", scale );
-            // fprintf( stderr, "v = %f %f %f\n",
-            // system->my_atoms[i].v[0],
-            // system->my_atoms[i].v[1],
-            // system->my_atoms[i].v[2] );
+//            fprintf( stderr, "v = %f %f %f\n",
+//                    system->my_atoms[i].v[0],
+//                    system->my_atoms[i].v[1],
+//                    system->my_atoms[i].v[2] );
+//
+//            fprintf( stderr, "scale = %f\n", scale );
+//            fprintf( stderr, "v = %f %f %f\n",
+//                    system->my_atoms[i].v[0],
+//                    system->my_atoms[i].v[1],
+//                    system->my_atoms[i].v[2] );
         }
     }
 }
 
 
 int Init_System( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 mpi_datatypes *mpi_data, char *msg )
+        simulation_data *data, storage *workspace,
+        mpi_datatypes *mpi_data, char *msg )
 {
     int i;
     reax_atom *atom;
     int nrecv[MAX_NBRS];
 
     Setup_New_Grid( system, control, MPI_COMM_WORLD );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d GRID:\n", system->my_rank );
     Print_Grid( &(system->my_grid), stderr );
 #endif
+
     Bin_My_Atoms( system, &(workspace->realloc) );
     Reorder_My_Atoms( system, workspace );
 
     /* estimate N and total capacity */
-    for ( i = 0; i < MAX_NBRS; ++i ) nrecv[i] = 0;
+    for ( i = 0; i < MAX_NBRS; ++i )
+    {
+        nrecv[i] = 0;
+    }
     MPI_Barrier( MPI_COMM_WORLD );
     system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv,
-                          Estimate_Boundary_Atoms, Unpack_Estimate_Message, 1 );
+            Estimate_Boundary_Atoms, Unpack_Estimate_Message, TRUE );
     system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP );
     Bin_Boundary_Atoms( system );
 
     /* estimate numH and Hcap */
-
     system->numH = 0;
-    if ( control->hbond_cut > 0 )
+    if ( control->hbond_cut > 0.0 )
+    {
         for ( i = 0; i < system->n; ++i )
         {
             atom = &(system->my_atoms[i]);
-            if ( system->reax_param.sbp[ atom->type ].p_hbond == 1 )
+
+            if ( system->reax_param.sbp[ atom->type ].p_hbond == H_ATOM )
+            {
                 atom->Hindex = system->numH++;
-            else atom->Hindex = -1;
+            }
+            else
+            {
+                atom->Hindex = -1;
+            }
         }
+    }
     //Tried fix
     //system->Hcap = MAX( system->numH * SAFER_ZONE, MIN_CAP );
     system->Hcap = MAX( system->n * SAFER_ZONE, MIN_CAP );
@@ -179,20 +186,21 @@ int Init_System( reax_system *system, control_params *control,
 // Sudhir-style below
 /*
     system->numH = 0;
-    if ( control->hbond_cut > 0 )
+    if ( control->hbond_cut > 0.0 )
         for ( i = 0; i < system->n; ++i )
         {
             atom = &(system->my_atoms[i]);
-            if ( system->reax_param.sbp[ atom->type ].p_hbond == 1 )
+            if ( system->reax_param.sbp[ atom->type ].p_hbond == H_ATOM )
                 atom->Hindex = system->numH++;
             else atom->Hindex = -1;
         }
     system->Hcap = MAX( system->numH * SAFER_ZONE, MIN_CAP );
 */
 
-//Sync_System (system);
+    //Sync_System( system );
 
     //Allocate_System( system, system->local_cap, system->total_cap, msg );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: n=%d local_cap=%d\n",
              system->my_rank, system->n, system->local_cap );
@@ -204,109 +212,32 @@ int Init_System( reax_system *system, control_params *control,
 
     Compute_Total_Mass( system, data, mpi_data->comm_mesh3D );
     Compute_Center_of_Mass( system, data, mpi_data, mpi_data->comm_mesh3D );
-    // if( Reposition_Atoms( system, control, data, mpi_data, msg ) == FAILURE )
-    //   return FAILURE;
+//    if( Reposition_Atoms( system, control, data, mpi_data, msg ) == FAILURE )
+//    {
+//        return FAILURE;
+//    }
 
     /* initialize velocities so that desired init T can be attained */
     if ( !control->restart || (control->restart && control->random_vel) )
+    {
         Generate_Initial_Velocities( system, control->T_init );
+    }
     Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
 
     return SUCCESS;
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_Init_System( reax_system *system, control_params *control,
-                      simulation_data *data, storage *workspace,
-                      mpi_datatypes *mpi_data, char *msg )
-{
-    int i;
-    reax_atom *atom;
-    int nrecv[MAX_NBRS];
-
-    Setup_New_Grid( system, control, MPI_COMM_WORLD );
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d GRID:\n", system->my_rank );
-    Print_Grid( &(system->my_grid), stderr );
-#endif
-    Bin_My_Atoms( system, &(workspace->realloc) );
-    Reorder_My_Atoms( system, workspace );
-
-    /* estimate N and total capacity */
-    for ( i = 0; i < MAX_NBRS; ++i ) nrecv[i] = 0;
-    MPI_Barrier( MPI_COMM_WORLD );
-    system->max_recved = 0;
-    system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv,
-                          Estimate_Boundary_Atoms, Unpack_Estimate_Message, 1 );
-    system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP );
-    Bin_Boundary_Atoms( system );
-
-//MPI_ABORT( MPI_COMM_WORLD, -1);
-//sudhir
-
-
-#if defined(__CUDA_DEBUG_LOG__)
-    //fprintf (stderr, "After first SendRecv: N: %d, total_cap: %d \n",
-    //                      system->N, system->total_cap);
-#endif
-
-    /* estimate numH and Hcap */
-    system->numH = 0;
-    if ( control->hbond_cut > 0 )
-        //TODO
-        //for( i = 0; i < system->n; ++i ) {
-        for ( i = 0; i < system->N; ++i )
-        {
-            atom = &(system->my_atoms[i]);
-            atom->Hindex = i;
-            //FIX - 4 - Added fix for HBond Issue
-            if ( system->reax_param.sbp[ atom->type ].p_hbond == 1 )
-                system->numH++;
-            //else atom->Hindex = -1;
-        }
-    system->Hcap = MAX( system->numH * SAFER_ZONE, MIN_CAP );
-    
-    //Allocate_System( system, system->local_cap, system->total_cap, msg );
-
-    //Sync atoms here to continue the computation
-    //fprintf (stderr, " N:%d after sendrecv \n");
-    dev_alloc_system (system);
-    Sync_System (system);
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d: n=%d local_cap=%d\n",
-             system->my_rank, system->n, system->local_cap );
-    fprintf( stderr, "p%d: N=%d total_cap=%d\n",
-             system->my_rank, system->N, system->total_cap );
-    fprintf( stderr, "p%d: numH=%d H_cap=%d\n",
-             system->my_rank, system->numH, system->Hcap );
-#endif
-
-    Cuda_Compute_Total_Mass( system, data, mpi_data->comm_mesh3D );
-    Cuda_Compute_Center_of_Mass( system, data, mpi_data, mpi_data->comm_mesh3D );
-    // if( Reposition_Atoms( system, control, data, mpi_data, msg ) == FAILURE )
-    //   return FAILURE;
-
-    /* initialize velocities so that desired init T can be attained */
-    if ( !control->restart || (control->restart && control->random_vel) )
-        Generate_Initial_Velocities( system, control->T_init );
-
-    Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-
-    return SUCCESS;
-}
-#endif
-
-
 /************************ initialize simulation data ************************/
-int Init_Simulation_Data( reax_system *system, control_params *control,
-                          simulation_data *data, char *msg )
+void Init_Simulation_Data( reax_system *system, control_params *control,
+        simulation_data *data, char *msg )
 {
     Reset_Simulation_Data( data );
 
     if ( !control->restart )
+    {
         data->step = data->prev_steps = 0;
+    }
 
     switch ( control->ensemble )
     {
@@ -323,8 +254,7 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
         break;
 
     case nhNVT:
-        fprintf( stderr, "WARNING: Nose-Hoover NVT is still under testing.\n" );
-        //return FAILURE;
+        fprintf( stderr, "[WARNING] Nose-Hoover NVT is still under testing.\n" );
         data->N_f = 3 * system->bigN + 1;
         Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
         control->virial = 0;
@@ -343,7 +273,9 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
         Evolve = Velocity_Verlet_Berendsen_NPT;
         control->virial = 1;
         if ( !control->restart )
+        {
             Reset_Pressures( data );
+        }
         break;
 
     case iNPT: /* Isotropic NPT */
@@ -351,12 +283,15 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
         Evolve = Velocity_Verlet_Berendsen_NPT;
         control->virial = 1;
         if ( !control->restart )
+        {
             Reset_Pressures( data );
+        }
         break;
 
     case NPT: /* Anisotropic NPT */
-        strcpy( msg, "init_simulation_data: option not yet implemented" );
-        return FAILURE;
+        fprintf( stderr, "p%d: init_simulation_data: option not yet implemented\n",
+              system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD,  INVALID_INPUT );
 
         data->N_f = 3 * system->bigN + 9;
         Evolve = Velocity_Verlet_Berendsen_NPT;
@@ -365,7 +300,7 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
           data->therm.G_xi = control->Tau_T *
           (2.0 * data->my_en.e_Kin - data->N_f * K_B * control->T );
           data->therm.v_xi = data->therm.G_xi * control->dt;
-          data->iso_bar.eps = 0.33333 * log(system->box.volume);
+          data->iso_bar.eps = (1.0 / 3.0) * LOG(system->box.volume);
           data->inv_W = 1.0 /
           ( data->N_f * K_B * control->T * SQR(control->Tau_P) );
           Compute_Pressure( system, control, data, out_control );
@@ -373,8 +308,9 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
         break;
 
     default:
-        strcpy( msg, "init_simulation_data: ensemble not recognized" );
-        return FAILURE;
+        fprintf( stderr, "p%d: init_simulation_data: ensemble not recognized\n",
+              system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD,  INVALID_INPUT );
     }
 
     /* initialize the timer(s) */
@@ -382,105 +318,16 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
     if ( system->my_rank == MASTER_NODE )
     {
         data->timing.start = Get_Time( );
-#if defined(LOG_PERFORMANCE)
-        Reset_Timing( &data->timing );
-#endif
-    }
-
-
-#if defined(DEBUG)
-    fprintf( stderr, "data->N_f: %8.3f\n", data->N_f );
-#endif
-    return SUCCESS;
-}
 
-
-#ifdef HAVE_CUDA
-int Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
-                               simulation_data *data, char *msg )
-{
-    Reset_Simulation_Data( data );
-
-    if ( !control->restart )
-        data->step = data->prev_steps = 0;
-
-    switch ( control->ensemble )
-    {
-    case NVE:
-        data->N_f = 3 * system->bigN;
-        Cuda_Evolve = Velocity_Verlet_NVE;
-        control->virial = 0;
-        break;
-
-    case bNVT:
-        data->N_f = 3 * system->bigN + 1;
-        Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NVT;
-        control->virial = 0;
-        break;
-
-    case nhNVT:
-        fprintf( stderr, "WARNING: Nose-Hoover NVT is still under testing.\n" );
-        //return FAILURE;
-        data->N_f = 3 * system->bigN + 1;
-        Cuda_Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
-        control->virial = 0;
-        if ( !control->restart || (control->restart && control->random_vel) )
-        {
-            data->therm.G_xi = control->Tau_T *
-                               (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T );
-            data->therm.v_xi = data->therm.G_xi * control->dt;
-            data->therm.v_xi_old = 0;
-            data->therm.xi = 0;
-        }
-        break;
-
-    case sNPT: /* Semi-Isotropic NPT */
-        data->N_f = 3 * system->bigN + 4;
-        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
-        control->virial = 1;
-        if ( !control->restart )
-            Reset_Pressures( data );
-        break;
-
-    case iNPT: /* Isotropic NPT */
-        data->N_f = 3 * system->bigN + 2;
-        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
-        control->virial = 1;
-        if ( !control->restart )
-            Reset_Pressures( data );
-        break;
-
-    case NPT: /* Anisotropic NPT */
-        strcpy( msg, "init_simulation_data: option not yet implemented" );
-        return FAILURE;
-
-        data->N_f = 3 * system->bigN + 9;
-        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
-        control->virial = 1;
-        break;
-
-    default:
-        strcpy( msg, "init_simulation_data: ensemble not recognized" );
-        return FAILURE;
-    }
-
-    /* initialize the timer(s) */
-    MPI_Barrier( MPI_COMM_WORLD );  // wait for everyone to come here
-    if ( system->my_rank == MASTER_NODE )
-    {
-        data->timing.start = Get_Time( );
 #if defined(LOG_PERFORMANCE)
         Reset_Timing( &data->timing );
 #endif
     }
 
-
 #if defined(DEBUG)
     fprintf( stderr, "data->N_f: %8.3f\n", data->N_f );
 #endif
-    return SUCCESS;
 }
-#endif
 
 
 #elif defined(LAMMPS_REAX)
@@ -493,6 +340,7 @@ int Init_System( reax_system *system, char *msg )
 
     system->local_cap = (int)(system->n * SAFE_ZONE);
     system->total_cap = (int)(system->N * SAFE_ZONE);
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: local_cap=%d total_cap=%d\n",
              system->my_rank, system->local_cap, system->total_cap );
@@ -504,23 +352,21 @@ int Init_System( reax_system *system, char *msg )
 }
 
 
-int Init_Simulation_Data( reax_system *system, control_params *control,
+void Init_Simulation_Data( reax_system *system, control_params *control,
                           simulation_data *data, char *msg )
 {
     Reset_Simulation_Data( data );
+
 #if defined(LOG_PERFORMANCE)
     Reset_Timing( &data->timing );
 #endif
 
     //if( !control->restart )
     data->step = data->prev_steps = 0;
-
-    return SUCCESS;
 }
 #endif
 
 
-
 /************************ initialize workspace ************************/
 /* Initialize Taper params */
 void Init_Taper( control_params *control,  storage *workspace )
@@ -532,8 +378,10 @@ void Init_Taper( control_params *control,  storage *workspace )
     swa = control->nonb_low;
     swb = control->nonb_cut;
 
-    if ( fabs( swa ) > 0.01 )
+    if ( FABS( swa ) > 0.01 )
+    {
         fprintf( stderr, "Warning: non-zero lower Taper-radius cutoff\n" );
+    }
 
     if ( swb < 0 )
     {
@@ -541,7 +389,9 @@ void Init_Taper( control_params *control,  storage *workspace )
         MPI_Abort( MPI_COMM_WORLD,  INVALID_INPUT );
     }
     else if ( swb < 5 )
+    {
         fprintf( stderr, "Warning: very low Taper-radius cutoff: %f\n", swb );
+    }
 
     d1 = swb - swa;
     d7 = POW( d1, 7.0 );
@@ -558,170 +408,168 @@ void Init_Taper( control_params *control,  storage *workspace )
     workspace->Tap[2] = -210.0 * (swa3 * swb2 + swa2 * swb3) / d7;
     workspace->Tap[1] = 140.0 * swa3 * swb3 / d7;
     workspace->Tap[0] = (-35.0 * swa3 * swb2 * swb2 + 21.0 * swa2 * swb3 * swb2 +
-                         7.0 * swa * swb3 * swb3 + swb3 * swb3 * swb ) / d7;
+            7.0 * swa * swb3 * swb3 + swb3 * swb3 * swb ) / d7;
 }
 
 
-int Init_Workspace( reax_system *system, control_params *control,
-                    storage *workspace, char *msg )
+void Init_Workspace( reax_system *system, control_params *control,
+        storage *workspace, char *msg )
 {
-    int ret;
-
-    ret = Allocate_Workspace( system, control, workspace,
-                              system->local_cap, system->total_cap, msg );
-    if ( ret != SUCCESS )
-        return ret;
+    Allocate_Workspace( system, control, workspace, system->local_cap,
+            system->total_cap, msg );
 
     memset( &(workspace->realloc), 0, sizeof(reallocate_data) );
     Reset_Workspace( system, workspace );
 
     /* Initialize the Taper function */
     Init_Taper( control, workspace );
-
-    return SUCCESS;
-}
-
-
-#ifdef HAVE_CUDA
-int Cuda_Init_Workspace( reax_system *system, control_params *control,
-                         storage *workspace, char *msg )
-{
-    int ret;
-
-    ret = dev_alloc_workspace ( system, control, dev_workspace,
-                                system->local_cap, system->total_cap, msg );
-    if ( ret != SUCCESS )
-        return ret;
-
-    memset( &(workspace->realloc), 0, sizeof(reallocate_data) );
-    Cuda_Reset_Workspace( system, workspace );
-
-    /* Initialize the Taper function */
-    Init_Taper( control, dev_workspace );
-
-    return SUCCESS;
 }
-#endif
 
 
 /************** setup communication data structures  **************/
 int Init_MPI_Datatypes( reax_system *system, storage *workspace,
-                        mpi_datatypes *mpi_data, char *msg )
+        mpi_datatypes *mpi_data, char *msg )
 {
-    int           i, block[11];
-    MPI_Aint      base, disp[11];
-    MPI_Datatype  type[11];
-    mpi_atom      sample;
+    int i, block[11];
+    MPI_Aint base, disp[11];
+    MPI_Datatype type[11];
+    mpi_atom sample;
     boundary_atom b_sample;
-    restart_atom  r_sample;
-    rvec          rvec_sample;
-    rvec2         rvec2_sample;
+    restart_atom r_sample;
+    rvec rvec_sample;
+    rvec2 rvec2_sample;
 
     /* setup the world */
     mpi_data->world = MPI_COMM_WORLD;
 
-    /* allocate mpi buffers  */
-    //ret = Allocate_MPI_Buffers( mpi_data, system->est_recv,
-    //              system->gcell_cap, system->my_nbrs, msg );
-    //tmp = 0;
-    //#if defined(DEBUG_FOCUS)
-    //for( i = 0; i < MAX_NBRS; ++i )
-    //if( i != MYSELF )
-    //  tmp += system->my_nbrs[i].est_send;
-
-    //fprintf( stderr, "p%d: allocated mpi_buffers: recv=%d send=%d total=%dMB\n",
-    //   system->my_rank, system->est_recv, tmp,
-    //   (int)((system->est_recv+tmp)*sizeof(boundary_atom)/(1024*1024)) );
-    //#endif
-    //if( ret != SUCCESS )
-    //  return ret;
-
     /* mpi_atom - [orig_id, imprt_id, type, num_bonds, num_hbonds, name,
                    x, v, f_old, s, t] */
     block[0] = block[1] = block[2] = block[3] = block[4] = 1;
-    block[5] = 8;
+    block[5] = MAX_ATOM_NAME_LEN;
     block[6] = block[7] = block[8] = 3;
     block[9] = block[10] = 4;
 
-    MPI_Address( &(sample.orig_id),    disp + 0 );
-    MPI_Address( &(sample.imprt_id),   disp + 1 );
-    MPI_Address( &(sample.type),       disp + 2 );
-    MPI_Address( &(sample.num_bonds),  disp + 3 );
-    MPI_Address( &(sample.num_hbonds), disp + 4 );
-    MPI_Address( &(sample.name),       disp + 5 );
-    MPI_Address( &(sample.x[0]),       disp + 6 );
-    MPI_Address( &(sample.v[0]),       disp + 7 );
-    MPI_Address( &(sample.f_old[0]),   disp + 8 );
-    MPI_Address( &(sample.s[0]),       disp + 9 );
-    MPI_Address( &(sample.t[0]),       disp + 10 );
-
-    base = (MPI_Aint)(&(sample));
-    for ( i = 0; i < 11; ++i ) disp[i] -= base;
+//    MPI_Get_address( &sample, &base );
+//    MPI_Get_address( &(sample.orig_id), disp + 0 );
+//    MPI_Get_address( &(sample.imprt_id), disp + 1 );
+//    MPI_Get_address( &(sample.type), disp + 2 );
+//    MPI_Get_address( &(sample.num_bonds), disp + 3 );
+//    MPI_Get_address( &(sample.num_hbonds), disp + 4 );
+//    MPI_Get_address( &(sample.name), disp + 5 );
+//    MPI_Get_address( &(sample.x[0]), disp + 6 );
+//    MPI_Get_address( &(sample.v[0]), disp + 7 );
+//    MPI_Get_address( &(sample.f_old[0]), disp + 8 );
+//    MPI_Get_address( &(sample.s[0]), disp + 9 );
+//    MPI_Get_address( &(sample.t[0]), disp + 10 );
+//    for ( i = 0; i < 11; ++i )
+//    {
+//        disp[i] -= base;
+//    }
+    disp[0] = offsetof( mpi_atom, orig_id );
+    disp[1] = offsetof( mpi_atom, imprt_id );
+    disp[2] = offsetof( mpi_atom, type );
+    disp[3] = offsetof( mpi_atom, num_bonds );
+    disp[4] = offsetof( mpi_atom, num_hbonds );
+    disp[5] = offsetof( mpi_atom, name );
+    disp[6] = offsetof( mpi_atom, x );
+    disp[7] = offsetof( mpi_atom, v );
+    disp[8] = offsetof( mpi_atom, f_old );
+    disp[9] = offsetof( mpi_atom, s );
+    disp[10] = offsetof( mpi_atom, t );
 
     type[0] = type[1] = type[2] = type[3] = type[4] = MPI_INT;
     type[5] = MPI_CHAR;
     type[6] = type[7] = type[8] = type[9] = type[10] = MPI_DOUBLE;
 
-    MPI_Type_struct( 11, block, disp, type, &(mpi_data->mpi_atom_type) );
+    MPI_Type_create_struct( 11, block, disp, type, &(mpi_data->mpi_atom_type) );
     MPI_Type_commit( &(mpi_data->mpi_atom_type) );
 
     /* boundary_atom - [orig_id, imprt_id, type, num_bonds, num_hbonds, x] */
     block[0] = block[1] = block[2] = block[3] = block[4] = 1;
     block[5] = 3;
 
-    MPI_Address( &(b_sample.orig_id),    disp + 0 );
-    MPI_Address( &(b_sample.imprt_id),   disp + 1 );
-    MPI_Address( &(b_sample.type),       disp + 2 );
-    MPI_Address( &(b_sample.num_bonds),  disp + 3 );
-    MPI_Address( &(b_sample.num_hbonds), disp + 4 );
-    MPI_Address( &(b_sample.x[0]),       disp + 5 );
-
-    base = (MPI_Aint)(&(b_sample));
-    for ( i = 0; i < 6; ++i ) disp[i] -= base;
+//    MPI_Get_address( &b_sample, &base );
+//    MPI_Get_address( &(b_sample.orig_id), disp + 0 );
+//    MPI_Get_address( &(b_sample.imprt_id), disp + 1 );
+//    MPI_Get_address( &(b_sample.type), disp + 2 );
+//    MPI_Get_address( &(b_sample.num_bonds), disp + 3 );
+//    MPI_Get_address( &(b_sample.num_hbonds), disp + 4 );
+//    MPI_Get_address( &(b_sample.x[0]), disp + 5 );
+//    for ( i = 0; i < 6; ++i )
+//    {
+//        disp[i] -= base;
+//    }
+    disp[0] = offsetof( boundary_atom, orig_id );
+    disp[1] = offsetof( boundary_atom, imprt_id );
+    disp[2] = offsetof( boundary_atom, type );
+    disp[3] = offsetof( boundary_atom, num_bonds );
+    disp[4] = offsetof( boundary_atom, num_hbonds );
+    disp[5] = offsetof( boundary_atom, x );
 
     type[0] = type[1] = type[2] = type[3] = type[4] = MPI_INT;
     type[5] = MPI_DOUBLE;
 
-    MPI_Type_struct( 6, block, disp, type, &(mpi_data->boundary_atom_type) );
+    MPI_Type_create_struct( 6, block, disp, type, &(mpi_data->boundary_atom_type) );
     MPI_Type_commit( &(mpi_data->boundary_atom_type) );
 
     /* mpi_rvec */
     block[0] = 3;
-    MPI_Address( &(rvec_sample[0]), disp + 0 );
-    base = disp[0];
-    for ( i = 0; i < 1; ++i ) disp[i] -= base;
+
+//    MPI_Get_address( &rvec_sample, &base );
+//    MPI_Get_address( &(rvec_sample[0]), disp + 0 );
+//    for ( i = 0; i < 1; ++i )
+//    {
+//        disp[i] -= base;
+//    }
+    disp[0] = 0;
+
     type[0] = MPI_DOUBLE;
-    MPI_Type_struct( 1, block, disp, type, &(mpi_data->mpi_rvec) );
+
+    MPI_Type_create_struct( 1, block, disp, type, &(mpi_data->mpi_rvec) );
     MPI_Type_commit( &(mpi_data->mpi_rvec) );
 
     /* mpi_rvec2 */
     block[0] = 2;
-    MPI_Address( &(rvec2_sample[0]), disp + 0 );
-    base = disp[0];
-    for ( i = 0; i < 1; ++i ) disp[i] -= base;
+
+//    MPI_Get_address( &rvec2_sample, &base );
+//    MPI_Get_address( &(rvec2_sample[0]), disp + 0 );
+//    for ( i = 0; i < 1; ++i )
+//    {
+//        disp[i] -= base;
+//    }
+    disp[0] = 0;
+
     type[0] = MPI_DOUBLE;
-    MPI_Type_struct( 1, block, disp, type, &(mpi_data->mpi_rvec2) );
+
+    MPI_Type_create_struct( 1, block, disp, type, &(mpi_data->mpi_rvec2) );
     MPI_Type_commit( &(mpi_data->mpi_rvec2) );
 
-    /* restart_atom - [orig_id, type, name[8], x, v] */
+    /* restart_atom - [orig_id, type, name, x, v] */
     block[0] = block[1] = 1 ;
-    block[2] = 8;
+    block[2] = MAX_ATOM_NAME_LEN;
     block[3] = block[4] = 3;
 
-    MPI_Address( &(r_sample.orig_id),    disp + 0 );
-    MPI_Address( &(r_sample.type),       disp + 1 );
-    MPI_Address( &(r_sample.name),       disp + 2 );
-    MPI_Address( &(r_sample.x[0]),       disp + 3 );
-    MPI_Address( &(r_sample.v[0]),       disp + 4 );
-
-    base = (MPI_Aint)(&(r_sample));
-    for ( i = 0; i < 5; ++i ) disp[i] -= base;
+//    MPI_Get_address( &r_sample, &base );
+//    MPI_Get_address( &(r_sample.orig_id), disp + 0 );
+//    MPI_Get_address( &(r_sample.type), disp + 1 );
+//    MPI_Get_address( &(r_sample.name), disp + 2 );
+//    MPI_Get_address( &(r_sample.x[0]), disp + 3 );
+//    MPI_Get_address( &(r_sample.v[0]), disp + 4 );
+//    for ( i = 0; i < 5; ++i )
+//    {
+//        disp[i] -= base;
+//    }
+    disp[0] = offsetof( restart_atom, orig_id );
+    disp[1] = offsetof( restart_atom, type );
+    disp[2] = offsetof( restart_atom, name );
+    disp[3] = offsetof( restart_atom, x );
+    disp[4] = offsetof( restart_atom, v );
 
     type[0] = type[1] = MPI_INT;
     type[2] = MPI_CHAR;
     type[3] = type[4] = MPI_DOUBLE;
 
-    MPI_Type_struct( 5, block, disp, type, &(mpi_data->restart_atom_type) );
+    MPI_Type_create_struct( 5, block, disp, type, &(mpi_data->restart_atom_type) );
     MPI_Type_commit( &(mpi_data->restart_atom_type) );
 
     return SUCCESS;
@@ -729,56 +577,50 @@ int Init_MPI_Datatypes( reax_system *system, storage *workspace,
 
 
 /********************** allocate lists *************************/
-int  Init_Lists( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace, reax_list **lists,
-                 mpi_datatypes *mpi_data, char *msg )
+int Init_Lists( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        mpi_datatypes *mpi_data, char *msg )
 {
     int i, num_nbrs;
     int total_hbonds, total_bonds, bond_cap, num_3body, cap_3body, Htop;
     int *hb_top, *bond_top;
-    int nrecv[MAX_NBRS];
 
     //for( i = 0; i < MAX_NBRS; ++i ) nrecv[i] = system->my_nbrs[i].est_recv;
     //system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv,
-    //        Sort_Boundary_Atoms, Unpack_Exchange_Message, 1 );
+    //        Sort_Boundary_Atoms, Unpack_Exchange_Message, TRUE );
 
     num_nbrs = Estimate_NumNeighbors( system, lists );
-    if (!Make_List(system->total_cap, num_nbrs, TYP_FAR_NEIGHBOR, *lists + FAR_NBRS))
-    {
-        fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
+    Make_List( system->total_cap, num_nbrs, TYP_FAR_NEIGHBOR, *lists + FAR_NBRS );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated far_nbrs: num_far=%d, space=%dMB\n",
-             system->my_rank, num_nbrs,
-             (int)(num_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) );
+            system->my_rank, num_nbrs,
+            (int)(num_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) );
 #endif
 
     Generate_Neighbor_Lists( system, data, workspace, lists );
-    bond_top = (int*) calloc( system->total_cap, sizeof(int) );
-    hb_top = (int*) calloc( system->local_cap, sizeof(int) );
-    //hb_top = (int*) calloc( system->Hcap, sizeof(int) );
-    
+    bond_top = (int*) scalloc( system->total_cap, sizeof(int), "Init_Lists::bond_top" );
+    hb_top = (int*) scalloc( system->local_cap, sizeof(int), "Init_Lists::hb_top" );
+//    hb_top = (int*) scalloc( system->Hcap, sizeof(int), "Init_Lists::hb_top" );
     
     Estimate_Storages( system, control, lists,
-                       &Htop, hb_top, bond_top, &num_3body );
-  //Host_Estimate_Sparse_Matrix( system, control, lists, system->local_cap, system->total_cap,
-      //                      &Htop, hb_top, bond_top, &num_3body );
+            &Htop, hb_top, bond_top, &num_3body );
+//    Host_Estimate_Sparse_Matrix( system, control, lists, system->local_cap, system->total_cap,
+//            &Htop, hb_top, bond_top, &num_3body );
     
-  
-    
-    Allocate_Matrix( &(workspace->H), system->local_cap, Htop );
+    Allocate_Matrix( &(workspace->H), system->n, Htop );
     
     //MATRIX CHANGES
     //workspace->L = NULL;
     //workspace->U = NULL;
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n",
              system->my_rank, Htop,
              (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
 #endif
 
-    if ( control->hbond_cut > 0 )
+    if ( control->hbond_cut > 0.0 )
     {
         // init H indexes
         total_hbonds = 0;
@@ -788,16 +630,11 @@ int  Init_Lists( reax_system *system, control_params *control,
             total_hbonds += hb_top[i];
         }
         total_hbonds = MAX( total_hbonds * SAFER_ZONE, MIN_CAP * MIN_HBONDS );
+        // DANIEL: set max_hbonds so that Mpi_Not_Gpu_Validate_Lists() does not complain that it is 0
+        system->max_hbonds = total_hbonds * SAFER_ZONE;
 
-       // DANIEL, to make Mpi_Not_Gpu_Validate_Lists() not complain that system->max_bonds is 0
-       system->max_hbonds = total_hbonds * SAFER_ZONE;
+        Make_List( system->Hcap, total_hbonds, TYP_HBOND, *lists + HBONDS );
 
-
-        if ( !Make_List( system->Hcap, total_hbonds, TYP_HBOND, *lists + HBONDS) )
-        {
-            fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
-            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        }
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: allocated hbonds: total_hbonds=%d, space=%dMB\n",
                  system->my_rank, total_hbonds,
@@ -806,25 +643,18 @@ int  Init_Lists( reax_system *system, control_params *control,
     }
 
     /* bonds list */
-    //Allocate_Bond_List( system->N, bond_top, (*lists)+BONDS );
-    //num_bonds = bond_top[system->N-1];
     total_bonds = 0;
     for ( i = 0; i < system->N; ++i )
     {
         system->my_atoms[i].num_bonds = bond_top[i];
         total_bonds += bond_top[i];
+        // DANIEL: set the per-atom max_bonds so that Mpi_Not_Gpu_Validate_Lists() does not complain that it is 0
+        system->max_bonds[i] = MAX( bond_top[i], MIN_BONDS );
     }
     bond_cap = MAX( total_bonds * SAFE_ZONE, MIN_CAP * MIN_BONDS );
- 
 
-    // DANIEL, to make Mpi_Not_Gpu_Validate_Lists() not complain that system->max_bonds is 0
-    system->max_bonds = total_bonds * SAFER_ZONE;
+    Make_List( system->total_cap, bond_cap, TYP_BOND, *lists + BONDS);
 
-    if ( !Make_List( system->total_cap, bond_cap, TYP_BOND, *lists + BONDS) )
-    {
-        fprintf( stderr, "not enough space for bonds list. terminating!\n" );
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated bonds: total_bonds=%d, space=%dMB\n",
              system->my_rank, bond_cap,
@@ -833,11 +663,8 @@ int  Init_Lists( reax_system *system, control_params *control,
 
     /* 3bodies list */
     cap_3body = MAX( num_3body * SAFE_ZONE, MIN_3BODIES );
-    if ( !Make_List(bond_cap, cap_3body, TYP_THREE_BODY, *lists + THREE_BODIES) )
-    {
-        fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
+    Make_List(bond_cap, cap_3body, TYP_THREE_BODY, *lists + THREE_BODIES);
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated 3-body list: num_3body=%d, space=%dMB\n",
              system->my_rank, cap_3body,
@@ -845,254 +672,34 @@ int  Init_Lists( reax_system *system, control_params *control,
 #endif
 
 #if defined(TEST_FORCES)
-    if (!Make_List(system->total_cap, bond_cap * 8, TYP_DDELTA, (*lists) + DDELTAS))
-    {
-        fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" );
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
+    Make_List(system->total_cap, bond_cap * 8, TYP_DDELTA, (*lists) + DDELTAS);
+
     fprintf( stderr, "p%d: allocated dDelta list: num_ddelta=%d space=%ldMB\n",
              system->my_rank, bond_cap * 30,
              bond_cap * 8 * sizeof(dDelta_data) / (1024 * 1024) );
 
-    if ( !Make_List( bond_cap, bond_cap * 50, TYP_DBO, (*lists) + DBOS) )
-    {
-        fprintf( stderr, "Problem in initializing dBO list. Terminating!\n" );
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
+    Make_List( bond_cap, bond_cap * 50, TYP_DBO, (*lists) + DBOS);
+
     fprintf( stderr, "p%d: allocated dbond list: num_dbonds=%d space=%ldMB\n",
              system->my_rank, bond_cap * MAX_BONDS * 3,
              bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) );
 #endif
 
-    free( hb_top );
-    free( bond_top );
+    sfree( hb_top, "Init_Lists::hb_top" );
+    sfree( bond_top, "Init_Lists::bond_top" );
 
     return SUCCESS;
 }
 
 
-#ifdef HAVE_CUDA
-int  Cuda_Init_Lists( reax_system *system, control_params *control,
-                      simulation_data *data, storage *workspace, reax_list **lists,
-                      mpi_datatypes *mpi_data, char *msg )
-{
-    int i, num_nbrs;
-    int total_hbonds, total_bonds, bond_cap, num_3body, cap_3body, Htop;
-    int *hb_top, *bond_top;
-    int nrecv[MAX_NBRS];
-
-    int *nbr_indices = (int *) host_scratch;
-
-    //num_nbrs = Estimate_NumNeighbors( system, lists );
-    Cuda_Estimate_Neighbors (system, nbr_indices);
-    num_nbrs = 0;
-    //for (i = 0; i < 20; i++)
-    //fprintf (stderr, "atom: %d -- %d \n", i, nbr_indices[i]);
-
-    for (i = 0; i < system->N; i++)
-        num_nbrs += nbr_indices [i] ;
-
-    //fprintf (stderr, "DEVICE Total Neighbors: %d (%d)\n", num_nbrs, (int)(num_nbrs*SAFE_ZONE));
-
-    for (i = 0; i < system->N; i++)
-        nbr_indices[i] = MAX (nbr_indices [i] * SAFER_ZONE, MIN_NBRS);
-
-    num_nbrs = 0;
-    num_nbrs += nbr_indices [0] ;
-    for (i = 1; i < system->N; i++)
-    {
-        num_nbrs += nbr_indices [i] ;
-        nbr_indices [i] += nbr_indices [i - 1];
-    }
-
-    //fprintf (stderr, "DEVICE total neighbors entries: %d \n", nbr_indices [system->N - 1] );
-
-    if (!Dev_Make_List(system->total_cap, num_nbrs, TYP_FAR_NEIGHBOR, *dev_lists + FAR_NBRS))
-    {
-        fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d: allocated far_nbrs: num_far=%d, space=%dMB\n",
-             system->my_rank, num_nbrs,
-             (int)(num_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) );
-#endif
-
-    //fprintf (stderr, "N: %d and total_cap: %d \n", system->N, system->total_cap);
-    Cuda_Init_Neighbors_Indices (nbr_indices, system->N);
-
-    Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
-
-    bond_top = (int*) calloc( system->total_cap, sizeof(int) );
-    //hb_top = (int*) calloc( system->local_cap, sizeof(int) );
-    hb_top = (int*) calloc( system->total_cap, sizeof(int) );
-    Cuda_Estimate_Storages( system, control, lists, system->local_cap, system->total_cap,
-                            &Htop, hb_top, bond_top, &num_3body );
-
-
-    //TODO - CARVER FIX
-    //TODO - CARVER FIX
-    //TODO - CARVER FIX
-    //TODO - CARVER FIX
-    //TODO - CARVER FIX
-    //TODO - CARVER FIX
-
-    Cuda_Estimate_Sparse_Matrix (system, control, data, lists);
-    //dev_alloc_matrix ( &(dev_workspace->H), system->local_cap, system->n * system->max_sparse_entries);
-    //dev_alloc_matrix ( &(dev_workspace->H), system->total_cap, system->N * system->max_sparse_entries);
-    dev_alloc_matrix ( &(dev_workspace->H), system->total_cap, system->total_cap * system->max_sparse_entries);
-    dev_workspace->H.n = system->n;
-    //THIS IS INITIALIZED in the init_forces function to system->n
-    //but this is never used in the code.
-    //GPU maintains the H matrix to be (NXN) symmetric matrix.
-
-    //TODO - CARVER FIX
-    //TODO - CARVER FIX
-    //TODO - CARVER FIX
-    //TODO - CARVER FIX
-    //TODO - CARVER FIX
-
-    //MATRIX CHANGES
-    //workspace->L = NULL;
-    //workspace->U = NULL;
-
-#if defined(DEBUG_FOCUS)
-    fprintf (stderr, "p:%d - allocated H matrix: max_entries: %d, cap: %d \n",
-             system->my_rank, system->max_sparse_entries, dev_workspace->H.m);
-    fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n",
-             system->my_rank, Htop,
-             (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
-#endif
-
-    // FIX - 4 - Added addition check here for hydrogen Bonds
-    if (( control->hbond_cut > 0 ) && (system->numH))
-    {
-        /* init H indexes */
-        total_hbonds = 0;
-        int count = 0;
-        //TODO
-        //for( i = 0; i < system->n; ++i ) {
-        for ( i = 0; i < system->N; ++i )
-        {
-            //system->my_atoms[i].num_hbonds = hb_top[i];
-            //TODO
-            hb_top [i] = MAX( hb_top[i] * 4, MIN_HBONDS * 4);
-            total_hbonds += hb_top[i];
-            if (hb_top [i] > 0) count ++;
-        }
-        total_hbonds = MAX( total_hbonds, MIN_CAP * MIN_HBONDS );
-
-        //fprintf (stderr, "HCap value is --> %d, system->n is : %d (%d)\n", system->Hcap, system->n, count);
-        //fprintf (stderr, "Total Hydrogen Bonds --> %d ** misc %d \n", total_hbonds, hb_top[4021] );
-        //if( !Dev_Make_List( system->local_cap, total_hbonds, TYP_HBOND, *dev_lists+HBONDS) ) {
-
-        /*************/
-        //CHANGE ORIGINAL
-        //if( !Dev_Make_List( system->total_cap, total_hbonds, TYP_HBOND, *dev_lists+HBONDS) ) {
-        if ( !Dev_Make_List( system->total_cap, system->total_cap * system->max_hbonds, TYP_HBOND, *dev_lists + HBONDS) )
-        {
-            /**************/
-            fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
-            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-        }
-
-#if defined(DEBUG_FOCUS)
-        fprintf (stderr, "**** Total HBonds allocated --> %d total_cap: %d per atom: %d, max_hbonds: %d \n",
-                 total_hbonds, system->total_cap, (total_hbonds / system->total_cap), system->max_hbonds );
-#endif
-
-        //TODO
-        //Cuda_Init_HBond_Indices (hb_top, system->n);
-        /****/
-        //THIS IS COMMENTED OUT - CHANGE ORIGINAL
-        //Cuda_Init_HBond_Indices (hb_top, system->N);
-        //THIS IS COMMENTED OUT - CHANGE ORIGINAL
-        /****/
-
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: allocated hbonds: total_hbonds=%d, space=%dMB\n",
-                 system->my_rank, total_hbonds,
-                 (int)(total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
-#endif
-    }
-
-    /* bonds list */
-    total_bonds = 0;
-    for ( i = 0; i < system->N; ++i )
-    {
-        //system->my_atoms[i].num_bonds = bond_top[i];
-        num_3body += SQR (bond_top [i]);
-        total_bonds += MAX (bond_top[i] * 4, MIN_BONDS);
-    }
-    bond_cap = MAX( total_bonds, MIN_CAP * MIN_BONDS );
-
-#if defined(DEBUG)
-    fprintf (stderr, "**** Total Bonds allocated --> %d total_cap: %d per atom: %d, max_bonds: %d \n",
-             bond_cap, system->total_cap, (bond_cap / system->total_cap), system->max_bonds );
-#endif
-
-
-    /***************/
-    //CHANGE ORIGINAL
-    //if( !Dev_Make_List( system->total_cap, bond_cap, TYP_BOND, *dev_lists + BONDS) ) {
-    if ( !Dev_Make_List( system->total_cap, system->total_cap * system->max_bonds, TYP_BOND, *dev_lists + BONDS) )
-    {
-        /***************/
-        fprintf( stderr, "not enough space for bonds list. terminating!\n" );
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
-
-    //TODO
-    //if( !Make_List( system->total_cap, bond_cap, TYP_BOND, *lists + BONDS) ) {
-    if ( !Make_List( system->total_cap, system->total_cap * system->max_bonds, TYP_BOND, *lists + BONDS) )
-    {
-        fprintf( stderr, "not enough space for bonds list. terminating!\n" );
-        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-    }
-    //TODO
-    /****/
-    //CHANGE ORIGINAL
-    //Cuda_Init_Bond_Indices (bond_top, system->N, bond_cap);
-    //CHANGE ORIGINAL
-    /****/
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d: allocated bonds: total_bonds=%d, space=%dMB\n",
-             system->my_rank, bond_cap,
-             (int)(bond_cap * sizeof(bond_data) / (1024 * 1024)) );
-#endif
-
-    /* 3bodies list */
-    //cap_3body = MAX( num_3body*SAFE_ZONE, MIN_3BODIES );
-    cap_3body = MAX( num_3body * SAFE_ZONE, MIN_3BODIES );
-    //fprintf (stderr, "Total 3 bodies n: %d num_intrs: %d \n", bond_cap, bond_cap * MAX_THB_INTRS);
-    //if( !Dev_Make_List(bond_cap, cap_3body, TYP_THREE_BODY, *lists + THREE_BODIES) ){
-//  if( !Dev_Make_List(bond_cap, bond_cap * MAX_THB_INTRS, TYP_THREE_BODY, *dev_lists + THREE_BODIES) ){
-//    fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-//    MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-//  }
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d: allocated 3-body list: num_3body=%d, space=%dMB\n",
-             system->my_rank, cap_3body,
-             (int)(cap_3body * sizeof(three_body_interaction_data) / (1024 * 1024)) );
-#endif
-
-    free( hb_top );
-    free( bond_top );
-
-    return SUCCESS;
-}
-#endif
-
-
 #if defined(PURE_REAX)
 void Initialize( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 reax_list **lists, output_controls *out_control,
-                 mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
 
-    host_scratch = (void *)malloc (HOST_SCRATCH_SIZE );
+    host_scratch = (void *) smalloc( HOST_SCRATCH_SIZE, "Initialize::host_scratch" );
 
     char msg[MAX_STR];
 
@@ -1104,6 +711,7 @@ void Initialize( reax_system *system, control_params *control,
                  system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
     }
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized mpi datatypes\n", system->my_rank );
 #endif
@@ -1115,29 +723,19 @@ void Initialize( reax_system *system, control_params *control,
                  system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
     }
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: system initialized\n", system->my_rank );
 #endif
 
-    if ( Init_Simulation_Data( system, control, data, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: sim_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
+    Init_Simulation_Data( system, control, data, msg );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized simulation data\n", system->my_rank );
 #endif
 
-    if ( Init_Workspace( system, control, workspace, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d:init_workspace: not enough memory\n",
-                 system->my_rank );
-        fprintf( stderr, "p%d:workspace couldn't be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
+    Init_Workspace( system, control, workspace, msg );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized workspace\n", system->my_rank );
 #endif
@@ -1150,72 +748,67 @@ void Initialize( reax_system *system, control_params *control,
                  system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
     }
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized lists\n", system->my_rank );
 #endif
 
-    if (Init_Output_Files(system, control, out_control, mpi_data, msg) == FAILURE)
+    if ( Init_Output_Files(system, control, out_control, mpi_data, msg) == FAILURE)
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: could not open output files! terminating...\n",
                  system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
     }
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: output files opened\n", system->my_rank );
 #endif
 
     if ( control->tabulate )
     {
-        if ( Init_Lookup_Tables(system, control, workspace, mpi_data, msg) == FAILURE )
+        if ( Init_Lookup_Tables(system, control, workspace->Tap, mpi_data, msg) == FAILURE )
         {
             fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
             fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n",
                      system->my_rank );
             MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
         }
+
 #if defined(DEBUG)
         fprintf( stderr, "p%d: initialized lookup tables\n", system->my_rank );
 #endif
     }
 
     Init_Force_Functions( control );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized force functions\n", system->my_rank );
 #endif
+
     /*#ifdef TEST_FORCES
       Init_Force_Test_Functions();
       fprintf(stderr,"p%d: initialized force test functions\n",system->my_rank);
       #endif */
 }
 
+
 void Pure_Initialize( reax_system *system, control_params *control,
-                      simulation_data *data, storage *workspace,
-                      reax_list **lists, output_controls *out_control,
-                      mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
     char msg[MAX_STR];
 
-    if ( Init_Simulation_Data( system, control, data, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: sim_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
+    Init_Simulation_Data( system, control, data, msg );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized simulation data\n", system->my_rank );
 #endif
     fprintf( stderr, "p%d: pure initialized simulation data\n", system->my_rank );
 
-    if ( Init_Workspace( system, control, workspace, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d:init_workspace: not enough memory\n",
-                 system->my_rank );
-        fprintf( stderr, "p%d:workspace couldn't be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
+    Init_Workspace( system, control, workspace, msg );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized workspace\n", system->my_rank );
 #endif
@@ -1238,127 +831,16 @@ void Pure_Initialize( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Initialize( reax_system *system, control_params *control,
-                      simulation_data *data, storage *workspace,
-                      reax_list **lists, output_controls *out_control,
-                      mpi_datatypes *mpi_data )
-{
-    char msg[MAX_STR];
-    real t_start, t_end;
-
-    //HOST SCRATCH and Device SCRATCH
-    Cuda_Init_ScratchArea ();
-
-    //MPI_DATATYPES
-    if ( Init_MPI_Datatypes( system, workspace, mpi_data, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d: init_mpi_datatypes: could not create datatypes\n",
-                 system->my_rank );
-        fprintf( stderr, "p%d: mpi_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-
-    //SYSTEM
-    if ( Cuda_Init_System(system, control, data, workspace, mpi_data, msg) == FAILURE )
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-
-    //GRID
-    dev_alloc_grid (system);
-    Sync_Grid (&system->my_grid, &system->d_my_grid);
-
-    //validate_grid (system);
-    //exit (0);
-
-    //SIMULATION_DATA
-    dev_alloc_simulation_data (data);
-    if ( Cuda_Init_Simulation_Data( system, control, data, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: sim_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-
-    //WORKSPACE
-    if ( Cuda_Init_Workspace( system, control, workspace, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d:init_workspace: not enough memory\n",
-                 system->my_rank );
-        fprintf( stderr, "p%d:workspace couldn't be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: initialized workspace\n", system->my_rank );
-#endif
-    //Sync the taper here from host to device.
-
-    //CONTROL
-    dev_alloc_control (control);
-
-    //LISTS
-    if ( Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg ) ==
-            FAILURE )
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: initialized lists\n", system->my_rank );
-#endif
-
-    //OUTPUT Files
-    if (Init_Output_Files(system, control, out_control, mpi_data, msg) == FAILURE)
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: could not open output files! terminating...\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: output files opened\n", system->my_rank );
-#endif
-
-    //Lookup Tables
-    if ( control->tabulate )
-    {
-        if ( Init_Lookup_Tables(system, control, dev_workspace->Tap, mpi_data, msg) == FAILURE )
-        {
-            fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-            fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n",
-                     system->my_rank );
-            MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-        }
-
-#if defined(DEBUG)
-        fprintf( stderr, "p%d: initialized lookup tables\n", system->my_rank );
-#endif
-    }
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: Device Initialization Done \n", system->my_rank );
-#endif
-}
-#endif
-
-
 #elif defined(LAMMPS_REAX)
 void Initialize( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 reax_list **lists, output_controls *out_control,
-                 mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
     char msg[MAX_STR];
-    host_scratch = (void *)malloc (HOST_SCRATCH_SIZE );
+
+    host_scratch = (void *) smalloc( HOST_SCRATCH_SIZE, "Initialize::host_scratch" );
+
     if ( Init_System(system, msg) == FAILURE )
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
@@ -1366,29 +848,19 @@ void Initialize( reax_system *system, control_params *control,
                  system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
     }
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: system initialized\n", system->my_rank );
 #endif
 
-    if ( Init_Simulation_Data( system, control, data, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: sim_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
+    Init_Simulation_Data( system, control, data, msg );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized simulation data\n", system->my_rank );
 #endif
 
-    if ( Init_Workspace( system, control, workspace, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d:init_workspace: not enough memory\n",
-                 system->my_rank );
-        fprintf( stderr, "p%d:workspace couldn't be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
+    Init_Workspace( system, control, workspace, msg );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized workspace\n", system->my_rank );
 #endif
@@ -1401,6 +873,7 @@ void Initialize( reax_system *system, control_params *control,
                  system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
     }
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized mpi datatypes\n", system->my_rank );
 #endif
@@ -1412,6 +885,7 @@ void Initialize( reax_system *system, control_params *control,
                  system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
     }
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized lists\n", system->my_rank );
 #endif
@@ -1423,29 +897,32 @@ void Initialize( reax_system *system, control_params *control,
                  system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
     }
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: output files opened\n", system->my_rank );
 #endif
 
     if ( control->tabulate )
     {
-        if ( Init_Lookup_Tables( system, control, workspace, mpi_data, msg ) == FAILURE )
+        if ( Init_Lookup_Tables( system, control, workspace->Tap, mpi_data, msg ) == FAILURE )
         {
             fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
             fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n",
                      system->my_rank );
             MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
         }
+
 #if defined(DEBUG)
         fprintf( stderr, "p%d: initialized lookup tables\n", system->my_rank );
 #endif
     }
 
-
     Init_Force_Functions( );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized force functions\n", system->my_rank );
 #endif
+
     /*#if defined(TEST_FORCES)
       Init_Force_Test_Functions();
       fprintf(stderr,"p%d: initialized force test functions\n",system->my_rank);
diff --git a/PG-PuReMD/src/init_md.h b/PG-PuReMD/src/init_md.h
index f1f50765807ebeac6f737155a2035d9a325e0752..c5222cbd3d2b2a4d2ddef5fe3b976e7d0644ffab 100644
--- a/PG-PuReMD/src/init_md.h
+++ b/PG-PuReMD/src/init_md.h
@@ -24,13 +24,26 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Generate_Initial_Velocities( reax_system *, real );
+
+int Init_MPI_Datatypes( reax_system *, storage *, mpi_datatypes *, char * );
+
 void Initialize( reax_system*, control_params*, simulation_data*,
-                 storage*, reax_list**, output_controls*, mpi_datatypes* );
+        storage*, reax_list**, output_controls*, mpi_datatypes* );
 
 void Pure_Initialize( reax_system*, control_params*, simulation_data*,
-                      storage*, reax_list**, output_controls*, mpi_datatypes* );
+        storage*, reax_list**, output_controls*, mpi_datatypes* );
+
+void Init_Taper( control_params *,  storage * );
+
+#ifdef __cplusplus
+}
+#endif
 
-void Cuda_Initialize( reax_system*, control_params*, simulation_data*,
-                      storage*, reax_list**, output_controls*, mpi_datatypes* );
 
 #endif
diff --git a/PG-PuReMD/src/integrate.c b/PG-PuReMD/src/integrate.c
index 79bd1f2ebaf6f55c66fab6042ea891e24b992c6a..d3e9239f38e5b826b871ade2a6ade78f6b180c66 100644
--- a/PG-PuReMD/src/integrate.c
+++ b/PG-PuReMD/src/integrate.c
@@ -19,7 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "integrate.h"
+
 #include "allocate.h"
 #include "box.h"
 #include "comm_tools.h"
@@ -32,20 +35,13 @@
 #include "tool_box.h"
 #include "vector.h"
 
-#ifdef HAVE_CUDA
-#include "cuda_integrate.h"
-#include "cuda_copy.h"
-
-#include "cuda_neighbors.h"
-#endif
-
 
-void Velocity_Verlet_NVE( reax_system* system, control_params* control,
-                          simulation_data *data, storage *workspace,
-                          reax_list **lists, output_controls *out_control,
-                          mpi_datatypes *mpi_data )
+int Velocity_Verlet_NVE( reax_system* system, control_params* control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
-    int  i, steps, renbr;
+    int i, steps, renbr, ret;
+    static int verlet_part1_done = FALSE;
     real inv_m, dt, dt_sqr;
     rvec dx;
     reax_atom *atom;
@@ -54,53 +50,72 @@ void Velocity_Verlet_NVE( reax_system* system, control_params* control,
     fprintf( stderr, "p%d @ step %d\n", system->my_rank, data->step );
     MPI_Barrier( MPI_COMM_WORLD );
 #endif
+
     dt = control->dt;
     dt_sqr = SQR(dt);
     steps = data->step - data->prev_steps;
-    renbr = (steps % control->reneighbor == 0);
+    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+
+    ReAllocate( system, control, data, workspace, lists, mpi_data );
 
-    for ( i = 0; i < system->n; i++ )
+    if ( verlet_part1_done == FALSE )
     {
-        atom = &(system->my_atoms[i]);
-        inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
-        rvec_ScaledSum( dx, dt, atom->v, 0.5 * dt_sqr * -F_CONV * inv_m, atom->f );
-        rvec_Add( system->my_atoms[i].x, dx );
-        rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
-    }
+        for ( i = 0; i < system->n; i++ )
+        {
+            atom = &(system->my_atoms[i]);
+            inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
+            /* Compute x(t + dt) */
+            rvec_ScaledSum( dx, dt, atom->v, 0.5 * dt_sqr * -F_CONV * inv_m, atom->f );
+            rvec_Add( system->my_atoms[i].x, dx );
+            /* Compute v(t + dt/2) */
+            rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
+        }
+        verlet_part1_done = TRUE;
 
 #if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
-    MPI_Barrier( MPI_COMM_WORLD );
+        fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
-    ReAllocate( system, control, data, workspace, lists, mpi_data );
-    Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+    }
+
     Reset( system, control, data, workspace, lists );
+
     if ( renbr )
     {
         Generate_Neighbor_Lists( system, data, workspace, lists );
     }
-    Compute_Forces(system, control, data, workspace, lists, out_control, mpi_data);
 
-    for ( i = 0; i < system->n; i++ )
+    ret = Compute_Forces( system, control, data, workspace, lists, out_control, mpi_data );
+
+    if ( ret == SUCCESS )
     {
-        atom = &(system->my_atoms[i]);
-        inv_m = 1.0 / system->reax_param.sbp[ atom->type ].mass;
-        rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
-    }
+        for ( i = 0; i < system->n; i++ )
+        {
+            atom = &(system->my_atoms[i]);
+            inv_m = 1.0 / system->reax_param.sbp[ atom->type ].mass;
+            rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
+        }
 
+        verlet_part1_done = FALSE;
+    }
+    
 #if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step );
+    fprintf( stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step );
     MPI_Barrier( MPI_COMM_WORLD );
 #endif
+
+    return ret;
 }
 
 
-void Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
+int Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
         control_params* control, simulation_data *data, storage *workspace,
         reax_list **lists, output_controls *out_control, mpi_datatypes *mpi_data )
 {
-    int  i, itr, steps, renbr;
+    int i, itr, steps, renbr, ret;
+    static int verlet_part1_done = FALSE;
     real inv_m, coef_v;
     real dt, dt_sqr;
     real my_ekin, new_ekin;
@@ -118,92 +133,108 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
     dt_sqr = SQR(dt);
     therm = &( data->therm );
     steps = data->step - data->prev_steps;
-    renbr = (steps % control->reneighbor == 0);
+    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+
+    ReAllocate( system, control, data, workspace, lists, mpi_data );
 
-    for ( i = 0; i < system->n; i++ )
+    if ( verlet_part1_done == FALSE )
     {
-        atom = &(system->my_atoms[i]);
-        inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
-        rvec_ScaledSum( dx, dt, atom->v, 0.5 * dt_sqr * -F_CONV * inv_m, atom->f );
-        rvec_Add( system->my_atoms[i].x, dx );
-        rvec_Copy( atom->f_old, atom->f );
-    }
+        for ( i = 0; i < system->n; i++ )
+        {
+            atom = &(system->my_atoms[i]);
+            inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
+            rvec_ScaledSum( dx, dt, atom->v, 0.5 * dt_sqr * -F_CONV * inv_m, atom->f );
+            rvec_Add( system->my_atoms[i].x, dx );
+            rvec_Copy( atom->f_old, atom->f );
+        }
+    
+        /* Compute xi(t + dt) */
+        therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
 
-    /* Compute xi(t + dt) */
-    therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
+        verlet_part1_done = TRUE;
 
 #if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step);
-    MPI_Barrier( MPI_COMM_WORLD );
+        fprintf(stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step);
+        MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
-    ReAllocate( system, control, data, workspace, lists, mpi_data );
-    Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+    }
+
     Reset( system, control, data, workspace, lists );
+
     if ( renbr )
     {
         Generate_Neighbor_Lists( system, data, workspace, lists );
     }
-    Compute_Forces( system, control, data, workspace, lists,
-                    out_control, mpi_data );
 
-    /* Compute iteration constants for each atom's velocity */
-    for ( i = 0; i < system->n; ++i )
-    {
-        atom = &(system->my_atoms[i]);
-        inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
-        rvec_Scale( workspace->v_const[i], 1.0 - 0.5 * dt * therm->v_xi, atom->v );
-        rvec_ScaledAdd( workspace->v_const[i], 0.5 * dt * inv_m * -F_CONV, atom->f_old );
-        rvec_ScaledAdd( workspace->v_const[i], 0.5 * dt * inv_m * -F_CONV, atom->f );
-    }
+    ret = Compute_Forces( system, control, data, workspace, lists, out_control, mpi_data );
 
-    v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
-    my_ekin = G_xi_new = v_xi_old = 0;
-    itr = 0;
-    do
+    if ( ret == SUCCESS )
     {
-        itr++;
-
-        /* new values become old in this iteration */
-        v_xi_old = v_xi_new;
-
-        my_ekin = 0;
+        /* Compute iteration constants for each atom's velocity */
         for ( i = 0; i < system->n; ++i )
         {
             atom = &(system->my_atoms[i]);
-            coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
-            rvec_Scale( atom->v, coef_v, workspace->v_const[i] );
-            my_ekin +=
-                (0.5 * system->reax_param.sbp[atom->type].mass * rvec_Dot(atom->v, atom->v));
+            inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
+            rvec_Scale( workspace->v_const[i], 1.0 - 0.5 * dt * therm->v_xi, atom->v );
+            rvec_ScaledAdd( workspace->v_const[i], 0.5 * dt * inv_m * -F_CONV, atom->f_old );
+            rvec_ScaledAdd( workspace->v_const[i], 0.5 * dt * inv_m * -F_CONV, atom->f );
         }
+    
+        v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
+        my_ekin = G_xi_new = v_xi_old = 0;
+        itr = 0;
+        do
+        {
+            itr++;
+    
+            /* new values become old in this iteration */
+            v_xi_old = v_xi_new;
+    
+            my_ekin = 0;
+            for ( i = 0; i < system->n; ++i )
+            {
+                atom = &(system->my_atoms[i]);
+                coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
+                rvec_Scale( atom->v, coef_v, workspace->v_const[i] );
+                my_ekin +=
+                    (0.5 * system->reax_param.sbp[atom->type].mass * rvec_Dot(atom->v, atom->v));
+            }
+    
+            MPI_Allreduce( &my_ekin, &new_ekin, 1, MPI_DOUBLE, MPI_SUM,
+                    mpi_data->comm_mesh3D  );
+    
+            G_xi_new = control->Tau_T * ( 2.0 * new_ekin - data->N_f * K_B * control->T );
+            v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
+        }
+        while ( FABS(v_xi_new - v_xi_old) > 1e-5 );
+        therm->v_xi_old = therm->v_xi;
+        therm->v_xi = v_xi_new;
+        therm->G_xi = G_xi_new;
 
-        MPI_Allreduce( &my_ekin, &new_ekin, 1, MPI_DOUBLE, MPI_SUM,
-                       mpi_data->comm_mesh3D  );
-
-        G_xi_new = control->Tau_T * ( 2.0 * new_ekin - data->N_f * K_B * control->T );
-        v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
-    }
-    while ( fabs(v_xi_new - v_xi_old) > 1e-5 );
-    therm->v_xi_old = therm->v_xi;
-    therm->v_xi = v_xi_new;
-    therm->G_xi = G_xi_new;
+        verlet_part1_done = FALSE;
 
 #if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d @ step%d: T-coupling\n", system->my_rank, data->step);
-    MPI_Barrier( MPI_COMM_WORLD );
+        fprintf(stderr, "p%d @ step%d: T-coupling\n", system->my_rank, data->step);
+        MPI_Barrier( MPI_COMM_WORLD );
 #endif
+    }
+
+    return ret;
 }
 
 
 /* uses Berendsen-type coupling for both T and P.
    All box dimensions are scaled by the same amount,
    there is no change in the angles between axes. */
-void Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
+int Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
         simulation_data *data, storage *workspace, reax_list **lists,
         output_controls *out_control, mpi_datatypes *mpi_data )
 {
-    int i, steps, renbr;
-    real inv_m, dt, lambda;
+    int i, steps, renbr, ret;
+    static int verlet_part1_done = FALSE;
+    real inv_m, dt, dt_sqr, lambda;
     rvec dx;
     reax_atom *atom;
 
@@ -213,280 +244,105 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control
 #endif
     dt = control->dt;
     steps = data->step - data->prev_steps;
-    renbr = (steps % control->reneighbor == 0);
+    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+    dt_sqr = SQR(dt);
 
-    /* velocity verlet, 1st part */
-    for ( i = 0; i < system->n; i++ )
+    ReAllocate( system, control, data, workspace, lists, mpi_data );
+
+    if ( verlet_part1_done == FALSE )
     {
-        atom = &(system->my_atoms[i]);
-        inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
-        /* Compute x(t + dt) */
-        rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
-        rvec_Add( atom->x, dx );
-        /* Compute v(t + dt/2) */
-        rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
-    }
+        /* velocity verlet, 1st part */
+        for ( i = 0; i < system->n; i++ )
+        {
+            atom = &(system->my_atoms[i]);
+            inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
+            rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * dt_sqr, atom->f );
+            rvec_Add( atom->x, dx );
+            rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
+        }
 
 #if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step);
-    MPI_Barrier( MPI_COMM_WORLD );
+        fprintf(stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step);
+        MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
-    ReAllocate( system, control, data, workspace, lists, mpi_data );
-    if ( renbr )
-    {
-        Update_Grid( system, control, mpi_data->world );
+        if ( renbr )
+        {
+            Update_Grid( system, control, mpi_data->world );
+        }
+
+        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
     }
-    Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+
     Reset( system, control, data, workspace, lists );
+
     if ( renbr )
     {
         Generate_Neighbor_Lists( system, data, workspace, lists );
     }
-    Compute_Forces( system, control, data, workspace,
-                    lists, out_control, mpi_data );
 
-    /* velocity verlet, 2nd part */
-    for ( i = 0; i < system->n; i++ )
-    {
-        atom = &(system->my_atoms[i]);
-        inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
-        /* Compute v(t + dt) */
-        rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
-    }
-
-#if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step);
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-    /* temperature scaler */
-    Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-    lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-    if ( lambda < MIN_dT )
-    {
-        lambda = MIN_dT;
-    }
-    else if (lambda > MAX_dT )
-    {
-        lambda = MAX_dT;
-    }
-    lambda = SQRT( lambda );
+    ret = Compute_Forces( system, control, data, workspace,
+            lists, out_control, mpi_data );
 
-    /* Scale velocities and positions at t+dt */
-    for ( i = 0; i < system->n; ++i )
+    if ( ret == SUCCESS )
     {
-        atom = &(system->my_atoms[i]);
-        rvec_Scale( atom->v, lambda, atom->v );
-    }
-    Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d: scaled velocities\n",
-             system->my_rank, data->step );
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-}
-
-
-#ifdef HAVE_CUDA
-void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system,
-        control_params* control,
-        simulation_data *data,
-        storage *workspace,
-        reax_list **lists,
-        output_controls *out_control,
-        mpi_datatypes *mpi_data )
-{
-    int i, steps, renbr;
-    real inv_m, dt, lambda;
-    rvec dx;
-    reax_atom *atom;
-
-    int *nbr_indices, num_nbrs;
-    int *bond_top, *hb_top;
-    int Htop, num_3body;
-    int total_hbonds, count, total_bonds;
-    int bond_cap, cap_3body;
-
-    real t_over_start, t_over_elapsed;
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d\n", system->my_rank, data->step );
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-    dt = control->dt;
-    steps = data->step - data->prev_steps;
-    renbr = (steps % control->reneighbor == 0);
-
-    /* velocity verlet, 1st part */
-    bNVT_update_velocity_part1 (system, dt);
+        /* velocity verlet, 2nd part */
+        for ( i = 0; i < system->n; i++ )
+        {
+            atom = &(system->my_atoms[i]);
+            inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
+            /* Compute v(t + dt) */
+            rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
+        }
 
 #if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step);
-    MPI_Barrier( MPI_COMM_WORLD );
+        fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step);
+        MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
-    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
-
-    if ( renbr )
-        Update_Grid( system, control, mpi_data->world );
-
-    Output_Sync_Atoms (system);
-    Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
-    Sync_Atoms ( system );
-
-    //Synch the Grid to the Device here
-    Sync_Grid (&system->my_grid, &system->d_my_grid );
-
-    init_blocks (system);
-#if defined(__CUDA_DEBUG_LOG__)
-    fprintf (stderr, "p:%d - Matvec BLocks: %d, blocksize: %d \n",
-             system->my_rank, MATVEC_BLOCKS, MATVEC_BLOCK_SIZE);
-#endif
-
-    //Reset( system, control, data, workspace, lists );
-    Cuda_Reset( system, control, data, workspace, lists );
-
-    if ( renbr )
-    {
-#if defined(DEBUG)
-        t_over_start  = Get_Time ();
-#endif
-
-        nbr_indices = (int *) host_scratch;
-        memset (nbr_indices, 0, sizeof (int) * system->N);
-
-        Cuda_Estimate_Neighbors (system, nbr_indices);
-
-        num_nbrs = 0;
-        for (i = 0; i < system->N; i++)
-            num_nbrs += nbr_indices [i];
-
-        num_nbrs = 0;
-        for (i = 0; i < system->N; i++)
+        /* temperature scaler */
+        Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+        lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
+        if ( lambda < MIN_dT )
         {
-            nbr_indices [i] = MAX (nbr_indices[i] * SAFE_ZONE, MIN_NBRS);
-            num_nbrs += nbr_indices [i];
+            lambda = MIN_dT;
         }
-
-        if (num_nbrs >= (*dev_lists + FAR_NBRS)->num_intrs)
+        else if (lambda > MAX_dT )
         {
-            fprintf (stderr, "p%d: Total neighbors: %d is greater than available entries: %d \n",
-                     system->my_rank, num_nbrs, (*dev_lists + FAR_NBRS)->num_intrs);
-            exit (0);
-        }
-
-        for (i = 1; i < system->N; i++)
-            nbr_indices [i] += nbr_indices [i - 1];
-
-        Cuda_Init_Neighbors_Indices (nbr_indices, system->N);
-        Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
-
-        /*
-        memset (host_scratch, 0, sizeof (int) * (2 * system->total_cap));
-        bond_top = (int *) host_scratch;
-        hb_top = bond_top + system->total_cap;
-        Htop = 0;
-        num_3body = 0;
-
-        Cuda_Estimate_Storages( system, control, lists, system->local_cap, system->total_cap,
-             &Htop, hb_top, bond_top, &num_3body );
-
-        if( control->hbond_cut > 0 ) {
-        total_hbonds = 0;
-        count = 0;
-
-        for( i = 0; i < system->N; ++i ) {
-        hb_top [i] = MAX( hb_top[i] * 2, MIN_HBONDS * 2);
-        total_hbonds += hb_top[i];
-        if (hb_top [i] > 0) count ++;
-        }
-        total_hbonds = MAX( total_hbonds, MIN_CAP*MIN_HBONDS );
-
-        if (total_hbonds >= (*dev_lists + HBONDS)->num_intrs){
-            fprintf (stderr, "p%d: Total HBonds: %d and allocated: %d \n",
-                                    system->my_rank, total_hbonds, (*dev_lists + HBONDS)->num_intrs);
-            exit (0);
-        }
-        Cuda_Init_HBond_Indices (hb_top, system->N);
-        }
-
-        // bonds list
-        total_bonds = 0;
-        for( i = 0; i < system->N; ++i ) {
-        num_3body += SQR (bond_top [i]);
-        total_bonds += MAX (bond_top[i] * 2, MIN_BONDS);
+            lambda = MAX_dT;
         }
-        bond_cap = MAX( total_bonds, MIN_CAP*MIN_BONDS );
+        lambda = SQRT( lambda );
 
-        if (total_bonds >= (*dev_lists + BONDS)->num_intrs){
-            fprintf (stderr, "p:%d Bonds: %d and allocated: %d \n",
-                                    system->my_rank, total_hbonds, (*dev_lists + BONDS)->num_intrs);
-            exit (0);
+        /* Scale velocities and positions at t+dt */
+        for ( i = 0; i < system->n; ++i )
+        {
+            atom = &(system->my_atoms[i]);
+            rvec_Scale( atom->v, lambda, atom->v );
         }
-
-        Cuda_Init_Bond_Indices (bond_top, system->N, bond_cap);
-        */
-
-#if defined(DEBUG)
-        t_over_elapsed  = Get_Timing_Info (t_over_start);
-        fprintf (stderr, "p%d --> Overhead (Step-%d) %f \n",
-                 system->my_rank, data->step, t_over_elapsed);
-#endif
-    }
-
-    //Compute_Forces( system, control, data, workspace,
-    //          lists, out_control, mpi_data );
-    Cuda_Compute_Forces( system, control, data, workspace,
-                         lists, out_control, mpi_data );
-
-    /* velocity verlet, 2nd part */
-    bNVT_update_velocity_part2 (system, dt);
+        Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
 
 #if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step);
-    MPI_Barrier( MPI_COMM_WORLD );
+        fprintf( stderr, "p%d @ step%d: scaled velocities\n",
+                 system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
-    /* temperature scaler */
-    //Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-    Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-
-    lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-    if ( lambda < MIN_dT )
-    {
-        lambda = MIN_dT;
+        verlet_part1_done = FALSE;
     }
-    else if (lambda > MAX_dT )
-    {
-        lambda = MAX_dT;
-    }
-    lambda = SQRT( lambda );
-
-    /* Scale velocities and positions at t+dt */
-    bNVT_scale_velocities (system, lambda);
 
-    //Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-    Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d: scaled velocities\n",
-             system->my_rank, data->step );
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
+    return ret;
 }
-#endif
 
 
 /* uses Berendsen-type coupling for both T and P.
-   All box dimensions are scaled by the same amount,
-   there is no change in the angles between axes. */
-void Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* control,
+ * All box dimensions are scaled by the same amount,
+ * there is no change in the angles between axes. */
+int Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* control,
         simulation_data *data, storage *workspace, reax_list **lists,
         output_controls *out_control, mpi_datatypes *mpi_data )
 {
-    int i, steps, renbr;
+    int i, steps, renbr, ret;
+    static int verlet_part1_done = FALSE;
     real inv_m, dt;
     rvec dx;
     reax_atom *atom;
@@ -495,61 +351,78 @@ void Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* control
     fprintf( stderr, "p%d @ step%d\n", system->my_rank, data->step );
     MPI_Barrier( MPI_COMM_WORLD );
 #endif
+
     dt = control->dt;
     steps = data->step - data->prev_steps;
-    renbr = (steps % control->reneighbor == 0);
+    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+
+    ReAllocate( system, control, data, workspace, lists, mpi_data );
 
     /* velocity verlet, 1st part */
-    for ( i = 0; i < system->n; i++ )
+    if ( verlet_part1_done == FALSE )
     {
-        atom = &(system->my_atoms[i]);
-        inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
-        /* Compute x(t + dt) */
-        rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
-        rvec_Add( atom->x, dx );
-        /* Compute v(t + dt/2) */
-        rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
-    }
+        for ( i = 0; i < system->n; i++ )
+        {
+            atom = &(system->my_atoms[i]);
+            inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
+            /* Compute x(t + dt) */
+            rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
+            rvec_Add( atom->x, dx );
+            /* Compute v(t + dt/2) */
+            rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
+        }
 
 #if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step);
-    MPI_Barrier( MPI_COMM_WORLD );
+        fprintf(stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step);
+        MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
-    ReAllocate( system, control, data, workspace, lists, mpi_data );
+        verlet_part1_done = TRUE;
+    }
+
     if ( renbr )
     {
         Update_Grid( system, control, mpi_data->world );
     }
+
     Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
     Reset( system, control, data, workspace, lists );
+
     if ( renbr )
     {
         Generate_Neighbor_Lists( system, data, workspace, lists );
     }
-    Compute_Forces( system, control, data, workspace,
-                    lists, out_control, mpi_data );
 
-    /* velocity verlet, 2nd part */
-    for ( i = 0; i < system->n; i++ )
+    ret = Compute_Forces( system, control, data, workspace,
+            lists, out_control, mpi_data );
+
+    if ( ret == SUCCESS )
     {
-        atom = &(system->my_atoms[i]);
-        inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
-        /* Compute v(t + dt) */
-        rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
-    }
+        /* velocity verlet, 2nd part */
+        for ( i = 0; i < system->n; i++ )
+        {
+            atom = &(system->my_atoms[i]);
+            inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
+            /* Compute v(t + dt) */
+            rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
+        }
 
 #if defined(DEBUG_FOCUS)
-    fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step);
-    MPI_Barrier( MPI_COMM_WORLD );
+        fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step);
+        MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
-    Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-    Compute_Pressure( system, control, data, mpi_data );
-    Scale_Box( system, control, data, mpi_data );
+        Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+        Compute_Pressure( system, control, data, mpi_data );
+        Scale_Box( system, control, data, mpi_data );
+
+        verlet_part1_done = FALSE;
 
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d: scaled box\n", system->my_rank, data->step );
-    MPI_Barrier( MPI_COMM_WORLD );
+        fprintf( stderr, "p%d @ step%d: scaled box\n", system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
 #endif
+    }
+
+    return ret;
 }
diff --git a/PG-PuReMD/src/integrate.h b/PG-PuReMD/src/integrate.h
index 8a08d6ddfeda1d4afc05762dcfe75489d66ac263..9a25c761647034226eb1c58abe07cbd7dfce2197 100644
--- a/PG-PuReMD/src/integrate.h
+++ b/PG-PuReMD/src/integrate.h
@@ -24,37 +24,39 @@
 
 #include "reax_types.h"
 
-void Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*,
-                          storage*, reax_list**, output_controls*,
-                          mpi_datatypes* );
 
-void Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*,
-        simulation_data*, storage*,
-        reax_list**, output_controls*,
-        mpi_datatypes* );
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-void Velocity_Verlet_Berendsen_NVT( reax_system*, control_params*,
-                                    simulation_data*, storage*,
-                                    reax_list**, output_controls*,
-                                    mpi_datatypes* );
+int Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*,
+        storage*, reax_list**, output_controls*, mpi_datatypes* );
 
-void Velocity_Verlet_Berendsen_NPT( reax_system*, control_params*,
-                                    simulation_data*, storage*,
-                                    reax_list**, output_controls*,
-                                    mpi_datatypes* );
+int Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*,
+        mpi_datatypes* );
+
+int Velocity_Verlet_Berendsen_NVT( reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*,
+        mpi_datatypes* );
 
-/* void Velocity_Verlet_Nose_Hoover_NVT( reax_system*, control_params*,
-                      simulation_data*, storage*, reax_list**,
-                      output_controls*, mpi_datatypes* );
+int Velocity_Verlet_Berendsen_NPT( reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*,
+        mpi_datatypes* );
 
-   void Velocity_Verlet_Flexible_NPT( reax_system*, control_params*,
+/*
+int Velocity_Verlet_Nose_Hoover_NVT( reax_system*, control_params*,
                    simulation_data*, storage*, reax_list**,
-                   output_controls*, mpi_datatypes* ); */
+                   output_controls*, mpi_datatypes* );
+
+int Velocity_Verlet_Flexible_NPT( reax_system*, control_params*,
+                simulation_data*, storage*, reax_list**,
+                output_controls*, mpi_datatypes* );
+*/
+
+#ifdef __cplusplus
+}
+#endif
 
-//CUDA SPECIFIC FUNCTIONS
-void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system*, control_params*,
-        simulation_data*, storage*,
-        reax_list**, output_controls*,
-        mpi_datatypes* );
 
 #endif
diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c
index 1586b645dda94e568c2e59b3bb15aa3799758f8c..3cce1777cfea5efff1793db2aa1f7019471b9a04 100644
--- a/PG-PuReMD/src/io_tools.c
+++ b/PG-PuReMD/src/io_tools.c
@@ -20,33 +20,36 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
-#include "io_tools.h"
-#include "basic_comm.h"
-#include "list.h"
-#include "reset_tools.h"
-#include "system_props.h"
-#include "tool_box.h"
-#include "traj.h"
-#include "vector.h"
+  #include "io_tools.h"
+  #include "basic_comm.h"
+  #include "list.h"
+  #include "reset_tools.h"
+  #include "system_props.h"
+  #include "tool_box.h"
+  #include "traj.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_io_tools.h"
-#include "reax_basic_comm.h"
-#include "reax_list.h"
-#include "reax_reset_tools.h"
-#include "reax_system_props.h"
-#include "reax_tool_box.h"
-#include "reax_traj.h"
-#include "reax_vector.h"
+  #include "reax_io_tools.h"
+  #include "reax_basic_comm.h"
+  #include "reax_list.h"
+  #include "reax_reset_tools.h"
+  #include "reax_system_props.h"
+  #include "reax_tool_box.h"
+  #include "reax_traj.h"
+  #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
+
 print_interaction Print_Interactions[NUM_INTRS];
 
+
 /************************ initialize output controls ************************/
 int Init_Output_Files( reax_system *system, control_params *control,
-                       output_controls *out_control, mpi_datatypes *mpi_data,
-                       char *msg )
+        output_controls *out_control, mpi_datatypes *mpi_data, char *msg )
 {
     char temp[MAX_STR];
     int ret;
@@ -114,9 +117,9 @@ int Init_Output_Files( reax_system *system, control_params *control,
             sprintf( temp, "%s.log", control->sim_name );
             if ( (out_control->log = fopen( temp, "w" )) != NULL )
             {
-                fprintf( out_control->log, "%6s%8s%8s%8s%8s%8s%8s%8s%8s\n",
+                fprintf( out_control->log, "%6s%8s%8s%8s%8s%8s%8s%8s%8s%8s\n",
                          "step", "total", "comm", "nbrs", "init", "bonded", "nonb",
-                         "qeq", "matvecs" );
+                         "charges", "l iters", "retries" );
                 fflush( out_control->log );
             }
             else
@@ -202,7 +205,6 @@ int Init_Output_Files( reax_system *system, control_params *control,
         MPI_Bcast( &(out_control->mol), 1, MPI_LONG, 0, MPI_COMM_WORLD );
     }
 
-
 #ifdef TEST_ENERGY
     /* open bond energy file */
     sprintf( temp, "%s.ebond.%d", control->sim_name, system->my_rank );
@@ -301,7 +303,6 @@ int Init_Output_Files( reax_system *system, control_params *control,
     }
 #endif
 
-
 #ifdef TEST_FORCES
     /* open bond orders file */
     sprintf( temp, "%s.fbo.%d", control->sim_name, system->my_rank );
@@ -470,7 +471,7 @@ int Init_Output_Files( reax_system *system, control_params *control,
 
 /************************ close output files ************************/
 int Close_Output_Files( reax_system *system, control_params *control,
-                        output_controls *out_control, mpi_datatypes *mpi_data )
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
     if ( out_control->write_steps > 0 )
         End_Traj( system->my_rank, out_control );
@@ -545,7 +546,6 @@ int Close_Output_Files( reax_system *system, control_params *control,
 }
 
 
-
 void Print_Box( simulation_box* box, char *name, FILE *out )
 {
     // int i, j;
@@ -590,7 +590,6 @@ void Print_Box( simulation_box* box, char *name, FILE *out )
 }
 
 
-
 void Print_Grid( grid* g, FILE *out )
 {
     int x, y, z, gc_type;
@@ -635,25 +634,19 @@ void Print_Grid( grid* g, FILE *out )
     fprintf( out, "\t---------------------------------\n" );
 
     fprintf( stderr, "GCELL MARKS:\n" );
-    //SUDHIR
-    //gc_type = g->cells[0][0][0].type;
-    gc_type = g->cells[ index_grid_3d (0, 0, 0, g) ].type;
+    gc_type = g->cells[ index_grid_3d(0, 0, 0, g) ].type;
     ivec_MakeZero( gc_str );
 
     x = y = z = 0;
     for ( x = 0; x < g->ncells[0]; ++x )
         for ( y = 0; y < g->ncells[1]; ++y )
             for ( z = 0; z < g->ncells[2]; ++z )
-                //SUDHIR
-                //if( g->cells[x][y][z].type != gc_type ){
                 if ( g->cells[ index_grid_3d(x, y, z, g) ].type != gc_type )
                 {
                     fprintf( stderr,
                              "\tgcells from(%2d %2d %2d) to (%2d %2d %2d): %d - %s\n",
                              gc_str[0], gc_str[1], gc_str[2], x, y, z,
                              gc_type, gcell_type_text[gc_type] );
-                    //SUDHIR
-                    //gc_type = g->cells[x][y][z].type;
                     gc_type = g->cells[ index_grid_3d(x, y, z, g) ].type;
                     gc_str[0] = x;
                     gc_str[1] = y;
@@ -666,7 +659,6 @@ void Print_Grid( grid* g, FILE *out )
 }
 
 
-
 void Print_GCell_Exchange_Bounds( int my_rank, neighbor_proc *my_nbrs )
 {
     ivec r;
@@ -709,7 +701,6 @@ void Print_GCell_Exchange_Bounds( int my_rank, neighbor_proc *my_nbrs )
 }
 
 
-
 void Print_Native_GCells( reax_system *system )
 {
     int        i, j, k, l;
@@ -731,8 +722,6 @@ void Print_Native_GCells( reax_system *system )
         for ( j = g->native_str[1]; j < g->native_end[1]; j++ )
             for ( k = g->native_str[2]; k < g->native_end[2]; k++ )
             {
-                //SUDHIR
-                //gc = &( g->cells[i][j][k] );
                 gc = &( g->cells[ index_grid_3d(i, j, k, g) ] );
 
                 fprintf( f, "p%d gcell(%2d %2d %2d) of type %d(%s)\n",
@@ -752,7 +741,6 @@ void Print_Native_GCells( reax_system *system )
 }
 
 
-
 void Print_All_GCells( reax_system *system )
 {
     int        i, j, k, l;
@@ -774,8 +762,6 @@ void Print_All_GCells( reax_system *system )
         for ( j = 0; j < g->ncells[1]; j++ )
             for ( k = 0; k < g->ncells[2]; k++ )
             {
-                //SUDHIR
-                //gc = &( g->cells[i][j][k] );
                 gc = &( g->cells[ index_grid_3d(i, j, k, g) ] );
 
                 fprintf( f, "p%d gcell(%2d %2d %2d) of type %d(%s)\n",
@@ -795,7 +781,6 @@ void Print_All_GCells( reax_system *system )
 }
 
 
-
 void Print_My_Atoms( reax_system *system )
 {
     int   i;
@@ -805,7 +790,7 @@ void Print_My_Atoms( reax_system *system )
     sprintf( fname, "my_atoms.%d", system->my_rank );
     if ( (fh = fopen( fname, "w" )) == NULL )
     {
-        fprintf( stderr, "error in opening my_atoms file" );
+        fprintf( stderr, "[ERROR] cannot open my_atoms file" );
         MPI_Abort( MPI_COMM_WORLD, FILE_NOT_FOUND );
     }
 
@@ -833,7 +818,7 @@ void Print_My_Ext_Atoms( reax_system *system )
     sprintf( fname, "my_ext_atoms.%d", system->my_rank );
     if ( (fh = fopen( fname, "w" )) == NULL )
     {
-        fprintf( stderr, "error in opening my_ext_atoms file" );
+        fprintf( stderr, "[ERROR] cannot open my_ext_atoms file" );
         MPI_Abort( MPI_COMM_WORLD, FILE_NOT_FOUND );
     }
 
@@ -853,7 +838,7 @@ void Print_My_Ext_Atoms( reax_system *system )
 
 
 void Print_Far_Neighbors( reax_system *system, reax_list **lists,
-                          control_params *control )
+        control_params *control )
 {
     char  fname[100];
     int   i, j, id_i, id_j, nbr, natoms;
@@ -897,11 +882,15 @@ void Print_Sparse_Matrix( reax_system *system, sparse_matrix *A )
     int i, j;
 
     for ( i = 0; i < A->n; ++i )
+    {
         for ( j = A->start[i]; j < A->end[i]; ++j )
+        {
             fprintf( stderr, "%d %d %.15e\n",
                      system->my_atoms[i].orig_id,
                      system->my_atoms[A->entries[j].j].orig_id,
                      A->entries[j].val );
+        }
+    }
 }
 
 
@@ -911,11 +900,15 @@ void Print_Sparse_Matrix2( reax_system *system, sparse_matrix *A, char *fname )
     FILE *f = fopen( fname, "w" );
 
     for ( i = 0; i < A->n; ++i )
+    {
         for ( j = A->start[i]; j < A->end[i]; ++j )
+        {
             fprintf( f, "%d %d %.15e\n",
                      system->my_atoms[i].orig_id,
                      system->my_atoms[A->entries[j].j].orig_id,
                      A->entries[j].val );
+        }
+    }
 
     fclose(f);
 }
@@ -946,15 +939,15 @@ void Print_Symmetric_Sparse(reax_system *system, sparse_matrix *A, char *fname)
 
 
 void Print_Linear_System( reax_system *system, control_params *control,
-                          storage *workspace, int step )
+        storage *workspace, int step )
 {
-    int   i, j;
-    char  fname[100];
+    int i, j;
+    char fname[100];
     reax_atom *ai, *aj;
     sparse_matrix *H;
     FILE *out;
 
-    // print rhs and init guesses for QEq
+    /* print rhs and init guesses for QEq */
     sprintf( fname, "%s.p%dstate%d", control->sim_name, system->my_rank, step );
     out = fopen( fname, "w" );
     for ( i = 0; i < system->n; i++ )
@@ -967,38 +960,42 @@ void Print_Linear_System( reax_system *system, control_params *control,
     }
     fclose( out );
 
-    // print QEq coef matrix
+    /* print QEq coef matrix */
     sprintf( fname, "%s.p%dH%d", control->sim_name, system->my_rank, step );
     Print_Symmetric_Sparse( system, &workspace->H, fname ); //MATRIX CHANGES
 
-    // print the incomplete H matrix
-    /*sprintf( fname, "%s.p%dHinc%d", control->sim_name, system->my_rank, step );
-    out = fopen( fname, "w" );
-    H = workspace->H;
-    for( i = 0; i < H->n; ++i ) {
-      ai = &(system->my_atoms[i]);
-      for( j = H->start[i]; j < H->end[i]; ++j )
-        if( H->entries[j].j < system->n ) {
-    aj = &(system->my_atoms[H->entries[j].j]);
-    fprintf( out, "%d %d %.15e\n",
-       ai->orig_id, aj->orig_id, H->entries[j].val );
-    if( ai->orig_id != aj->orig_id )
-      fprintf( out, "%d %d %.15e\n",
-         aj->orig_id, ai->orig_id, H->entries[j].val );
-        }
-    }
-    fclose( out );*/
+    /* print the incomplete H matrix */
+//    sprintf( fname, "%s.p%dHinc%d", control->sim_name, system->my_rank, step );
+//    out = fopen( fname, "w" );
+//    H = workspace->H;
+//    for( i = 0; i < H->n; ++i )
+//    {
+//        ai = &(system->my_atoms[i]);
+//        for( j = H->start[i]; j < H->end[i]; ++j )
+//        {
+//            if( H->entries[j].j < system->n )
+//            {
+//                aj = &(system->my_atoms[H->entries[j].j]);
+//                fprintf( out, "%d %d %.15e\n",
+//                ai->orig_id, aj->orig_id, H->entries[j].val );
+//                if( ai->orig_id != aj->orig_id )
+//                    fprintf( out, "%d %d %.15e\n",
+//                aj->orig_id, ai->orig_id, H->entries[j].val );
+//            }
+//        }
+//    }
+//    fclose( out );
 
     // print the L from incomplete cholesky decomposition
-    /*sprintf( fname, "%s.p%dL%d", control->sim_name, system->my_rank, step );
-      Print_Sparse_Matrix2( system, workspace->L, fname );*/
+//    sprintf( fname, "%s.p%dL%d", control->sim_name, system->my_rank, step );
+//    Print_Sparse_Matrix2( system, workspace->L, fname );
 }
 
 
 void Print_LinSys_Soln( reax_system *system, real *x, real *b_prm, real *b )
 {
-    int    i;
-    char   fname[100];
+    int i;
+    char fname[100];
     FILE  *fout;
 
     sprintf( fname, "qeq.%d.out", system->my_rank );
@@ -1032,28 +1029,85 @@ void Print_Charges( reax_system *system )
 }
 
 
-void Print_Bonds( reax_system *system, reax_list *bonds, char *fname )
+void Print_HBonds( reax_system *system, reax_list **lists,
+        control_params *control, int step )
 {
-    int i, j, pj;
+    int i, pj; 
+    char fname[MAX_STR]; 
+    hbond_data *phbond;
+    FILE *fout;
+    reax_list *hbonds = (*lists + HBONDS);
+
+    sprintf( fname, "%s.hbonds.%d.%d", control->sim_name, step, system->my_rank );
+    fout = fopen( fname, "w" );
+
+    for ( i = 0; i < system->N; ++i )
+    {
+        for ( pj = Start_Index(i, hbonds); pj < End_Index(i, hbonds); ++pj )
+        {
+            phbond = &(hbonds->select.hbond_list[pj]);
+
+//            fprintf( fout, "%8d%8d %24.15e %24.15e %24.15e\n", i, phbond->nbr,
+//                    phbond->ptr->dvec[0], phbond->ptr->dvec[1], phbond->ptr->dvec[2] );
+            fprintf( fout, "%8d%8d %8d %8d\n", i, phbond->nbr,
+                  phbond->scl, phbond->sym_index );
+        }
+    }
+
+    fclose( fout );
+}
+
+ 
+void Print_HBond_Indices( reax_system *system, reax_list **lists,
+        control_params *control, int step )
+{
+    int i; 
+    char fname[MAX_STR]; 
+    FILE *fout;
+    reax_list *hbonds = (*lists + HBONDS);
+
+    sprintf( fname, "%s.hbonds_indices.%d.%d", control->sim_name, step, system->my_rank );
+    fout = fopen( fname, "w" );
+
+    for ( i = 0; i < system->N; ++i )
+    {
+        fprintf( fout, "%8d: start: %8d, end: %8d\n",
+                i, Start_Index(i, hbonds), End_Index(i, hbonds) );
+    }
+
+    fclose( fout );
+}
+
+
+void Print_Bonds( reax_system *system, reax_list **lists,
+        control_params *control )
+{
+    int i, pj; 
+    char fname[MAX_STR]; 
     bond_data *pbond;
     bond_order_data *bo_ij;
-    FILE *f = fopen( fname, "w" );
+    FILE *fout;
+    reax_list *bonds = (*lists + BONDS);
+
+    sprintf( fname, "%s.bonds.%d", control->sim_name, system->my_rank );
+    fout = fopen( fname, "w" );
 
     for ( i = 0; i < system->N; ++i )
+    {
         for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
         {
             pbond = &(bonds->select.bond_list[pj]);
             bo_ij = &(pbond->bo_data);
-            j = pbond->nbr;
-            //fprintf( f, "%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-            //       system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
-            //       pbond->d, bo_ij->BO, bo_ij->BO_s, bo_ij->BO_pi, bo_ij->BO_pi2 );
-            fprintf( f, "%8d%8d %24.15f %24.15f\n",
-                     i, j,//system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
-                     pbond->d, bo_ij->BO );
+//            fprintf( fout, "%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+//                    system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
+//                    pbond->d, bo_ij->BO, bo_ij->BO_s, bo_ij->BO_pi, bo_ij->BO_pi2 );
+            fprintf( fout, "%8d%8d %24.15f %24.15f\n",
+                    i, pbond->nbr, //system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
+                    pbond->d, bo_ij->BO );
         }
+    }
 
-    fclose(f);
+    fclose( fout );
 }
 
 
@@ -1062,6 +1116,7 @@ int fn_qsort_intcmp( const void *a, const void *b )
     return ( *(int *)a - * (int *)b );
 }
 
+
 void Print_Bond_List2( reax_system *system, reax_list *bonds, char *fname )
 {
     int i, j, id_i, id_j, nbr, pj;
@@ -1091,9 +1146,9 @@ void Print_Bond_List2( reax_system *system, reax_list *bonds, char *fname )
 
 
 void Print_Total_Force( reax_system *system, simulation_data *data,
-                        storage *workspace )
+        storage *workspace )
 {
-    int    i;
+    int i;
 
     fprintf( stderr, "step: %d\n", data->step );
     fprintf( stderr, "%6s\t%-38s\n", "atom", "atom.f[0,1,2]");
@@ -1105,18 +1160,19 @@ void Print_Total_Force( reax_system *system, simulation_data *data,
                  workspace->f[i][0], workspace->f[i][1], workspace->f[i][2] );
 }
 
+
 void Output_Results( reax_system *system, control_params *control,
-                     simulation_data *data, reax_list **lists,
-                     output_controls *out_control, mpi_datatypes *mpi_data )
+        simulation_data *data, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
 #if defined(LOG_PERFORMANCE)
     real t_elapsed, denom;
 #endif
 
-    if ((out_control->energy_update_freq > 0 &&
+    if ( (out_control->energy_update_freq > 0 &&
             data->step % out_control->energy_update_freq == 0) ||
             (out_control->write_steps > 0 &&
-             data->step % out_control->write_steps == 0))
+             data->step % out_control->write_steps == 0) )
     {
         /* update system-wide energies */
         Compute_System_Energy( system, data, MPI_COMM_WORLD );
@@ -1163,19 +1219,21 @@ void Output_Results( reax_system *system, control_params *control,
 #if defined(LOG_PERFORMANCE)
             t_elapsed = Get_Timing_Info( data->timing.total );
             if ( data->step - data->prev_steps > 0 )
+            {
                 denom = 1.0 / out_control->energy_update_freq;
-            else denom = 1;
+            }
+            else
+            {
+                denom = 1;
+            }
 
-            fprintf( out_control->log, "%6d%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%6d\n",
-                     data->step,
-                     t_elapsed * denom,
-                     data->timing.comm * denom,
-                     data->timing.nbrs * denom,
-                     data->timing.init_forces * denom,
-                     data->timing.bonded * denom,
-                     data->timing.nonb * denom,
-                     data->timing.qEq * denom,
-                     (int)((data->timing.s_matvecs + data->timing.t_matvecs)*denom) );
+            fprintf( out_control->log, "%6d%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8d%8d\n",
+                    data->step, t_elapsed * denom, data->timing.comm * denom,
+                    data->timing.nbrs * denom, data->timing.init_forces * denom,
+                    data->timing.bonded * denom, data->timing.nonb * denom,
+                    data->timing.cm * denom,
+                    (int)((data->timing.s_matvecs + data->timing.t_matvecs) * denom),
+                    data->timing.num_retries );
 
             Reset_Timing( &(data->timing) );
             fflush( out_control->log );
@@ -1249,6 +1307,7 @@ void Debug_Marker_Bonded( output_controls *out_control, int step )
              "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" );
 }
 
+
 void Debug_Marker_Nonbonded( output_controls *out_control, int step )
 {
     fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n",
@@ -1268,7 +1327,6 @@ void Dummy_Printer( reax_system *system, control_params *control,
 }
 
 
-
 void Print_Bond_Orders( reax_system *system, control_params *control,
                         simulation_data *data, storage *workspace,
                         reax_list **lists, output_controls *out_control )
@@ -1429,10 +1487,9 @@ void Print_Force_Files( reax_system *system, control_params *control,
 
 
 #if defined(TEST_FORCES) || defined(TEST_ENERGY)
-
 void Print_Far_Neighbors_List( reax_system *system, reax_list **lists,
-                               control_params *control, simulation_data *data,
-                               output_controls *out_control )
+        control_params *control, simulation_data *data,
+        output_controls *out_control )
 {
     int   i, j, id_i, id_j, nbr, natoms;
     int num = 0;
@@ -1466,8 +1523,8 @@ void Print_Far_Neighbors_List( reax_system *system, reax_list **lists,
 }
 
 void Print_Bond_List( reax_system *system, control_params *control,
-                      simulation_data *data, reax_list **lists,
-                      output_controls *out_control)
+        simulation_data *data, reax_list **lists,
+        output_controls *out_control)
 {
     int i, j, id_i, id_j, nbr, pj;
     reax_list *bonds = (*lists) + BONDS;
diff --git a/PG-PuReMD/src/io_tools.h b/PG-PuReMD/src/io_tools.h
index a0998ec537c9dbf171fc4dc4a52a99d77dc27ede..f83c9686400a03a70e35a06f777b1a9965e0f02e 100644
--- a/PG-PuReMD/src/io_tools.h
+++ b/PG-PuReMD/src/io_tools.h
@@ -24,43 +24,72 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int Init_Output_Files( reax_system*, control_params*,
-                       output_controls*, mpi_datatypes*, char* );
+        output_controls*, mpi_datatypes*, char* );
+
 int Close_Output_Files( reax_system*, control_params*,
-                        output_controls*, mpi_datatypes* );
-
-void  Print_Box( simulation_box*, char*, FILE* );
-
-void  Print_Grid( grid*, FILE* );
-void  Print_GCell_Exchange_Bounds( int, neighbor_proc* );
-void  Print_Native_GCells( reax_system* );
-void  Print_All_GCells( reax_system*);
-
-void  Print_Init_Atoms( reax_system*, storage* );
-void  Print_My_Atoms( reax_system* );
-void  Print_My_Ext_Atoms( reax_system* );
-
-void  Print_Far_Neighbors( reax_system*, reax_list**, control_params *);
-void  Print_Sparse_Matrix( reax_system*, sparse_matrix* );
-void  Print_Sparse_Matrix2( reax_system*, sparse_matrix*, char* );
-void  Print_Linear_System( reax_system*, control_params*, storage*, int );
-void  Print_LinSys_Soln( reax_system*, real*, real*, real* );
-void  Print_Charges( reax_system* );
-void  Print_Bonds( reax_system*, reax_list*, char* );
-void  Print_Bond_List2( reax_system*, reax_list*, char* );
-void  Print_Total_Force( reax_system*, simulation_data*, storage* );
-void  Output_Results( reax_system*, control_params*, simulation_data*,
-                      reax_list**, output_controls*, mpi_datatypes* );
+        output_controls*, mpi_datatypes* );
+
+void Print_Box( simulation_box*, char*, FILE* );
+
+void Print_Grid( grid*, FILE* );
+
+void Print_GCell_Exchange_Bounds( int, neighbor_proc* );
+
+void Print_Native_GCells( reax_system* );
+
+void Print_All_GCells( reax_system*);
+
+void Print_Init_Atoms( reax_system*, storage* );
+
+void Print_My_Atoms( reax_system* );
+
+void Print_My_Ext_Atoms( reax_system* );
+
+void Print_Far_Neighbors( reax_system*, reax_list**, control_params *);
+
+void Print_Sparse_Matrix( reax_system*, sparse_matrix* );
+
+void Print_Sparse_Matrix2( reax_system*, sparse_matrix*, char* );
+
+void Print_Linear_System( reax_system*, control_params*, storage*, int );
+
+void Print_LinSys_Soln( reax_system*, real*, real*, real* );
+
+void Print_Charges( reax_system* );
+
+void Print_HBonds( reax_system*, reax_list**, control_params *, int );
+
+void Print_HBond_Indices( reax_system*, reax_list**, control_params *, int );
+
+void Print_Bonds( reax_system*, reax_list**, control_params *);
+
+void Print_Bond_List2( reax_system*, reax_list*, char* );
+
+void Print_Total_Force( reax_system*, simulation_data*, storage* );
+
+void Output_Results( reax_system*, control_params*, simulation_data*,
+        reax_list**, output_controls*, mpi_datatypes* );
 
 #if defined(DEBUG_FOCUS) || defined(TEST_FORCES) || defined(TEST_ENERGY)
 void Debug_Marker_Bonded( output_controls*, int );
+
 void Debug_Marker_Nonbonded( output_controls*, int );
-void  Print_Near_Neighbors_List( reax_system*, reax_list**, control_params*,
-                                 simulation_data*, output_controls* );
-void  Print_Far_Neighbors_List( reax_system*, reax_list**, control_params*,
-                                simulation_data*, output_controls* );
-void  Print_Bond_List( reax_system*, control_params*, simulation_data*,
-                       reax_list**, output_controls* );
+
+void Print_Near_Neighbors_List( reax_system*, reax_list**, control_params*,
+        simulation_data*, output_controls* );
+
+void Print_Far_Neighbors_List( reax_system*, reax_list**, control_params*,
+        simulation_data*, output_controls* );
+
+void Print_Bond_List( reax_system*, control_params*, simulation_data*,
+        reax_list**, output_controls* );
+
 /*void Dummy_Printer( reax_system*, control_params*, simulation_data*,
             storage*, reax_list**, output_controls* );
 void Print_Bond_Orders( reax_system*, control_params*, simulation_data*,
@@ -86,22 +115,29 @@ void Print_Total_Force( reax_system*, control_params*, simulation_data*,
             storage*, reax_list**, output_controls* );
 void Compare_Total_Forces( reax_system*, control_params*, simulation_data*,
 storage*, reax_list**, output_controls* );*/
+
 //void  Print_Total_Force( reax_system*, control_params* );
+
 void Print_Force_Files( reax_system*, control_params*, simulation_data*,
-                        storage*, reax_list**, output_controls*,
-                        mpi_datatypes * );
+        storage*, reax_list**, output_controls*, mpi_datatypes * );
+
 //void Init_Force_Test_Functions( );
 
 int fn_qsort_intcmp( const void *, const void * );
 
 void Print_Far_Neighbors_List( reax_system*, reax_list**, control_params*,
-                               simulation_data*, output_controls* );
+        simulation_data*, output_controls* );
 
 void Print_Near_Neighbors_List( reax_system*, reax_list**, control_params*,
-                                simulation_data*, output_controls* );
+        simulation_data*, output_controls* );
 
 void Print_Bond_List( reax_system*, control_params*, simulation_data*,
-                      reax_list**, output_controls*);
+        reax_list**, output_controls*);
+#endif
 
+#ifdef __cplusplus
+}
 #endif
+
+
 #endif
diff --git a/PG-PuReMD/src/linear_solvers.c b/PG-PuReMD/src/lin_alg.c
similarity index 63%
rename from PG-PuReMD/src/linear_solvers.c
rename to PG-PuReMD/src/lin_alg.c
index 3a8a5d3fb6c209874a397b36fb7a669721e05949..15d8ad96690bf3fd62f4d33e8b7886ebe9ee889b 100644
--- a/PG-PuReMD/src/linear_solvers.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -19,14 +19,17 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "linear_solvers.h"
+#include "reax_types.h"
+
+#include "lin_alg.h"
+
 #include "basic_comm.h"
 #include "io_tools.h"
 #include "tool_box.h"
 #include "vector.h"
 
 #ifdef HAVE_CUDA
-#include "validation.h"
+  #include "cuda/cuda_validation.h"
 #endif
 
 #if defined(CG_PERFORMANCE)
@@ -41,7 +44,8 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
 
     for ( i = 0; i < N; ++i )
     {
-        b[i][0] = b[i][1] = 0;
+        b[i][0] = 0;
+        b[i][1] = 0;
     }
 
     /* perform multiplication */
@@ -73,14 +77,12 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
         *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
         simulation_data *data )
 {
-    int  i, j, n, N, matvecs, scale;
+    int i, j, n, N, matvecs, scale;
     rvec2 tmp, alpha, beta;
     rvec2 my_sum, norm_sqr, b_norm, my_dot;
     rvec2 sig_old, sig_new;
     MPI_Comm comm;
 
-    int a;
-
     n = system->n;
     N = system->N;
     comm = mpi_data->world;
@@ -97,13 +99,13 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
 #endif
 
 #ifdef HAVE_CUDA
-    check_zeros_host (x, system->N, "x");
+    check_zeros_host( x, N, "x" );
 #endif
 
     Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );
 
 #ifdef HAVE_CUDA
-    check_zeros_host (x, system->N, "x");
+    check_zeros_host( x, N, "x" );
 #endif
 
     dual_Sparse_MatVec( H, x, workspace->q2, N );
@@ -118,7 +120,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
         Update_Timing_Info( &t_start, &matvec_time );
 #endif
 
-    for ( j = 0; j < system->n; ++j )
+    for ( j = 0; j < n; ++j )
     {
         /* residual */
         workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
@@ -165,7 +167,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
     for ( i = 1; i < 300; ++i )
     {
         Dist(system, mpi_data, workspace->d2, mpi_data->mpi_rvec2, scale, rvec2_packer);
-        //print_host_rvec2 (workspace->d2, N);
+        //print_host_rvec2( workspace->d2, N );
 
         dual_Sparse_MatVec( H, workspace->d2, workspace->q2, N );
 
@@ -190,7 +192,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
         alpha[0] = sig_new[0] / tmp[0];
         alpha[1] = sig_new[1] / tmp[1];
         my_dot[0] = my_dot[1] = 0;
-        for ( j = 0; j < system->n; ++j )
+        for ( j = 0; j < n; ++j )
         {
             /* update x */
             x[j][0] += alpha[0] * workspace->d2[j][0];
@@ -222,7 +224,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
 
         beta[0] = sig_new[0] / sig_old[0];
         beta[1] = sig_new[1] / sig_old[1];
-        for ( j = 0; j < system->n; ++j )
+        for ( j = 0; j < n; ++j )
         {
             /* d = p + beta * d */
             workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
@@ -237,8 +239,12 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
             workspace->t[j] = workspace->x[j][1];
         }
         matvecs = CG( system, workspace, H, workspace->b_t, tol, workspace->t,
-                      mpi_data, fout );
+                mpi_data );
+
+#if defined(DEBUG)
         fprintf (stderr, " CG1: iterations --> %d \n", matvecs );
+#endif
+
         for ( j = 0; j < n; ++j )
         {
             workspace->x[j][1] = workspace->t[j];
@@ -251,9 +257,13 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
             workspace->s[j] = workspace->x[j][0];
         }
         matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s,
-                      mpi_data, fout );
+                mpi_data );
+
+#if defined(DEBUG)
         fprintf (stderr, " CG2: iterations --> %d \n", matvecs );
-        for ( j = 0; j < system->n; ++j )
+#endif
+
+        for ( j = 0; j < n; ++j )
         {
             workspace->x[j][0] = workspace->s[j];
         }
@@ -274,350 +284,6 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
-        rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
-        simulation_data *data )
-{
-    int  i, j, n, N, matvecs, scale;
-    rvec2 tmp, alpha, beta;
-    rvec2 my_sum, norm_sqr, b_norm, my_dot;
-    rvec2 sig_old, sig_new;
-    MPI_Comm comm;
-    rvec2 *spad = (rvec2 *) host_scratch;
-    int a;
-
-    n = system->n;
-    N = system->N;
-    comm = mpi_data->world;
-    matvecs = 0;
-    scale = sizeof(rvec2) / sizeof(void);
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        matvecs = 0;
-        t_start = matvec_time = dot_time = 0;
-        t_start = Get_Time( );
-    }
-#endif
-
-    //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//  Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer );
-//#endif
-
-//  check_zeros_device( x, system->N, "x" );
-
-    get_from_device( spad, x, sizeof (rvec2) * system->total_cap, "CG:x:get" );
-    Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
-    put_on_device( spad, x, sizeof (rvec2) * system->total_cap, "CG:x:put" );
-
-//  check_zeros_device( x, system->N, "x" );
-
-//  compare_rvec2 (workspace->x, x, N, "x");
-//  if (data->step > 0) {
-//      compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
-//      compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
-//
-//      exit (0);
-//  }
-
-
-//#ifdef __CUDA_DEBUG__
-//  dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N );
-//#endif
-    //originally we were using only H->n which was system->n (init_md.c)
-    //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap);
-    
-    Cuda_Dual_Matvec ( H, x, dev_workspace->q2, system->N, system->total_cap);
-
-//  compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
-//  if (data->step > 0) exit (0);
-
-    // tryQEq
-    //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//  Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
-//#endif
-    
-    get_from_device (spad, dev_workspace->q2, sizeof (rvec2) *
-            system->total_cap, "CG:q2:get" );
-    Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
-    put_on_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap,
-            "CG:q2:put" );
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &matvec_time );
-    }
-#endif
-
-//#ifdef __CUDA_DEBUG__
-//  for( j = 0; j < system->n; ++j ) {
-//    // residual
-//    workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0];
-//    workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1];
-//    // apply diagonal pre-conditioner
-//    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-//    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-//  }
-//#endif
-    
-    Cuda_CG_Diagonal_Preconditioner( dev_workspace, b, system->n );
-
-//  compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2");
-//  compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2");
-
-    /* norm of b */
-//#ifdef __CUDA_DEBUG__
-//  my_sum[0] = my_sum[1] = 0;
-//  for( j = 0; j < n; ++j ) {
-//    my_sum[0] += SQR( workspace->b[j][0] );
-//    my_sum[1] += SQR( workspace->b[j][1] );
-//  }
-//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
-//#endif
-
-    my_sum[0] = my_sum[1] = 0;
-    Cuda_Norm (b, n, my_sum);
-
-//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
-
-    MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
-    b_norm[0] = SQRT( norm_sqr[0] );
-    b_norm[1] = SQRT( norm_sqr[1] );
-    //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
-
-    /* dot product: r.d */
-//#ifdef __CUDA_DEBUG__
-//  my_dot[0] = my_dot[1] = 0;
-//  for( j = 0; j < n; ++j ) {
-//    my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
-//    my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
-//  }
-//  fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
-    my_dot[0] = my_dot[1] = 0;
-    Cuda_Dot (dev_workspace->r2, dev_workspace->d2, my_dot, n);
-
-// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
-    
-    MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
-
-    //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &dot_time );
-    }
-#endif
-
-    for ( i = 1; i < 300; ++i )
-    {
-        //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//    Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer);
-//#endif
-        
-        get_from_device( spad, dev_workspace->d2, sizeof (rvec2) *
-                system->total_cap, "cg:d2:get" );
-        Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
-        put_on_device( spad, dev_workspace->d2, sizeof (rvec2) *
-                system->total_cap, "cg:d2:put" );
-
-        //print_device_rvec2 (dev_workspace->d2, N);
-
-//#ifdef __CUDA_DEBUG__
-//    dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N );
-//#endif
-        
-        Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N,
-                system->total_cap );
-
-        /*
-        fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n );
-        fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n");
-        print_sparse_matrix_host (&workspace->H);
-        fprintf (stderr, " ******* HOST Vector ***************\n");
-        print_host_rvec2 (workspace->d2, system->N);
-        fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n");
-        print_sparse_matrix (&dev_workspace->H);
-        fprintf (stderr, " ******* Device Vector ***************\n");
-        print_device_rvec2 (dev_workspace->d2, system->N);
-        */
-        //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
-        // tryQEq
-        // MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//    Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
-//#endif
-
-        get_from_device( spad, dev_workspace->q2, sizeof (rvec2) *
-                system->total_cap, "cg:q2:get" );
-        Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale,
-                rvec2_unpacker );
-        put_on_device( spad, dev_workspace->q2, sizeof (rvec2) *
-                system->total_cap, "cg:q2:put" );
-
-//       compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &matvec_time );
-        }
-#endif
-
-        /* dot product: d.q */
-//#ifdef __CUDA_DEBUG__
-//    my_dot[0] = my_dot[1] = 0;
-//    for( j = 0; j < n; ++j ) {
-//      my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
-//      my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
-//    }
-//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
-        my_dot[0] = my_dot[1] = 0;
-        Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n);
-        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-
-        MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
-        //fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
-
-        alpha[0] = sig_new[0] / tmp[0];
-        alpha[1] = sig_new[1] / tmp[1];
-        my_dot[0] = my_dot[1] = 0;
-
-//#ifdef __CUDA_DEBUG__
-//    for( j = 0; j < system->n; ++j ) {
-//      // update x
-//      workspace->x[j][0] += alpha[0] * workspace->d2[j][0];
-//      workspace->x[j][1] += alpha[1] * workspace->d2[j][1];
-//      // update residual
-//      workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
-//      workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
-//      // apply diagonal pre-conditioner
-//      workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-//      workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-//      // dot product: r.p
-//      my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
-//      my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
-//    }
-//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
-        my_dot[0] = my_dot[1] = 0;
-        Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot );
-
-        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-
-//   compare_rvec2 (workspace->x, dev_workspace->x, N, "x");
-//   compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2");
-//   compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2");
-
-        sig_old[0] = sig_new[0];
-        sig_old[1] = sig_new[1];
-        MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
-
-        //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &dot_time );
-        }
-#endif
-
-        if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
-        {
-            break;
-        }
-
-        beta[0] = sig_new[0] / sig_old[0];
-        beta[1] = sig_new[1] / sig_old[1];
-
-//#ifdef __CUDA_DEBUG__
-//    for( j = 0; j < system->n; ++j ) {
-//      // d = p + beta * d
-//      workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
-//      workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
-//    }
-//#endif
-
-        Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta,
-                dev_workspace->d2, system->n );
-
-//       compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2");
-    }
-
-
-    if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
-    {
-        //for( j = 0; j < n; ++j )
-        //  workspace->t[j] = workspace->x[j][1];
-        //fprintf (stderr, "Getting started with Cuda_CG1 \n");
-
-        Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n );
-
-        //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
-        //compare_array (workspace->t, dev_workspace->t, system->n, "t");
-
-        matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t,
-                mpi_data, fout );
-
-        //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs );
-        //for( j = 0; j < n; ++j )
-        //  workspace->x[j][1] = workspace->t[j];
-
-        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n );
-    }
-    else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
-    {
-        //for( j = 0; j < n; ++j )
-        //  workspace->s[j] = workspace->x[j][0];
-
-        Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n );
-
-        //compare_array (workspace->s, dev_workspace->s, system->n, "s");
-        //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
-
-        //fprintf (stderr, "Getting started with Cuda_CG2 \n");
-
-        matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s,
-                mpi_data, fout );
-
-        //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs );
-        //for( j = 0; j < system->n; ++j )
-        //  workspace->x[j][0] = workspace->s[j];
-
-        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n );
-    }
-
-    if ( i >= 300 )
-    {
-        fprintf( stderr, "Dual CG convergence failed! -> %d\n", i );
-    }
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        fprintf( fout, "QEq %d + %d iters. matvecs: %f  dot: %f\n",
-                i + 1, matvecs, matvec_time, dot_time );
-    }
-#endif
-
-    return (i + 1) + matvecs;
-}
-#endif
-
-
 void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
 {
     int  i, j, k, si;
@@ -646,7 +312,7 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
 
 
 int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
-        real tol, real *x, mpi_datatypes* mpi_data, FILE *fout)
+        real tol, real *x, mpi_datatypes* mpi_data )
 {
     int  i, j, scale;
     real tmp, alpha, beta, b_norm;
@@ -732,150 +398,6 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
-        *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
-{
-    int  i, j, scale;
-    real tmp, alpha, beta, b_norm;
-    real sig_old, sig_new, sig0;
-    real *spad = (real *) host_scratch;
-
-    scale = sizeof(real) / sizeof(void);
-
-    /* x is on the device */
-    //MVAPICH2
-    memset( spad, 0, sizeof (real) * system->total_cap );
-    get_from_device( spad, x, sizeof (real) * system->total_cap, "cuda_cg:x:get" );
-    Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
-
-    //MVAPICH2
-    put_on_device( spad, x, sizeof (real) * system->total_cap , "cuda_cg:x:put" );
-    Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap );
-
-    // tryQEq
-    // MVAPICH2
-    get_from_device( spad, dev_workspace->q, sizeof (real) * system->total_cap,
-            "cuda_cg:q:get" );
-    Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
-
-    //MVAPICH2
-    put_on_device( spad, dev_workspace->q, sizeof (real) * system->total_cap,
-            "cuda_cg:q:put" );
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &matvec_time );
-    }
-#endif
-
-    Cuda_Vector_Sum( dev_workspace->r , 1.,  b, -1., dev_workspace->q,
-            system->n );
-    //for( j = 0; j < system->n; ++j )
-    //  workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
-    Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r,
-            dev_workspace->Hdia_inv, system->n );
-
-    //TODO do the parallel_norm on the device for the local sum
-    get_from_device( spad, b, sizeof (real) * system->n, "cuda_cg:b:get" );
-    b_norm = Parallel_Norm( spad, system->n, mpi_data->world );
-
-    //TODO do the parallel dot on the device for the local sum
-    get_from_device( spad, dev_workspace->r, sizeof (real) * system->total_cap,
-            "cuda_cg:r:get" );
-    get_from_device( spad + system->total_cap, dev_workspace->d, sizeof (real)
-            * system->total_cap, "cuda_cg:d:get" );
-    sig_new = Parallel_Dot( spad, spad + system->total_cap, system->n,
-            mpi_data->world );
-
-    sig0 = sig_new;
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &dot_time );
-    }
-#endif
-
-    for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
-    {
-        //MVAPICH2
-        get_from_device( spad, dev_workspace->d, sizeof (real) *
-                system->total_cap, "cuda_cg:d:get" );
-        Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
-        put_on_device( spad, dev_workspace->d, sizeof (real) *
-                system->total_cap, "cuda_cg:d:put" );
-
-        Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap );
-
-        //tryQEq
-        get_from_device( spad, dev_workspace->q, sizeof (real) *
-                system->total_cap, "cuda_cg:q:get" );
-        Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
-        put_on_device( spad, dev_workspace->q, sizeof (real) *
-                system->total_cap , "cuda_cg:q:get" );
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &matvec_time );
-        }
-#endif
-
-        //TODO do the parallel dot on the device for the local sum
-        get_from_device( spad, dev_workspace->d, sizeof (real) * system->n,
-                "cuda_cg:d:get" );
-        get_from_device( spad + system->n, dev_workspace->q, sizeof (real) *
-                system->n, "cuda_cg:q:get" );
-        tmp = Parallel_Dot( spad, spad + system->n, system->n, mpi_data->world );
-
-        alpha = sig_new / tmp;
-        //Cuda_Vector_Add( x, alpha, dev_workspace->d, system->n );
-        Cuda_Vector_Sum( x, alpha, dev_workspace->d, 1.0, x, system->n );
-
-        //Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n );
-        Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0,
-                dev_workspace->r, system->n );
-        /* pre-conditioning */
-        //for( j = 0; j < system->n; ++j )
-        //  workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
-        Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r,
-                dev_workspace->Hdia_inv, system->n );
-
-        sig_old = sig_new;
-
-        //TODO do the parallel dot on the device for the local sum
-        get_from_device( spad, dev_workspace->r, sizeof (real) * system->n,
-                "cuda_cg:r:get" );
-        get_from_device( spad + system->n, dev_workspace->p, sizeof (real) *
-                system->n, "cuda_cg:p:get" );
-        sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world );
-        //fprintf (stderr, "Device: sig_new: %f \n", sig_new );
-
-        beta = sig_new / sig_old;
-        Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta,
-                dev_workspace->d, system->n );
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &dot_time );
-        }
-#endif
-    }
-
-    if ( i >= 300 )
-    {
-        fprintf( stderr, "CG convergence failed!\n" );
-        return i;
-    }
-
-    return i;
-}
-#endif
-
-
 int CG_test( reax_system *system, storage *workspace, sparse_matrix *H, real
         *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
@@ -909,7 +431,7 @@ int CG_test( reax_system *system, storage *workspace, sparse_matrix *H, real
 #if defined(DEBUG)
     //if( system->my_rank == MASTER_NODE ) {
     fprintf( stderr, "p%d CG:sig_new=%24.15e,d_norm=%24.15e,q_norm=%24.15e\n",
-             system->my_rank, sqrt(sig_new),
+             system->my_rank, SQRT(sig_new),
              Parallel_Norm(workspace->d, system->n, mpi_data->world),
              Parallel_Norm(workspace->q, system->n, mpi_data->world) );
     //Vector_Print( stderr, "d", workspace->d, system->N );
@@ -972,7 +494,7 @@ int CG_test( reax_system *system, storage *workspace, sparse_matrix *H, real
 #if defined(DEBUG)
         if ( system->my_rank == MASTER_NODE )
             fprintf(stderr, "p%d CG iter%d: sig_new = %24.15e\n",
-                    system->my_rank, i, sqrt(sig_new) );
+                    system->my_rank, i, SQRT(sig_new) );
         MPI_Barrier( mpi_data->world );
 #endif
 #if defined(CG_PERFORMANCE)
@@ -1239,7 +761,7 @@ int sCG( reax_system *system, storage *workspace, sparse_matrix *H,
 
 
 int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
-           real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
@@ -1268,7 +790,7 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
         // fprintf( stderr, "%10.6f\n", workspace->g[0] );
 
         /* GMRES inner-loop */
-        for ( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ )
+        for ( j = 0; j < RESTART && FABS(workspace->g[j]) / bnorm > tol; j++ )
         {
             /* matvec */
             Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1], N );
@@ -1315,7 +837,7 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
             workspace->g[j] = tmp1;
             workspace->g[j + 1] = tmp2;
 
-            // fprintf( stderr, "%10.6f\n", fabs(workspace->g[j+1]) );
+            // fprintf( stderr, "%10.6f\n", FABS(workspace->g[j+1]) );
         }
 
         /* solve Hy = g.
@@ -1333,7 +855,7 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
             Vector_Add( x, workspace->y[i], workspace->v[i], N );
 
         /* stopping condition */
-        if ( fabs(workspace->g[j]) / bnorm <= tol )
+        if ( FABS(workspace->g[j]) / bnorm <= tol )
             break;
     }
 
@@ -1347,7 +869,7 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
       workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
 
     fprintf( fout, "GMRES outer: %d, inner: %d - |rel residual| = %15.10f\n",
-             itr, j, fabs( workspace->g[j] ) / bnorm );
+             itr, j, FABS( workspace->g[j] ) / bnorm );
 
     if ( itr >= MAX_ITR )
     {
@@ -1360,8 +882,8 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
 
 
 int GMRES_HouseHolder( reax_system *system, storage *workspace,
-                       sparse_matrix *H, real *b, real tol, real *x,
-                       mpi_datatypes* mpi_data, FILE *fout )
+        sparse_matrix *H, real *b, real tol, real *x,
+        mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
@@ -1397,7 +919,7 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
         // fprintf( stderr, "\n\n%12.6f\n", w[0] );
 
         /* GMRES inner-loop */
-        for ( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ )
+        for ( j = 0; j < RESTART && FABS( w[j] ) / bnorm > tol; j++ )
         {
             /* compute v_j */
             Vector_Scale( z[j], -2 * u[j][j], u[j], N );
@@ -1446,7 +968,7 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
             }
 
             /* apply the new Givens rotation to H and right-hand side */
-            if ( fabs(v[j + 1]) >= ALMOST_ZERO )
+            if ( FABS(v[j + 1]) >= ALMOST_ZERO )
             {
                 cc = SQRT( SQR( v[j] ) + SQR( v[j + 1] ) );
                 workspace->hc[j] = v[j] / cc;
@@ -1493,7 +1015,7 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
             Vector_Add( x, workspace->y[i], z[i], N );
 
         /* stopping condition */
-        if ( fabs( w[j] ) / bnorm <= tol )
+        if ( FABS( w[j] ) / bnorm <= tol )
             break;
     }
 
@@ -1507,7 +1029,7 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
     //          workspace->b_prc[i], workspace->b_prm[i], x[i] );
 
     fprintf( fout, "GMRES outer:%d  inner:%d iters, |rel residual| = %15.10f\n",
-             itr, j, fabs( workspace->g[j] ) / bnorm );
+             itr, j, FABS( workspace->g[j] ) / bnorm );
 
     if ( itr >= MAX_ITR )
     {
diff --git a/PG-PuReMD/src/linear_solvers.h b/PG-PuReMD/src/lin_alg.h
similarity index 66%
rename from PG-PuReMD/src/linear_solvers.h
rename to PG-PuReMD/src/lin_alg.h
index f16491546abcb71d23f186b8285dce717230c641..4b7ba2f0368e81d498f93d91e33db074ebddd9ca 100644
--- a/PG-PuReMD/src/linear_solvers.h
+++ b/PG-PuReMD/src/lin_alg.h
@@ -19,28 +19,37 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __LINEAR_SOLVERS_H_
-#define __LINEAR_SOLVERS_H_
+#ifndef __LIN_ALG_H_
+#define __LIN_ALG_H_
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int GMRES( reax_system*, storage*, sparse_matrix*,
-           real*, real, real*, mpi_datatypes*, FILE* );
+        real*, real, real*, mpi_datatypes*, FILE* );
+
 int GMRES_HouseHolder( reax_system*, storage*, sparse_matrix*,
-                       real*, real, real*, mpi_datatypes*, FILE* );
+        real*, real, real*, mpi_datatypes*, FILE* );
+
 int dual_CG( reax_system*, storage*, sparse_matrix*,
-             rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data *);
+        rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data * );
+
 int CG( reax_system*, storage*, sparse_matrix*,
-        real*, real, real*, mpi_datatypes*, FILE* );
+        real*, real, real*, mpi_datatypes* );
+
 int PCG( reax_system*, storage*, sparse_matrix*, real*, real,
-         sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* );
+        sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* );
+
 int sCG( reax_system*, storage*, sparse_matrix*,
-         real*, real, real*, mpi_datatypes*, FILE* );
+        real*, real, real*, mpi_datatypes*, FILE* );
+
+#ifdef __cplusplus
+}
+#endif
 
-//CUDA Functions
-int Cuda_dual_CG( reax_system*, storage*, sparse_matrix*,
-                  rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data *);
-int Cuda_CG( reax_system*, storage*, sparse_matrix*,
-             real*, real, real*, mpi_datatypes*, FILE* );
 
 #endif
diff --git a/PG-PuReMD/src/list.c b/PG-PuReMD/src/list.c
index 695e84231910e5c986d0b8f99df3247de2b9bdd6..69736afbbee10a1fdd67d1399166613abaa21112 100644
--- a/PG-PuReMD/src/list.c
+++ b/PG-PuReMD/src/list.c
@@ -22,15 +22,15 @@
 #include "reax_types.h"
 
 #if defined(PURE_REAX)
-#include "list.h"
-#include "tool_box.h"
+  #include "list.h"
+  #include "tool_box.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_list.h"
-#include "reax_tool_box.h"
+  #include "reax_list.h"
+  #include "reax_tool_box.h"
 #endif
 
 
-void Print_List(reax_list* list)
+void Print_List( reax_list* list )
 {
     //printf("List_Print\n");
     int i;
@@ -50,99 +50,77 @@ void Print_List(reax_list* list)
 }
 
 
-/************* allocate list space ******************/
-int Make_List(int n, int num_intrs, int type, reax_list *l)
+/* allocate space for interaction list
+ *
+ * n: num. of entries allocated for each of the index and end_index arrays
+ * num_intrs: num. of interaction entries allocated in the type-specific array
+ * type: list type (TYP_*), selects which member of l->select is allocated
+ * l: list to initialize; marked as allocated and its fields are overwritten
+ * */
+void Make_List( int n, int num_intrs, int type, reax_list *l )
 {
-    int ret = SUCCESS;
-
-    l->allocated = 1;
+    l->allocated = TRUE;
     l->n = n;
     l->num_intrs = num_intrs;
-
-    if( (l->index = (int*) smalloc( n * sizeof(int), "list:index" )) == NULL
-        ||  (l->end_index = (int*) smalloc( n * sizeof(int), "list:end_index" )) == NULL )
-    {
-        ret = FAILURE;
-    }
-
+    l->index = (int*) smalloc( n * sizeof(int), "list:index" );
+    l->end_index = (int*) smalloc( n * sizeof(int), "list:end_index" );
     l->type = type;
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "list: n=%d num_intrs=%d type=%d\n", n, num_intrs, type );
 #endif
 
-    switch (l->type)
+    switch ( l->type )
     {
     case TYP_VOID:
-        if( (l->select.v = (void*) smalloc( l->num_intrs * sizeof(void*), "list:v" )) == NULL )
-        {
-            ret = FAILURE;
-        }
+        l->select.v = (void*)
+                smalloc( l->num_intrs * sizeof(void*), "list:v" );
         break;
 
     case TYP_THREE_BODY:
-        if( (l->select.three_body_list = (three_body_interaction_data*)
-            smalloc( l->num_intrs * sizeof(three_body_interaction_data),
-            "list:three_bodies" )) == NULL )
-        {
-            ret = FAILURE;
-        }
+        l->select.three_body_list = (three_body_interaction_data*)
+                smalloc( l->num_intrs * sizeof(three_body_interaction_data), "list:three_bodies" );
         break;
 
     case TYP_BOND:
-        if( (l->select.bond_list = (bond_data*)
-            smalloc( l->num_intrs * sizeof(bond_data), "list:bonds" )) == NULL )
-        {
-            ret = FAILURE;
-        }
+        l->select.bond_list = (bond_data*)
+                smalloc( l->num_intrs * sizeof(bond_data), "list:bonds" );
         break;
 
     case TYP_DBO:
-        if( (l->select.dbo_list = (dbond_data*)
-            smalloc( l->num_intrs * sizeof(dbond_data), "list:dbonds" )) == NULL )
-        {
-            ret = FAILURE;
-        }
+        l->select.dbo_list = (dbond_data*)
+                smalloc( l->num_intrs * sizeof(dbond_data), "list:dbonds" );
         break;
 
     case TYP_DDELTA:
-        if( (l->select.dDelta_list = (dDelta_data*)
-            smalloc( l->num_intrs * sizeof(dDelta_data), "list:dDeltas" )) == NULL )
-        {
-            ret = FAILURE;
-        }
+        l->select.dDelta_list = (dDelta_data*)
+                smalloc( l->num_intrs * sizeof(dDelta_data), "list:dDeltas" );
         break;
 
     case TYP_FAR_NEIGHBOR:
-        if( (l->select.far_nbr_list = (far_neighbor_data*)
-            smalloc( l->num_intrs * sizeof(far_neighbor_data), "list:far_nbrs" )) == NULL )
-        {
-            ret = FAILURE;
-        }
+        l->select.far_nbr_list = (far_neighbor_data*)
+                smalloc( l->num_intrs * sizeof(far_neighbor_data), "list:far_nbrs" );
         break;
 
     case TYP_HBOND:
-        if( (l->select.hbond_list = (hbond_data*)
-            smalloc( l->num_intrs * sizeof(hbond_data), "list:hbonds" )) == NULL )
-        {
-            ret = FAILURE;
-        }
+        l->select.hbond_list = (hbond_data*)
+                smalloc( l->num_intrs * sizeof(hbond_data), "list:hbonds" );
         break;
 
     default:
-        fprintf( stderr, "ERROR: no %d list type defined!\n", l->type );
+        fprintf( stderr, "[ERROR] no %d list type defined!\n", l->type );
         MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
     }
-
-    return ret;
 }
 
 
-void Delete_List( reax_list *l)
+void Delete_List( reax_list *l )
 {
-    if ( l->allocated == 0 )
+    if ( l->allocated == FALSE )
+    {
         return;
-    l->allocated = 0;
+    }
+    l->allocated = FALSE;
 
     sfree( l->index, "list:index" );
     sfree( l->end_index, "list:end_index" );
@@ -152,55 +130,33 @@ void Delete_List( reax_list *l)
     case TYP_VOID:
         sfree( l->select.v, "list:v" );
         break;
+
     case TYP_HBOND:
         sfree( l->select.hbond_list, "list:hbonds" );
         break;
+
     case TYP_FAR_NEIGHBOR:
         sfree( l->select.far_nbr_list, "list:far_nbrs" );
         break;
+
     case TYP_BOND:
         sfree( l->select.bond_list, "list:bonds" );
         break;
+
     case TYP_DBO:
         sfree( l->select.dbo_list, "list:dbos" );
         break;
+
     case TYP_DDELTA:
         sfree( l->select.dDelta_list, "list:dDeltas" );
         break;
+
     case TYP_THREE_BODY:
         sfree( l->select.three_body_list, "list:three_bodies" );
         break;
 
     default:
-        fprintf( stderr, "ERROR: no %d list type defined!\n", l->type );
+        fprintf( stderr, "[ERROR] no %d list type defined!\n", l->type );
         MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
     }
 }
-
-
-#if defined(SUDHIR)
-inline int Num_Entries( int i, reax_list *l )
-{
-    return l->end_index[i] - l->index[i];
-}
-
-inline int Start_Index( int i, reax_list *l )
-{
-    return l->index[i];
-}
-
-inline int End_Index( int i, reax_list *l )
-{
-    return l->end_index[i];
-}
-
-inline void Set_Start_Index( int i, int val, reax_list *l )
-{
-    l->index[i] = val;
-}
-
-inline void Set_End_Index( int i, int val, reax_list *l )
-{
-    l->end_index[i] = val;
-}
-#endif
diff --git a/PG-PuReMD/src/list.h b/PG-PuReMD/src/list.h
index 630958d0736490eef90948ddd12a3c2e6d0de81a..a6a82d204a9c6409553cbd8a4e4ab3a7992959a0 100644
--- a/PG-PuReMD/src/list.h
+++ b/PG-PuReMD/src/list.h
@@ -24,23 +24,21 @@
 
 #include "reax_types.h"
 
-#ifdef _cplusplus
+
+#ifdef __cplusplus
 extern "C" {
 #endif
-    void Print_List(reax_list*);
-    int  Make_List( int, int, int, reax_list*);
-    void Delete_List( reax_list*);
 
-#ifdef _cplusplus
+void Print_List( reax_list* );
+
+void Make_List( int, int, int, reax_list* );
+
+void Delete_List( reax_list* );
+
+#ifdef __cplusplus
 }
 #endif
 
-static inline int  Num_Entries(int, reax_list*);
-static inline int  Start_Index( int, reax_list* );
-static inline int  End_Index( int, reax_list* );
-static inline void Set_Start_Index(int, int, reax_list*);
-static inline void Set_End_Index(int, int, reax_list*);
-
 #if defined(LAMMPS_REAX) || defined(PURE_REAX)
 static inline int Num_Entries( int i, reax_list *l )
 {
@@ -66,6 +64,7 @@ static inline void Set_End_Index( int i, int val, reax_list *l )
 {
     l->end_index[i] = val;
 }
-#endif // LAMMPS_REAX
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/lookup.c b/PG-PuReMD/src/lookup.c
index 26e6e2220bebbdc577167b30082577d67c0e0591..b071ea89cf94a895221862043feada0fbf928b13 100644
--- a/PG-PuReMD/src/lookup.c
+++ b/PG-PuReMD/src/lookup.c
@@ -21,25 +21,26 @@
 
 #include "reax_types.h"
 
+#if defined(PURE_REAX)
+  #include "lookup.h"
+  #include "nonbonded.h"
+  #include "tool_box.h"
+#elif defined(LAMMPS_REAX)
+  #include "reax_lookup.h"
+  #include "reax_nonbonded.h"
+  #include "reax_tool_box.h"
+#endif
+
 #include "index_utils.h"
 
 #ifdef HAVE_CUDA
-#include "cuda_lookup.h"
+  #include "cuda/cuda_lookup.h"
 #endif
 
-#if defined(PURE_REAX)
-#include "lookup.h"
-#include "nonbonded.h"
-#include "tool_box.h"
-#elif defined(LAMMPS_REAX)
-#include "reax_lookup.h"
-#include "reax_nonbonded.h"
-#include "reax_tool_box.h"
-#endif
 
 /* Fills solution into x. Warning: will modify c and d! */
 void Tridiagonal_Solve( const real *a, const real *b,
-                        real *c, real *d, real *x, unsigned int n)
+        real *c, real *d, real *x, unsigned int n )
 {
     int i;
     real id;
@@ -62,17 +63,17 @@ void Tridiagonal_Solve( const real *a, const real *b,
 
 
 void Natural_Cubic_Spline( const real *h, const real *f,
-                           cubic_spline_coef *coef, unsigned int n )
+        cubic_spline_coef *coef, unsigned int n )
 {
     int i;
     real *a, *b, *c, *d, *v;
 
     /* allocate space for the linear system */
     a = (real*) smalloc( n * sizeof(real), "cubic_spline:a" );
-    b = (real*) smalloc( n * sizeof(real), "cubic_spline:a" );
-    c = (real*) smalloc( n * sizeof(real), "cubic_spline:a" );
-    d = (real*) smalloc( n * sizeof(real), "cubic_spline:a" );
-    v = (real*) smalloc( n * sizeof(real), "cubic_spline:a" );
+    b = (real*) smalloc( n * sizeof(real), "cubic_spline:b" );
+    c = (real*) smalloc( n * sizeof(real), "cubic_spline:c" );
+    d = (real*) smalloc( n * sizeof(real), "cubic_spline:d" );
+    v = (real*) smalloc( n * sizeof(real), "cubic_spline:v" );
 
     /* build the linear system */
     a[0] = a[1] = a[n - 1] = 0;
@@ -111,37 +112,44 @@ void Natural_Cubic_Spline( const real *h, const real *f,
 }
 
 
-
 void Complete_Cubic_Spline( const real *h, const real *f, real v0, real vlast,
-                            cubic_spline_coef *coef, unsigned int n )
+        cubic_spline_coef *coef, unsigned int n )
 {
     int i;
     real *a, *b, *c, *d, *v;
 
     /* allocate space for the linear system */
     a = (real*) smalloc( n * sizeof(real), "cubic_spline:a" );
-    b = (real*) smalloc( n * sizeof(real), "cubic_spline:a" );
-    c = (real*) smalloc( n * sizeof(real), "cubic_spline:a" );
-    d = (real*) smalloc( n * sizeof(real), "cubic_spline:a" );
-    v = (real*) smalloc( n * sizeof(real), "cubic_spline:a" );
+    b = (real*) smalloc( n * sizeof(real), "cubic_spline:b" );
+    c = (real*) smalloc( n * sizeof(real), "cubic_spline:c" );
+    d = (real*) smalloc( n * sizeof(real), "cubic_spline:d" );
+    v = (real*) smalloc( n * sizeof(real), "cubic_spline:v" );
 
     /* build the linear system */
     a[0] = 0;
     for ( i = 1; i < n; ++i )
+    {
         a[i] = h[i - 1];
+    }
 
     b[0] = 2 * h[0];
     for ( i = 1; i < n; ++i )
+    {
         b[i] = 2 * (h[i - 1] + h[i]);
+    }
 
     c[n - 1] = 0;
     for ( i = 0; i < n - 1; ++i )
+    {
         c[i] = h[i];
+    }
 
     d[0] = 6 * (f[1] - f[0]) / h[0] - 6 * v0;
     d[n - 1] = 6 * vlast - 6 * (f[n - 1] - f[n - 2] / h[n - 2]);
     for ( i = 1; i < n - 1; ++i )
+    {
         d[i] = 6 * ((f[i + 1] - f[i]) / h[i] - (f[i] - f[i - 1]) / h[i - 1]);
+    }
 
     Tridiagonal_Solve( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n );
 
@@ -186,7 +194,7 @@ void LR_Lookup( LR_lookup_table *t, real r, LR_data *y )
 
 
 int Init_Lookup_Tables( reax_system *system, control_params *control,
-                        real *Tap, mpi_datatypes *mpi_data, char *msg )
+        real *Tap, mpi_datatypes *mpi_data, char *msg )
 {
     int i, j, r;
     int num_atom_types;
@@ -211,56 +219,57 @@ int Init_Lookup_Tables( reax_system *system, control_params *control,
     fCEvd = (real*) smalloc((control->tabulate + 1) * sizeof(real), "lookup:fCEvd");
     fele = (real*) smalloc( (control->tabulate + 1) * sizeof(real), "lookup:fele" );
     fCEclmb = (real*) smalloc( (control->tabulate + 1) * sizeof(real),
-                               "lookup:fCEclmb" );
+            "lookup:fCEclmb" );
 
     /* allocate Long-Range LookUp Table space based on
        number of atom types in the ffield file */
-    //SUDHIR
-    /*
-    LR = (LR_lookup_table**)
-    smalloc( num_atom_types * sizeof(LR_lookup_table*), "lookup:LR" );
-    for( i = 0; i < num_atom_types; ++i )
-    LR[i] = (LR_lookup_table*)
-    smalloc(num_atom_types * sizeof(LR_lookup_table), "lookup:LR[i]");
-    */
-    LR = (LR_lookup_table*) malloc(num_atom_types * num_atom_types * sizeof(LR_lookup_table));
+    LR = (LR_lookup_table*) smalloc(
+            num_atom_types * num_atom_types * sizeof(LR_lookup_table),
+            "Init_Lookup_Tables::LR" );
 
     /* most atom types in ffield file will not exist in the current
        simulation. to avoid unnecessary lookup table space, determine
        the atom types that exist in the current simulation */
     for ( i = 0; i < MAX_ATOM_TYPES; ++i )
+    {
         existing_types[i] = 0;
+    }
     for ( i = 0; i < system->n; ++i )
+    {
         existing_types[ system->my_atoms[i].type ] = 1;
+    }
 
     MPI_Allreduce( existing_types, aggregated, MAX_ATOM_TYPES,
-                   MPI_INT, MPI_SUM, mpi_data->world );
+            MPI_INT, MPI_SUM, mpi_data->world );
 
     /* fill in the lookup table entries for existing atom types.
        only lower half should be enough. */
     for ( i = 0; i < num_atom_types; ++i )
+    {
         if ( aggregated[i] )
+        {
             for ( j = i; j < num_atom_types; ++j )
+            {
                 if ( aggregated[j] )
                 {
 
-                    LR[ index_lr (i, j, num_atom_types) ].xmin = 0;
-                    LR[ index_lr (i, j, num_atom_types) ].xmax = control->nonb_cut;
-                    LR[ index_lr (i, j, num_atom_types) ].n = control->tabulate + 1;
-                    LR[ index_lr (i, j, num_atom_types) ].dx = dr;
-                    LR[ index_lr (i, j, num_atom_types) ].inv_dx = control->tabulate / control->nonb_cut;
-                    LR[ index_lr (i, j, num_atom_types) ].y = (LR_data*)
-                            smalloc(LR[ index_lr (i, j, num_atom_types) ].n * sizeof(LR_data), "lookup:LR[i,j].y");
-                    LR[ index_lr (i, j, num_atom_types) ].H = (cubic_spline_coef*)
-                            smalloc(LR[ index_lr (i, j, num_atom_types) ].n * sizeof(cubic_spline_coef), "lookup:LR[i,j].H");
-                    LR[ index_lr (i, j, num_atom_types) ].vdW = (cubic_spline_coef*)
-                            smalloc(LR[ index_lr (i, j, num_atom_types) ].n * sizeof(cubic_spline_coef), "lookup:LR[i,j].vdW");
-                    LR[ index_lr (i, j, num_atom_types) ].CEvd = (cubic_spline_coef*)
-                            smalloc(LR[ index_lr (i, j, num_atom_types) ].n * sizeof(cubic_spline_coef), "lookup:LR[i,j].CEvd");
-                    LR[ index_lr (i, j, num_atom_types) ].ele = (cubic_spline_coef*)
-                            smalloc(LR[ index_lr (i, j, num_atom_types) ].n * sizeof(cubic_spline_coef), "lookup:LR[i,j].ele");
-                    LR[ index_lr (i, j, num_atom_types) ].CEclmb = (cubic_spline_coef*)
-                            smalloc(LR[ index_lr (i, j, num_atom_types) ].n * sizeof(cubic_spline_coef), "lookup:LR[i,j].CEclmb");
+                    LR[ index_lr(i, j, num_atom_types) ].xmin = 0;
+                    LR[ index_lr(i, j, num_atom_types) ].xmax = control->nonb_cut;
+                    LR[ index_lr(i, j, num_atom_types) ].n = control->tabulate + 1;
+                    LR[ index_lr(i, j, num_atom_types) ].dx = dr;
+                    LR[ index_lr(i, j, num_atom_types) ].inv_dx = control->tabulate / control->nonb_cut;
+                    LR[ index_lr(i, j, num_atom_types) ].y = (LR_data*)
+                            smalloc(LR[ index_lr(i, j, num_atom_types) ].n * sizeof(LR_data), "lookup:LR[i,j].y");
+                    LR[ index_lr(i, j, num_atom_types) ].H = (cubic_spline_coef*)
+                            smalloc(LR[ index_lr(i, j, num_atom_types) ].n * sizeof(cubic_spline_coef), "lookup:LR[i,j].H");
+                    LR[ index_lr(i, j, num_atom_types) ].vdW = (cubic_spline_coef*)
+                            smalloc(LR[ index_lr(i, j, num_atom_types) ].n * sizeof(cubic_spline_coef), "lookup:LR[i,j].vdW");
+                    LR[ index_lr(i, j, num_atom_types) ].CEvd = (cubic_spline_coef*)
+                            smalloc(LR[ index_lr(i, j, num_atom_types) ].n * sizeof(cubic_spline_coef), "lookup:LR[i,j].CEvd");
+                    LR[ index_lr(i, j, num_atom_types) ].ele = (cubic_spline_coef*)
+                            smalloc(LR[ index_lr(i, j, num_atom_types) ].n * sizeof(cubic_spline_coef), "lookup:LR[i,j].ele");
+                    LR[ index_lr(i, j, num_atom_types) ].CEclmb = (cubic_spline_coef*)
+                            smalloc(LR[ index_lr(i, j, num_atom_types) ].n * sizeof(cubic_spline_coef), "lookup:LR[i,j].CEclmb");
 
                     for ( r = 1; r <= control->tabulate; ++r )
                     {
@@ -285,40 +294,44 @@ int Init_Lookup_Tables( reax_system *system, control_params *control,
                     }
 
                     Natural_Cubic_Spline( &h[1], &fh[1],
-                                          &(LR[ index_lr (i, j, num_atom_types) ].H[1]), control->tabulate + 1 );
+                            &(LR[ index_lr (i, j, num_atom_types) ].H[1]), control->tabulate + 1 );
 
                     Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw,
-                                           &(LR[ index_lr (i, j, num_atom_types) ].vdW[1]), control->tabulate + 1 );
+                            &(LR[ index_lr (i, j, num_atom_types) ].vdW[1]), control->tabulate + 1 );
                     Natural_Cubic_Spline( &h[1], &fCEvd[1],
-                                          &(LR[ index_lr (i, j, num_atom_types) ].CEvd[1]), control->tabulate + 1 );
+                            &(LR[ index_lr (i, j, num_atom_types) ].CEvd[1]), control->tabulate + 1 );
 
                     Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele,
-                                           &(LR[ index_lr (i, j, num_atom_types) ].ele[1]), control->tabulate + 1 );
+                            &(LR[ index_lr (i, j, num_atom_types) ].ele[1]), control->tabulate + 1 );
                     Natural_Cubic_Spline( &h[1], &fCEclmb[1],
-                                          &(LR[ index_lr (i, j, num_atom_types) ].CEclmb[1]), control->tabulate + 1 );
+                            &(LR[ index_lr (i, j, num_atom_types) ].CEclmb[1]), control->tabulate + 1 );
                 }
+            }
+        }
+    }
 
-    free(h);
-    free(fh);
-    free(fvdw);
-    free(fCEvd);
-    free(fele);
-    free(fCEclmb);
+    sfree( h, "Init_Lookup_Tables::h" );
+    sfree( fh, "Init_Lookup_Tables::fh" );
+    sfree( fvdw, "Init_Lookup_Tables::fvdw" );
+    sfree( fCEvd, "Init_Lookup_Tables::fCEvd" );
+    sfree( fele, "Init_Lookup_Tables::fele" );
+    sfree( fCEclmb, "Init_Lookup_Tables::fCEclmb" );
 
 #ifdef HAVE_CUDA
     //copy the LR_Table to the device here.
-    t_start = Get_Time ();
-    copy_LR_table_to_device (system, control, aggregated);
-    t_end = Get_Timing_Info ( t_start );
+    t_start = Get_Time( );
+    copy_LR_table_to_device( system, control, aggregated );
+    t_end = Get_Timing_Info( t_start );
 
-    fprintf (stderr, "Device copy of LR Lookup table: %f \n", t_end );
+    fprintf( stderr, "Device copy of LR Lookup table: %f \n", t_end );
 #endif
 
-    return 1;
+    return SUCCESS;
 }
 
+
 /*
-void copy_LR_table_to_device (reax_system *system, control_params *control, int aggregated)
+void copy_LR_table_to_device( reax_system *system, control_params *control, int aggregated )
 {
   int i, j, r;
   int num_atom_types;
@@ -329,7 +342,7 @@ void copy_LR_table_to_device (reax_system *system, control_params *control, int
 
   fprintf (stderr, "Copying the LR Lookyp Table to the device ... \n");
 
-  cuda_malloc ((void **) &d_LR, sizeof (LR_lookup_table) * ( num_atom_types * num_atom_types ), 0, "LR_lookup:table");
+  cuda_malloc ((void **) &d_LR, sizeof (LR_lookup_table) * ( num_atom_types * num_atom_types ), FALSE, "LR_lookup:table");
 
   for( i = 0; i < MAX_ATOM_TYPES; ++i )
     existing_types[i] = 0;
@@ -346,37 +359,37 @@ void copy_LR_table_to_device (reax_system *system, control_params *control, int
 
          if( aggregated [j] ) {
 
-            cuda_malloc ((void **) &d_y, sizeof (LR_data) * (control->tabulate + 1), 0, "LR_lookup:d_y");
+            cuda_malloc ((void **) &d_y, sizeof (LR_data) * (control->tabulate + 1), FALSE, "LR_lookup:d_y");
             copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].y, d_y,
                     sizeof (LR_data) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:y");
             copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y,
                     sizeof (LR_data *), cudaMemcpyHostToDevice, "LR_lookup:y");
 
-            cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:h");
+            cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:h");
             copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].H, temp,
                     sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:h");
             copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H,
                     sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:h");
 
-            cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:vdW");
+            cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:vdW");
             copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].vdW, temp,
                     sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:vdW");
             copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW,
                     sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:vdW");
 
-            cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:CEvd");
+            cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:CEvd");
             copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEvd, temp,
                     sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:CEvd");
             copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd,
                     sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:CDvd");
 
-            cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:ele");
+            cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:ele");
             copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].ele, temp,
                     sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ele");
             copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele,
                     sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ele");
 
-            cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:ceclmb");
+            cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:ceclmb");
             copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEclmb, temp,
                     sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ceclmb");
             copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb,
diff --git a/PG-PuReMD/src/lookup.h b/PG-PuReMD/src/lookup.h
index f6e45bd17d6b3eb3bd0f723c9888244b5767e686..4db34ce0234f86fed309f51909ec8e4ca070c37c 100644
--- a/PG-PuReMD/src/lookup.h
+++ b/PG-PuReMD/src/lookup.h
@@ -26,7 +26,17 @@
 
 //extern LR_lookup_table **LR;
 
+
+#ifdef __cplusplus
+extern "C" { /* give the declaration C linkage when included from C++ */
+#endif
+
 int Init_Lookup_Tables( reax_system*, control_params*, real *,
-                        mpi_datatypes*, char* );
+        mpi_datatypes*, char* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/matvec.cu b/PG-PuReMD/src/matvec.cu
deleted file mode 100644
index dcde4165643addc7a8b70f9f911ed5ad964bd95f..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/matvec.cu
+++ /dev/null
@@ -1,91 +0,0 @@
-
-
-#include "matvec.h"
-#include "cuda_shuffle.h"
-
-//one thread per row
-CUDA_GLOBAL void k_matvec (sparse_matrix H, real *vec, real *results, int rows)
-{
-    real results_row = 0;
-    int col;
-    real val;
-
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= rows) return;
-
-    for (int c = H.start[i]; c < H.end[i]; c++)
-    {
-        col = H.entries [c].j;
-        val = H.entries[c].val;
-
-        results_row += val * vec [col];
-    }
-
-    results [i] = results_row;
-}
-
-//32 thread warp per matrix row.
-//invoked as follows
-// <<< system->N, 32 >>>
-//CUDA_GLOBAL void __launch_bounds__(384, 16) k_matvec_csr(sparse_matrix H, real *vec, real *results, int num_rows)
-CUDA_GLOBAL void k_matvec_csr(sparse_matrix H, real *vec, real *results, int num_rows)
-{
-#if defined(__SM_35__)
-    real vals;
-#else
-    extern __shared__ real vals [];
-#endif
-    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
-    int lane = thread_id & ( MATVEC_KER_THREADS_PER_ROW - 1);
-
-    int row_start;
-    int row_end;
-
-    // one warp per row
-    //int row = warp_id;
-    int row = warp_id;
-    //if (row < num_rows)
-    {
-#if defined(__SM_35__)
-        vals = 0;
-#else
-        vals[threadIdx.x] = 0;
-#endif
-
-        if (row < num_rows) {
-            row_start = H.start[row];
-            row_end = H.end[row];
-
-            // compute running sum per thread
-            for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW)
-#if defined(__SM_35__)
-                vals += H.entries[jj].val * vec [ H.entries[jj].j ];
-        }
-#else
-        vals[threadIdx.x] += H.entries[jj].val * vec [ H.entries[jj].j ];
-    }
-    __syncthreads ();
-#endif
-
-    // parallel reduction in shared memory
-    //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
-#if defined(__SM_35__)
-    for (int x = MATVEC_KER_THREADS_PER_ROW >> 1; x >= 1; x/=2)
-        vals += shfl( vals, x );
-
-    if (lane == 0 && row < num_rows)
-        results[row] = vals;
-#else
-    if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16]; __syncthreads();
-    if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8]; __syncthreads ();
-    if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4]; __syncthreads ();
-    if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2]; __syncthreads ();
-    if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1]; __syncthreads ();
-
-    // first thread writes the result
-    if (lane == 0 && row < num_rows)
-        results[row] = vals[threadIdx.x];
-#endif
-}
-}
diff --git a/PG-PuReMD/src/matvec.h b/PG-PuReMD/src/matvec.h
deleted file mode 100644
index 680abea7dc26be6510392a832a6fd7414f596281..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/matvec.h
+++ /dev/null
@@ -1,12 +0,0 @@
-
-
-#ifndef __MATVEC__H_
-#define __MATVEC__H_
-
-#include "reax_types.h"
-#include "reax_types.h"
-
-CUDA_GLOBAL void k_matvec (sparse_matrix , real *, real *, int );
-CUDA_GLOBAL void k_matvec_csr(sparse_matrix , real *, real *, int );
-
-#endif
diff --git a/PG-PuReMD/src/multi_body.c b/PG-PuReMD/src/multi_body.c
index bf36ca720329c6486debdb1359d5269f8c4a07eb..aab4957d0ae3923f8278f91f88ffc49eb0180b2d 100644
--- a/PG-PuReMD/src/multi_body.c
+++ b/PG-PuReMD/src/multi_body.c
@@ -20,23 +20,25 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
-#include "multi_body.h"
-#include "bond_orders.h"
-#include "list.h"
-#include "vector.h"
+  #include "multi_body.h"
+  #include "bond_orders.h"
+  #include "list.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_multi_body.h"
-#include "reax_bond_orders.h"
-#include "reax_list.h"
-#include "reax_vector.h"
+  #include "reax_multi_body.h"
+  #include "reax_bond_orders.h"
+  #include "reax_list.h"
+  #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 void Atom_Energy( reax_system *system, control_params *control,
-                  simulation_data *data, storage *workspace, reax_list **lists,
-                  output_controls *out_control )
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control )
 {
     int i, j, pj, type_i, type_j;
     real Delta_lpcorr, dfvl;
@@ -140,7 +142,6 @@ void Atom_Energy( reax_system *system, control_params *control,
                 }
     }
 
-
     for ( i = 0; i < system->n; ++i )
     {
         type_i = system->my_atoms[i].type;
diff --git a/PG-PuReMD/src/multi_body.h b/PG-PuReMD/src/multi_body.h
index aaed59e559d21760354544453286202a6deaef38..9cc865b4b1601f1efc4f0cb92006733025b2c10e 100644
--- a/PG-PuReMD/src/multi_body.h
+++ b/PG-PuReMD/src/multi_body.h
@@ -24,7 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" { /* give the declaration C linkage when included from C++ */
+#endif
+
 void Atom_Energy( reax_system*, control_params*, simulation_data*,
-                  storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/neighbors.c b/PG-PuReMD/src/neighbors.c
index 01ed2482f341b3fb1bb580031083edbdb7c6364a..732a6e7da4bdbcb422308246db9916d66870385c 100644
--- a/PG-PuReMD/src/neighbors.c
+++ b/PG-PuReMD/src/neighbors.c
@@ -19,14 +19,16 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "neighbors.h"
+
+#include "index_utils.h"
 #include "io_tools.h"
 #include "list.h"
 #include "tool_box.h"
 #include "vector.h"
 
-#include "index_utils.h"
-
 
 int compare_far_nbrs( const void *p1, const void *p2 )
 {
@@ -35,11 +37,11 @@ int compare_far_nbrs( const void *p1, const void *p2 )
 
 
 void Draw_Near_Neighbor_Box( reax_system *system, control_params *control,
-                             storage *workspace )
+        storage *workspace )
 {
-    int  i;
-    reax_atom       *atom;
-    simulation_box  *my_box;
+    int i;
+    reax_atom *atom;
+    simulation_box *my_box;
     boundary_cutoff *bc;
 
     my_box = &( system->my_box );
@@ -47,7 +49,9 @@ void Draw_Near_Neighbor_Box( reax_system *system, control_params *control,
 
     /* all native atoms are within near neighbor skin */
     for ( i = 0; i < system->n; ++i )
+    {
         workspace->within_bond_box[i] = 1;
+    }
 
     /* loop over imported atoms */
     for ( i = system->n; i < system->N; ++i )
@@ -60,15 +64,19 @@ void Draw_Near_Neighbor_Box( reax_system *system, control_params *control,
                 atom->x[1] <= my_box->max[1] + bc->ghost_bond &&
                 my_box->min[2] - bc->ghost_bond <= atom->x[2] &&
                 atom->x[2] <= my_box->max[2] + bc->ghost_bond )
+        {
             workspace->within_bond_box[i] = 1;
+        }
         else
+        {
             workspace->within_bond_box[i] = 0;
+        }
     }
 }
 
 
 void Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
-                              storage *workspace, reax_list **lists )
+        storage *workspace, reax_list **lists )
 {
     int  i, j, k, l, m, itr, num_far;
     real d, cutoff;
@@ -84,49 +92,48 @@ void Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
     real t_start = 0, t_elapsed = 0;
 
     if ( system->my_rank == MASTER_NODE )
+    {
         t_start = Get_Time( );
+    }
 #endif
 
-    // fprintf( stderr, "\n\tentered nbrs - " );
     g = &( system->my_grid );
     far_nbrs = (*lists) + FAR_NBRS;
     num_far = 0;
 
     /* first pick up a cell in the grid */
     for ( i = 0; i < g->ncells[0]; i++ )
+    {
         for ( j = 0; j < g->ncells[1]; j++ )
+        {
             for ( k = 0; k < g->ncells[2]; k++ )
             {
-                //SUDHIR
-                //gci = &(g->cells[i][j][k]);
                 gci = &(g->cells[ index_grid_3d(i, j, k, g) ]);
-                //cutoff = SQR(gci->cutoff);
                 cutoff = SQR(g->cutoff[index_grid_3d(i, j, k, g)]);
-                //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
 
                 /* pick up an atom from the current cell */
-                for (l = g->str[index_grid_3d(i, j, k, g)]; l < g->end[index_grid_3d(i, j, k, g)]; ++l )
+                for ( l = g->str[index_grid_3d(i, j, k, g)]; l < g->end[index_grid_3d(i, j, k, g)]; ++l )
                 {
                     atom1 = &(system->my_atoms[l]);
                     Set_Start_Index( l, num_far, far_nbrs );
-                    //fprintf( stderr, "\tatom %d\n", atom1 );
 
                     itr = 0;
-                    //while( (gcj=gci->nbrs[itr]) != NULL ) {
+                    /* search through neighbor grid cell candidates of current cell */
                     while ( (g->nbrs_x[index_grid_nbrs(i, j, k, itr, g)][0]) >= 0 )
                     {
 
-                        ivec_Copy (nbrs_x, g->nbrs_x[index_grid_nbrs(i, j, k, itr, g)]);
-                        gcj = &( g->cells [ index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g) ] );
+                        ivec_Copy( nbrs_x, g->nbrs_x[index_grid_nbrs(i, j, k, itr, g)] );
+                        gcj = &( g->cells[ index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g) ] );
 
                         if ( g->str[index_grid_3d(i, j, k, g)] <= g->str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g)] &&
                                 (DistSqr_to_Special_Point(g->nbrs_cp[index_grid_nbrs(i, j, k, itr, g)], atom1->x) <= cutoff) )
-                            /* pick up another atom from the neighbor cell */
-                            //for( m = gcj->str; m < gcj->end; ++m )
+                        {
+                            /* pick up another atom from the neighbor grid cell */
                             for ( m = g->str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g)];
                                     m < g->end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g)]; ++m )
                             {
-                                if ( l < m )  // prevent recounting same pairs within a gcell
+                                /* prevent recounting same pairs within a gcell */
+                                if ( l < m )
                                 {
                                     atom2 = &(system->my_atoms[m]);
                                     dvec[0] = atom2->x[0] - atom1->x[0];
@@ -137,28 +144,33 @@ void Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
                                     {
                                         nbr_data = &(far_nbrs->select.far_nbr_list[num_far]);
                                         nbr_data->nbr = m;
-                                        nbr_data->d = SQRT(d);
+                                        nbr_data->d = SQRT( d );
                                         rvec_Copy( nbr_data->dvec, dvec );
                                         //ivec_Copy( nbr_data->rel_box, gcj->rel_box );
                                         //ivec_ScaledSum( nbr_data->rel_box, 1, gcj->rel_box, -1, gci->rel_box );
-                                        ivec_ScaledSum( nbr_data->rel_box, 1, g->rel_box[ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], g) ],
-                                                        -1, g->rel_box[index_grid_3d (i, j, k, g)] );
+                                        ivec_ScaledSum( nbr_data->rel_box, 1,
+                                                g->rel_box[ index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g) ],
+                                                -1, g->rel_box[index_grid_3d(i, j, k, g)] );
                                         ++num_far;
                                     }
                                 }
                             }
+                        }
+
                         ++itr;
                     }
+
                     Set_End_Index( l, num_far, far_nbrs );
-                    //fprintf(stderr, "i:%d, start: %d, end: %d - itr: %d\n",
-                    //  atom1,Start_Index(atom1,far_nbrs),End_Index(atom1,far_nbrs),
-                    //  itr);
                 }
             }
+        }
+    }
 
+#if defined(DEBUG)
     fprintf (stderr, " HOST NEIGHBOR COUNT: %d \n", num_far );
+#endif
 
-    workspace->realloc.num_far = num_far;
+    workspace->realloc.far_nbrs = num_far;
 
 #if defined(LOG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
@@ -176,9 +188,11 @@ void Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
 
 #if defined(TEST_ENERGY) || defined(TEST_FORCES)
     for ( i = 0; i < system->N; ++i )
+    {
         qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]),
                Num_Entries(i, far_nbrs), sizeof(far_neighbor_data),
                compare_far_nbrs );
+    }
 #endif
 }
 
@@ -199,38 +213,44 @@ int Estimate_NumNeighbors( reax_system *system, reax_list **lists )
 
     /* first pick up a cell in the grid */
     for ( i = 0; i < g->ncells[0]; i++ )
+    {
         for ( j = 0; j < g->ncells[1]; j++ )
+        {
             for ( k = 0; k < g->ncells[2]; k++ )
             {
-                //SUDHIR
-                //gci = &(g->cells[i][j][k]);
-                gci = &(g->cells[ index_grid_3d (i, j, k, g) ]);
-                //cutoff = SQR(gci->cutoff);
-                cutoff = SQR(g->cutoff [index_grid_3d (i, j, k, g)]);
+                gci = &(g->cells[ index_grid_3d(i, j, k, g) ]);
+                cutoff = SQR(g->cutoff[ index_grid_3d(i, j, k, g) ]);
 
                 //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
 
                 /* pick up an atom from the current cell */
-                for ( l = g->str[index_grid_3d (i, j, k, g)]; l < g->end[index_grid_3d (i, j, k, g)]; ++l )
+                for ( l = g->str[ index_grid_3d(i, j, k, g) ]; l < g->end[ index_grid_3d( i, j, k, g) ]; ++l )
                 {
                     atom1 = &(system->my_atoms[l]);
-                    if (l == 0) fprintf (stderr, "atom 0 has (%d %d %d) (%f %f %f) \n",
-                                             i, j, k, atom1->x[0], atom1->x[1], atom1->x[2]);
+
+#if defined(DEBUG)
+                    if (l == 0)
+                    {
+                        fprintf (stderr, "atom 0 has (%d %d %d) (%f %f %f) \n",
+                                i, j, k, atom1->x[0], atom1->x[1], atom1->x[2]);
+                    }
+#endif
+
                     //fprintf( stderr, "\tatom %d: ", l );
                     //tmp = num_far; tested = 0;
                     itr = 0;
-                    while ( (g->nbrs_x[index_grid_nbrs (i, j, k, itr, g)][0]) >= 0)
+                    while ( (g->nbrs_x[index_grid_nbrs(i, j, k, itr, g)][0]) >= 0 )
                     {
 
-                        ivec_Copy (nbrs_x, g->nbrs_x[index_grid_nbrs (i, j, k, itr, g)]);
+                        ivec_Copy( nbrs_x, g->nbrs_x[index_grid_nbrs(i, j, k, itr, g)] );
 
-                        if (g->str[index_grid_3d (i, j, k, g)] <= g->str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], g)] &&
-                                (DistSqr_to_Special_Point(g->nbrs_cp[index_grid_nbrs (i, j, k, itr, g)], atom1->x) <= cutoff))
+                        if ( g->str[index_grid_3d(i, j, k, g)] <= g->str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g)] &&
+                                (DistSqr_to_Special_Point(g->nbrs_cp[index_grid_nbrs(i, j, k, itr, g)], atom1->x) <= cutoff) )
                             //fprintf( stderr, "\t\tgcell2: %d\n", itr );
                             /* pick up another atom from the neighbor cell */
                             //for( m = gcj->str; m < gcj->end; ++m )
-                            for ( m = g->str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], g)];
-                                    m < g->end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], g)]; ++m )
+                            for ( m = g->str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g)];
+                                    m < g->end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g)]; ++m )
                             {
                                 if ( l < m )
                                 {
@@ -241,26 +261,32 @@ int Estimate_NumNeighbors( reax_system *system, reax_list **lists )
                                     dvec[2] = atom2->x[2] - atom1->x[2];
                                     d = rvec_Norm_Sqr( dvec );
                                     if ( d <= cutoff )
+                                    {
                                         ++num_far;
+                                    }
                                 }
                             }
+
                         ++itr;
+
                         //fprintf( stderr, "itr: %d, tested: %d, num_nbrs: %d\n",
                         //   itr, tested, num_far-tmp );
                     }
                 }
             }
-
-    fprintf (stderr, "Total number of host neighbors: %d \n", num_far);
+        }
+    }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: estimate nbrs done - num_far=%d\n",
              system->my_rank, num_far );
     MPI_Barrier( MPI_COMM_WORLD );
 #endif
+
     return MAX( num_far * SAFE_ZONE, MIN_CAP * MIN_NBRS );
 }
 
+
 /*
 int Estimate_NumNeighbors1( reax_system *system, reax_list **lists )
 {
diff --git a/PG-PuReMD/src/neighbors.h b/PG-PuReMD/src/neighbors.h
index 0a1e3daf289883268e77fbefd7f6a24deaa582dd..37c3642b7d243d472dd0eb531c2b89ce6a94a06a 100644
--- a/PG-PuReMD/src/neighbors.h
+++ b/PG-PuReMD/src/neighbors.h
@@ -31,8 +31,18 @@
                      int, int*, int*, int*, int,
                      int, int, real, rvec, ivec );*/
 
-void Generate_Neighbor_Lists( reax_system*, simulation_data*, storage*,
-                              reax_list** );
+
+#ifdef __cplusplus
+extern "C" { /* give these prototypes C linkage when compiled as C++ */
+#endif
+
+void Generate_Neighbor_Lists( reax_system*, simulation_data*, storage*, reax_list** );
+
 int Estimate_NumNeighbors( reax_system*, reax_list** );
 
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/nonbonded.c b/PG-PuReMD/src/nonbonded.c
index c81c5f3e11d70383323e0dda3b63788fa2d48b2a..e073ec6252f2b76cefbfe5e10bb5e3c43d9e006c 100644
--- a/PG-PuReMD/src/nonbonded.c
+++ b/PG-PuReMD/src/nonbonded.c
@@ -20,24 +20,26 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
-#include "nonbonded.h"
-#include "bond_orders.h"
-#include "list.h"
-#include "vector.h"
+  #include "nonbonded.h"
+  #include "bond_orders.h"
+  #include "list.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_nonbonded.h"
-#include "reax_bond_orders.h"
-#include "reax_list.h"
-#include "reax_lookup.h"
-#include "reax_vector.h"
+  #include "reax_nonbonded.h"
+  #include "reax_bond_orders.h"
+  #include "reax_list.h"
+  #include "reax_lookup.h"
+  #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 void vdW_Coulomb_Energy( reax_system *system, control_params *control,
-                         simulation_data *data, storage *workspace,
-                         reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
     int i, j, pj, natoms;
     int start_i, end_i, orig_i, orig_j;
@@ -142,7 +144,7 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
                 /*Coulomb Calculations*/
                 dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-                dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+                dr3gamij_3 = POW( dr3gamij_1 , 1.0 / 3.0 );
 
                 tmp = Tap / dr3gamij_3;
                 data->my_en.e_ele += e_ele =
@@ -450,7 +452,7 @@ void LR_vdW_Coulomb( reax_system *system, real *workspace_Tap,
 
     /* Coulomb calculations */
     dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+    dr3gamij_3 = POW( dr3gamij_1 , 1.0 / 3.0 );
 
     tmp = Tap / dr3gamij_3;
     lr->H = EV_to_KCALpMOL * tmp;
diff --git a/PG-PuReMD/src/nonbonded.h b/PG-PuReMD/src/nonbonded.h
index 81613be54f9581c64461ba9f1a3e2002403d63e0..45137bf894c5726b8dbaf1ab9b33905581aed4a4 100644
--- a/PG-PuReMD/src/nonbonded.h
+++ b/PG-PuReMD/src/nonbonded.h
@@ -24,14 +24,24 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" { /* give these prototypes C linkage when compiled as C++ */
+#endif
+
 void vdW_Coulomb_Energy( reax_system*, control_params*, simulation_data*,
-                         storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
 
 void Tabulated_vdW_Coulomb_Energy( reax_system*, control_params*,
-                                   simulation_data*, storage*,
-                                   reax_list**, output_controls* );
+        simulation_data*, storage*, reax_list**, output_controls* );
 
 void Compute_Polarization_Energy( reax_system*, simulation_data* );
 
 void LR_vdW_Coulomb( reax_system*, real *, int, int, real, LR_data* );
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c
index 5020d59b46d3fef334c3a0d05623594e9f82eff1..9ace1c3276e9598f40d5567460f9c5a57db0955f 100644
--- a/PG-PuReMD/src/parallelreax.c
+++ b/PG-PuReMD/src/parallelreax.c
@@ -20,6 +20,8 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
+
+#include "allocate.h"
 #include "analyze.h"
 #include "comm_tools.h"
 #include "control.h"
@@ -38,10 +40,16 @@
 #include "vector.h"
 
 #ifdef HAVE_CUDA
-#include "cuda_environment.h"
-#include "cuda_post_evolve.h"
-
-#include "validation.h"
+  #include "cuda/cuda_copy.h"
+  #include "cuda/cuda_environment.h"
+  #include "cuda/cuda_forces.h"
+  #include "cuda/cuda_init_md.h"
+  #include "cuda/cuda_neighbors.h"
+  #include "cuda/cuda_post_evolve.h"
+  #include "cuda/cuda_reset_tools.h"
+  #include "cuda/cuda_system_props.h"
+  #include "cuda/cuda_utils.h"
+  #include "cuda/cuda_validation.h"
 #endif
 
 evolve_function Evolve;
@@ -49,17 +57,15 @@ evolve_function Cuda_Evolve;
 LR_lookup_table *LR;
 LR_lookup_table *d_LR;
 
-////////////////////////////
-//CUDA SPECIFIC DECLARATIONS
-////////////////////////////
-reax_list   **dev_lists;
-storage         *dev_workspace;
-void            *scratch;
-void            *host_scratch;
+/* CUDA SPECIFIC DECLARATIONS */
+reax_list **dev_lists;
+storage *dev_workspace;
+void *scratch;
+void *host_scratch;
 
-int         BLOCKS, BLOCKS_POW_2, BLOCK_SIZE;
-int         BLOCKS_N, BLOCKS_POW_2_N;
-int         MATVEC_BLOCKS;
+int BLOCKS, BLOCKS_POW_2, BLOCK_SIZE;
+int BLOCKS_N, BLOCKS_POW_2_N;
+int MATVEC_BLOCKS;
 
 
 void Read_System( char *geo_file, char *ffield_file, char *control_file,
@@ -67,7 +73,7 @@ void Read_System( char *geo_file, char *ffield_file, char *control_file,
         storage *workspace, output_controls *out_control, mpi_datatypes *mpi_data )
 {
     /* ffield file */
-    Read_Force_Field( ffield_file, &(system->reax_param), control );
+    Read_Force_Field( ffield_file, &(system->reax_param), system, control );
 
     /* control file */
     Read_Control_File( control_file, control, out_control );
@@ -106,7 +112,7 @@ void Post_Evolve( reax_system* system, control_params* control,
     int i;
     rvec diff, cross;
 
-    /* remove trans & rot velocity of the center of mass from system */
+    /* remove translational and rotational velocity of the center of mass from system */
     if ( control->ensemble != NVE && control->remove_CoM_vel &&
             data->step % control->remove_CoM_vel == 0 )
     {
@@ -115,23 +121,23 @@ void Post_Evolve( reax_system* system, control_params* control,
 
         for ( i = 0; i < system->n; i++ )
         {
-            /* remove translational vel */
+            /* remove translational term */
             rvec_ScaledAdd( system->my_atoms[i].v, -1., data->vcm );
 
-            /* remove rotational */
+            /* remove rotational term */
             rvec_ScaledSum( diff, 1., system->my_atoms[i].x, -1., data->xcm );
             rvec_Cross( cross, data->avcm, diff );
             rvec_ScaledAdd( system->my_atoms[i].v, -1., cross );
         }
     }
 
-    /* compute kinetic energy of the system */
+    /* compute kinetic energy of system */
     Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
 }
 
 
 #ifdef HAVE_CUDA
-void Cuda_Post_Evolve( reax_system* system, control_params* control,
+int Cuda_Post_Evolve( reax_system* system, control_params* control,
         simulation_data* data, storage* workspace, reax_list** lists,
         output_controls *out_control, mpi_datatypes *mpi_data )
 {
@@ -142,37 +148,20 @@ void Cuda_Post_Evolve( reax_system* system, control_params* control,
         /* compute velocity of the center of mass */
         Cuda_Compute_Center_of_Mass( system, data, mpi_data, mpi_data->comm_mesh3D );
 
-        post_evolve_velocities (system, data);
+        post_evolve_velocities( system, data );
     }
 
     /* compute kinetic energy of the system */
     Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-}
-#endif
-
 
-#ifdef HAVE_CUDA
-void init_blocks(reax_system *system)
-{
-    compute_blocks( &BLOCKS, &BLOCK_SIZE, system->n );
-    compute_nearest_pow_2( BLOCKS, &BLOCKS_POW_2 );
-
-    compute_blocks( &BLOCKS_N, &BLOCK_SIZE, system->N );
-    compute_nearest_pow_2( BLOCKS_N, &BLOCKS_POW_2_N );
-
-    compute_matvec_blocks( &MATVEC_BLOCKS, system->N );
-
-#if defined(__CUDA_DEBUG_LOG__)
-    fprintf( stderr, " MATVEC_BLOCKS: %d BLOCKSIZE: %d  - N:%d \n",
-            MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, system->N );
-#endif
+    return SUCCESS;
 }
 #endif
 
 
-static void usage(char* argv[])
+static void usage( char* argv[] )
 {
-    fprintf(stderr, "usage: ./%s geometry ffield control\n", argv[0]);
+    fprintf( stderr, "usage: ./%s geometry ffield control\n", argv[0] );
 }
 
 
@@ -185,39 +174,20 @@ int main( int argc, char* argv[] )
     reax_list **lists;
     output_controls *out_control;
     mpi_datatypes *mpi_data;
-    int i;
+    int i, ret, retries;
     real t_start = 0, t_elapsed;
+#if defined(DEBUG)
     real t_begin, t_end;
+#endif
 
     if ( argc != 4 )
     {
-        usage(argv);
+        usage( argv );
         exit( INVALID_INPUT );
     }
 
 #ifdef HAVE_CUDA
 
-    /* Remove this debug information later */
-#if defined(__CUDA_DEBUG_LOG__)
-    fprintf (stderr, " Size of LR Lookup table %d \n", sizeof (LR_lookup_table) );
-#endif
-
-#if defined( __SM_35__)
-    fprintf (stderr, " nbrs block size: %d \n", NBRS_BLOCK_SIZE);
-    fprintf (stderr, " nbrs threads per atom: %d \n",  NB_KER_THREADS_PER_ATOM);
-
-    fprintf (stderr, " hbonds block size: %d \n",  HB_BLOCK_SIZE);
-    fprintf (stderr, " hbonds threads per atom: %d \n",  HB_KER_THREADS_PER_ATOM);
-
-    fprintf (stderr, " vdw block size: %d \n",  VDW_BLOCK_SIZE);
-    fprintf (stderr, " vdw threads per atom: %d \n",  VDW_KER_THREADS_PER_ATOM);
-
-    fprintf (stderr, " matvec block size: %d \n",  MATVEC_BLOCK_SIZE);
-    fprintf (stderr, " matvec threads per atom: %d \n",  MATVEC_KER_THREADS_PER_ROW);
-
-    fprintf (stderr, " General block size: %d \n",  DEF_BLOCK_SIZE);
-#endif
-
     /* allocate main data structures */
     system = (reax_system *) smalloc( sizeof(reax_system), "system" );
     control = (control_params *) smalloc( sizeof(control_params), "control" );
@@ -227,9 +197,8 @@ int main( int argc, char* argv[] )
     for ( i = 0; i < LIST_N; ++i )
     {
         lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" );
-        lists[i]->allocated = 0;
+        lists[i]->allocated = FALSE;
 
-        //initialize here TODO
         lists[i]->n = 0;
         lists[i]->num_intrs = 0;
         lists[i]->index = NULL;
@@ -238,47 +207,40 @@ int main( int argc, char* argv[] )
     }
     out_control = (output_controls *) smalloc( sizeof(output_controls), "out_control" );
     mpi_data = (mpi_datatypes *) smalloc( sizeof(mpi_datatypes), "mpi_data" );
+    mpi_data->in1_buffer = NULL;
+    mpi_data->in2_buffer = NULL;
 
-    /* allocate the cuda auxiliary data structures */
+    /* allocate auxiliary data structures (GPU) */
     dev_workspace = (storage *) smalloc( sizeof(storage), "dev_workspace" );
     dev_lists = (reax_list **) smalloc ( LIST_N * sizeof (reax_list *), "dev_lists" );
     for ( i = 0; i < LIST_N; ++i )
     {
         dev_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" );
-        dev_lists[i]->allocated = 0;
+        dev_lists[i]->allocated = FALSE;
+        dev_lists[i]->n = 0; /* zero the fields of the entry allocated just above */
+        dev_lists[i]->num_intrs = 0;
+        dev_lists[i]->index = NULL;
+        dev_lists[i]->end_index = NULL;
+        dev_lists[i]->select.v = NULL;
     }
 
-    /* Initialize member variables */
-    system->init_thblist = FALSE;
-
     /* setup MPI environment */
     MPI_Init( &argc, &argv );
     MPI_Comm_size( MPI_COMM_WORLD, &(control->nprocs) );
     MPI_Comm_rank( MPI_COMM_WORLD, &(system->my_rank) );
-    system->wsize = control->nprocs;
-    system->global_offset = (int *)scalloc(system->wsize + 1, sizeof(int), "global_offset");
-
-    /* setup the CUDA Device for this process can be on the same machine
-    * or on a different machine, for now use the rank to compute the device
-    * This will only work on a single machine with 2 GPUs */
-    Setup_Cuda_Environment( system->my_rank, control->nprocs, control->gpus_per_node );
-    //Cleanup_Cuda_Environment ();
-    //
-#if defined(DEBUG)
-    print_device_mem_usage ();
-    fprintf( stderr, "p%d: Total number of GPUs on this node -- %d\n", system->my_rank, my_device_id);
-#endif
 
     /* read system config files */
     Read_System( argv[1], argv[2], argv[3], system, control,
             data, workspace, out_control, mpi_data );
 
+    /* setup the CUDA Device for this process */
+    Setup_Cuda_Environment( system->my_rank, control->nprocs, control->gpus_per_node );
+
 #if defined(DEBUG)
-    fprintf( stderr, "p%d: read simulation info\n", system->my_rank );
-    MPI_Barrier( MPI_COMM_WORLD );
+    print_device_mem_usage( );
 #endif
 
-    /* init the blocks sizes for cuda kernels */
+    /* init blocks sizes */
     init_blocks( system );
 
     /* measure total simulation time after input is read */
@@ -295,97 +257,65 @@ int main( int argc, char* argv[] )
 #endif
 
 #if defined(DEBUG)
-    print_device_mem_usage ();
+    print_device_mem_usage( );
 #endif
 
     /* init the blocks sizes for cuda kernels */
     init_blocks( system );
 
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: initializated data structures\n", system->my_rank );
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-    //END OF FIRST STEP
-
-    // compute f_0
-    Comm_Atoms( system, control, data, workspace, lists, mpi_data, 1 );
+    /* compute f_0 */
+    Comm_Atoms( system, control, data, workspace, lists, mpi_data, TRUE );
     Sync_Atoms( system );
     Sync_Grid( &system->my_grid, &system->d_my_grid );
-    init_blocks (system);
-
-#if defined(__CUDA_DENUG_LOG__)
-    fprintf( stderr, "p%d: Comm_Atoms synchronized \n", system->my_rank );
-#endif
+    init_blocks( system );
 
-    //Second step
-    Cuda_Reset ( system, control, data, workspace, lists );
+    Cuda_Reset( system, control, data, workspace, lists );
 
 #if defined(__CUDA_DEBUG__)
     Reset( system, control, data, workspace, lists );
 #endif
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: Cuda_Reset done...\n", system->my_rank );
-#endif
 
-    //Third Step
     Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
 
 #if defined(__CUDA_DEBUG__)
     Generate_Neighbor_Lists( system, data, workspace, lists );
 #endif
 
-#if defined(DEBUG)
-    fprintf (stderr, "p%d: Cuda_Generate_Neighbor_Lists done...\n", system->my_rank );
-#endif
-
-
-    //Fourth Step
-#if defined(DEBUG)
-    fprintf (stderr, " Host Compute Forces begin.... \n");
-#endif
-
 #if defined(__CUDA_DEBUG__)
     Compute_Forces( system, control, data, workspace,
-                    lists, out_control, mpi_data );
+            lists, out_control, mpi_data );
 #endif
 
     Cuda_Compute_Forces( system, control, data, workspace, lists,
             out_control, mpi_data );
 
-#if defined(DEBUG)
-    fprintf (stderr, "p%d: Cuda_Compute_Forces done...\n", system->my_rank );
-#endif
-
 #if defined (__CUDA_DEBUG__)
     Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
 #endif
 
     Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
 
-#if defined(DEBUG)
-    fprintf (stderr, "p%d: Cuda_Compute_Kinetic_Energy done ... \n", system->my_rank);
-#endif
-
 #if defined(__CUDA_DEBUG__)
-    validate_device (system, data, workspace, lists);
+    validate_device( system, data, workspace, lists );
 #endif
 
 #if !defined(__CUDA_DEBUG__)
     Output_Results( system, control, data, lists, out_control, mpi_data );
 #endif
-#if defined(DEBUG)
-    fprintf (stderr, "p%d: Output_Results done ... \n", system->my_rank);
-#endif
 
 #if defined(DEBUG)
-    fprintf( stderr, "p%d: computed forces at t0\n", system->my_rank );
+    fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step );
     MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
-    // start the simulation
-    for ( ++data->step; data->step <= control->nsteps; data->step++ )
+    /* begin main simulation loop */
+    ++data->step;
+    retries = 0;
+    while ( data->step <= control->nsteps && retries < MAX_RETRIES )
     {
-        if ( control->T_mode )
+        ret = SUCCESS;
+
+        if ( control->T_mode && retries == 0 )
         {
             Temperature_Control( control, data );
         }
@@ -395,11 +325,11 @@ int main( int argc, char* argv[] )
 #endif
 
 #if defined(__CUDA_DEBUG__)
-        Evolve( system, control, data, workspace, lists, out_control, mpi_data );
+        ret = Evolve( system, control, data, workspace, lists, out_control, mpi_data );
 #endif
-
-        Cuda_Evolve( system, control, data, workspace, lists, out_control, mpi_data );
-
+    
+        ret = Cuda_Evolve( system, control, data, workspace, lists, out_control, mpi_data );
+    
 #if defined(DEBUG)
         t_end = Get_Timing_Info( t_begin );
         fprintf( stderr, " Evolve time: %f \n", t_end );
@@ -409,7 +339,11 @@ int main( int argc, char* argv[] )
         t_begin = Get_Time();
 #endif
 
-        Cuda_Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data);
+        if ( ret == SUCCESS )
+        {
+            ret = Cuda_Post_Evolve( system, control, data, workspace, lists,
+                    out_control, mpi_data );
+        }
 
 #if defined(__CUDA_DEBUG__)
         Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data);
@@ -420,35 +354,60 @@ int main( int argc, char* argv[] )
         fprintf( stderr, " Post Evolve time: %f \n", t_end );
 #endif
 
+        if ( ret == SUCCESS )
+        {
+            data->timing.num_retries = retries;
+
 #if !defined(__CUDA_DEBUG__)
-        Output_Results( system, control, data, lists, out_control, mpi_data );
+            Output_Results( system, control, data, lists, out_control, mpi_data );
 #endif
 
-        //Analysis(system, control, data, workspace, lists, out_control, mpi_data);
+//        Analysis(system, control, data, workspace, lists, out_control, mpi_data);
 
-        // dump restart info
-//    if( out_control->restart_freq &&
-//  (data->step-data->prev_steps) % out_control->restart_freq == 0 ) {
-//      if( out_control->restart_format == WRITE_ASCII )
-//  Write_Restart( system, control, data, out_control, mpi_data );
-//      else if( out_control->restart_format == WRITE_BINARY )
-//  Write_Binary_Restart( system, control, data, out_control, mpi_data );
-//    }
+        /* dump restart info */
+//        if ( out_control->restart_freq &&
+//                (data->step-data->prev_steps) % out_control->restart_freq == 0 )
+//        {
+//            if( out_control->restart_format == WRITE_ASCII )
+//            {
+//                Write_Restart( system, control, data, out_control, mpi_data );
+//            }
+//            else if( out_control->restart_format == WRITE_BINARY )
+//            {
+//                Write_Binary_Restart( system, control, data, out_control, mpi_data );
+//            }
+//        }
 
 #if defined(DEBUG)
-        fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step );
-        MPI_Barrier( MPI_COMM_WORLD );
+            fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step );
+            MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
+            ++data->step;
+            retries = 0;
+        }
+        else
+        {
+            ++retries;
+#if defined(DEBUG)
+            fprintf( stderr, "[INFO] p%d: retrying step %d...\n", system->my_rank, data->step );
+#endif
+        }
+    }
+
+    if ( retries >= MAX_RETRIES )
+    {
+        fprintf( stderr, "[ERROR] Maximum retries reached for this step (%d). Terminating...\n",
+              retries );
+        MPI_Abort( MPI_COMM_WORLD, MAX_RETRIES_REACHED );
     }
 
 #if defined(__CUDA_DEBUG__)
-    // vaildate the results in debug mode
-    validate_device (system, data, workspace, lists);
+    /* validate the results in debug mode */
+    validate_device( system, data, workspace, lists );
 #endif
 
 #else 
-
     /* allocate main data structures */
     system = (reax_system *) smalloc( sizeof(reax_system), "system" );
     control = (control_params *) smalloc( sizeof(control_params), "control" );
@@ -458,10 +417,9 @@ int main( int argc, char* argv[] )
     lists = (reax_list **) smalloc( LIST_N * sizeof(reax_list*), "lists" );
     for ( i = 0; i < LIST_N; ++i )
     {
+        // initialize here
 	lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" );
-        lists[i]->allocated = 0;
-
-        //initialize here TODO
+        lists[i]->allocated = FALSE;
         lists[i]->n = 0; 
         lists[i]->num_intrs = 0;
         lists[i]->index = NULL;
@@ -471,25 +429,20 @@ int main( int argc, char* argv[] )
     out_control = (output_controls *) smalloc( sizeof(output_controls), "out_control" );
     mpi_data = (mpi_datatypes *) smalloc( sizeof(mpi_datatypes), "mpi_data" );
 
+    //TODO: remove?
     /* allocate the cuda auxiliary data structures */
     dev_workspace = (storage *) smalloc( sizeof(storage), "dev_workspace" );
-    dev_lists = (reax_list **) smalloc ( LIST_N * sizeof (reax_list *), "dev_lists" );
+    dev_lists = (reax_list **) smalloc( LIST_N * sizeof(reax_list *), "dev_lists" );
     for ( i = 0; i < LIST_N; ++i )
     {
         dev_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" );
-        dev_lists[i]->allocated = 0;
+        dev_lists[i]->allocated = FALSE;
     }
 
-    /* Initialize member variables */
-    system->init_thblist = FALSE;
-
     /* setup MPI environment */
     MPI_Init( &argc, &argv );
     MPI_Comm_size( MPI_COMM_WORLD, &(control->nprocs) );
     MPI_Comm_rank( MPI_COMM_WORLD, &(system->my_rank) );
-    system->wsize = control->nprocs;
-    system->global_offset = (int*) scalloc( system->wsize + 1,
-            sizeof(int), "global_offset" );
 
     /* read system config files */
     Read_System( argv[1], argv[2], argv[3], system, control,
@@ -515,7 +468,7 @@ int main( int argc, char* argv[] )
 #endif
 
     /* compute f_0 */
-    Comm_Atoms( system, control, data, workspace, lists, mpi_data, 1 );
+    Comm_Atoms( system, control, data, workspace, lists, mpi_data, TRUE );
     Reset( system, control, data, workspace, lists );
 
 #if defined(DEBUG)
@@ -533,36 +486,64 @@ int main( int argc, char* argv[] )
 #endif
 
     /* start the simulation */
-    for ( ++data->step; data->step <= control->nsteps; data->step++ )
+    retries = 0; /* advance past the initial step first, as the removed for-loop and the CUDA path do */
+    for ( ++data->step; data->step <= control->nsteps && retries < MAX_RETRIES; )
     {
-        if ( control->T_mode )
+        ret = SUCCESS;
+
+        if ( control->T_mode && retries == 0 )
         {
             Temperature_Control( control, data );
         }
 
-        Evolve( system, control, data, workspace, lists, out_control, mpi_data );
-        Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data);
-        Output_Results( system, control, data, lists, out_control, mpi_data );
-        //Analysis(system, control, data, workspace, lists, out_control, mpi_data);
+        ret = Evolve( system, control, data, workspace, lists, out_control, mpi_data );
 
-        /* dump restart info */
-        if ( out_control->restart_freq &&
-                (data->step - data->prev_steps) % out_control->restart_freq == 0 )
+        if ( ret == SUCCESS )
         {
-            if ( out_control->restart_format == WRITE_ASCII )
-            {
-                Write_Restart( system, control, data, out_control, mpi_data );
-            }
-            else if ( out_control->restart_format == WRITE_BINARY )
+            Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data);
+        }
+
+        if ( ret == SUCCESS )
+        {
+            data->timing.num_retries = retries;
+
+            Output_Results( system, control, data, lists, out_control, mpi_data );
+
+//            Analysis(system, control, data, workspace, lists, out_control, mpi_data);
+
+            /* dump restart info */
+            if ( out_control->restart_freq &&
+                    (data->step - data->prev_steps) % out_control->restart_freq == 0 )
             {
-                Write_Binary_Restart( system, control, data, out_control, mpi_data );
+                if ( out_control->restart_format == WRITE_ASCII )
+                {
+                    Write_Restart( system, control, data, out_control, mpi_data );
+                }
+                else if ( out_control->restart_format == WRITE_BINARY )
+                {
+                    Write_Binary_Restart( system, control, data, out_control, mpi_data );
+                }
             }
-        }
 
 #if defined(DEBUG)
-        fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step );
-        MPI_Barrier( mpi_data->world );
+            fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step );
+            MPI_Barrier( mpi_data->world );
 #endif
+
+            ++data->step;
+            retries = 0;
+        }
+        else
+        {
+            ++retries;
+            fprintf( stderr, "[INFO] p%d: retrying step %d...\n", system->my_rank, data->step );
+        }
+    }
+
+    if ( retries >= MAX_RETRIES )
+    {
+        fprintf( stderr, "Maximum retries reached for this step. Terminating...\n" );
+        MPI_Abort( MPI_COMM_WORLD, MAX_RETRIES_REACHED );
     }
     
 #endif
@@ -574,31 +555,22 @@ int main( int argc, char* argv[] )
         fprintf( out_control->out, "Total Simulation Time: %.2f secs\n", t_elapsed );
     }
 
-    // Write_PDB( &system, &(lists[BOND]), &out_control );
+//    Write_PDB( &system, &(lists[BOND]), &out_control );
     Close_Output_Files( system, control, out_control, mpi_data );
 
-    //Cleanup_Cuda_Environment ();
-
-    MPI_Finalize();
-
-    /* de-allocate data structures */
-    //for( i = 0; i < LIST_N; ++i ) {
-    //if (lists[i]->index) free (lists[i]->index);
-    //if (lists[i]->end_index) free (lists[i]->end_index);
-    //if (lists[i]->select.v) free (lists[i]->select.v);
-    //free (lists[i] );
-    //}
+    MPI_Finalize( );
 
-    free( system );
-    free( control );
-    free( data );
-    free( workspace );
-    free( lists );
-    free( out_control );
-    free( mpi_data );
+    /* deallocate data structures */
+    sfree( system, "main::system" );
+    sfree( control, "main::control" );
+    sfree( data, "main::data" );
+    sfree( workspace, "main::workspace" );
+    sfree( lists, "main::lists" );
+    sfree( out_control, "main::out_control" );
+    sfree( mpi_data, "main::mpi_data" );
 
 #if defined(TEST_ENERGY) || defined(TEST_FORCES)
-//  Integrate_Results(control);
+//    Integrate_Results(control);
 #endif
 
 #if defined(DEBUG)
diff --git a/PG-PuReMD/src/random.c b/PG-PuReMD/src/random.c
index 8bc540b4d66bf578b51fc3c29b1e216d774f59a7..f91528305a29caf77a2ec143fdca300b9d65ddcd 100644
--- a/PG-PuReMD/src/random.c
+++ b/PG-PuReMD/src/random.c
@@ -21,26 +21,29 @@
 
 #include "random.h"
 
+
 /* System random number generator used linear congruance method with
    large periodicity for generation of pseudo random number. function
    Random returns this random number appropriately scaled so that
    0 <= Random(range) < range */
-double Random(double range)
+double Random( double range )
 {
-    return (random() * range) / 2147483647L;
+    return (random( ) * range) / 2147483647L;
 }
 
+
 /* This function seeds the system pseudo random number generator with
    current time. Use this function once in the begining to initialize
    the system */
-void Randomize()
+void Randomize( )
 {
-    srandom(time(NULL));
+    srandom( time(NULL) );
 }
 
+
 /* GRandom return random number with gaussian distribution with mean
    and standard deviation "sigma" */
-double GRandom(double mean, double sigma)
+double GRandom( double mean, double sigma )
 {
     double v1 = Random(2.0) - 1.0;
     double v2 = Random(2.0) - 1.0;
@@ -53,5 +56,5 @@ double GRandom(double mean, double sigma)
         rsq = v1 * v1 + v2 * v2;
     }
 
-    return mean + v1 * sigma * sqrt(-2.0 * log(rsq) / rsq);
+    return mean + v1 * sigma * SQRT(-2.0 * LOG(rsq) / rsq);
 }
diff --git a/PG-PuReMD/src/random.h b/PG-PuReMD/src/random.h
index a3ce35265758ec136e0994cf439d28a57d068183..d26de8f0086a0b66bdf3b43fe435173aedeb1077 100644
--- a/PG-PuReMD/src/random.h
+++ b/PG-PuReMD/src/random.h
@@ -24,19 +24,29 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* System random number generator used linear congruance method with
    large periodicity for generation of pseudo random number. function
    Random returns this random number appropriately scaled so that
    0 <= Random(range) < range */
-double Random(double);
+double Random( double );
 
 /* This function seeds the system pseudo random number generator with
    current time. Use this function once in the begining to initialize
    the system */
-void Randomize();
+void Randomize( );
 
 /* GRandom return random number with gaussian distribution with mean
    and standard deviation "sigma" */
-double GRandom(double, double);
+double GRandom( double, double );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index 8c353a7ca600b369ad69b2bdabd8c8e883915d21..192cecbee8c6ee10ac91880284d1394dd806bd2e 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -53,9 +53,9 @@
 #include <sys/time.h>
 #include <time.h>
 #include <zlib.h>
-#define HOST_SCRATCH_SIZE (1024 * 1024 * 20)
 #ifdef HAVE_CUDA
   #include <cuda.h>
+  #include <cuda_runtime.h>
 #endif
 
 #if defined(__IBMC__)
@@ -82,15 +82,27 @@
 
 #define EXP    exp
 #define EXP2   exp2
+#define LOG    log
 #define LOG2   log2
 #define SQRT   sqrt
 #define POW    pow
-#define ACOS   acos
 #define COS    cos
+#define ACOS   acos
 #define SIN    sin
 #define TAN    tan
+#define ATAN2  atan2
 #define CEIL   ceil
 #define FLOOR  floor
+#define FABS   fabs
+#define FMOD   fmod
+
+/* transcendental constant pi */
+#if defined(M_PI)
+  /* GNU C library (libc), defined in math.h */
+  #define PI (M_PI)
+#else
+  #define PI            3.14159265
+#endif
 
 #define SQR(x)        ((x)*(x))
 #define CUBE(x)       ((x)*(x)*(x))
@@ -100,26 +112,39 @@
 #define MIN(x,y)      (((x) < (y)) ? (x) : (y))
 #define MAX3(x,y,z)   MAX( MAX(x,y), z)
 
-#define PI            3.14159265
+/* ??? */
 #define C_ele          332.06371
+/* ??? */
 //#define K_B         503.398008   // kcal/mol/K
 #define K_B             0.831687   // amu A^2 / ps^2 / K
+/* ??? */
 #define F_CONV          1e6 / 48.88821291 / 48.88821291   // --> amu A / ps^2
+/**/
 #define E_CONV          0.002391   // amu A^2 / ps^2 --> kcal/mol
-#define EV_to_KCALpMOL 14.400000   // ElectronVolt --> KCAL per MOLe
-#define KCALpMOL_to_EV 23.060549   // 23.020000 //KCAL per MOLe --> ElectronVolt
+/* conversion factor from electron volts to kilo calories per mole  */
+#define EV_to_KCALpMOL 14.400000
+/* conversion factor from kilo calories per mole to electron volts */
+#define KCALpMOL_to_EV 23.060549   // 23.020000
+/* conversion factor from (elemental charge * angstroms) to debye */
 #define ECxA_to_DEBYE   4.803204   // elem. charge * Ang -> debye
-#define CAL_to_JOULES   4.184000   // CALories --> JOULES
-#define JOULES_to_CAL   1/4.184000 // JOULES --> CALories
+/* conversion factor from calories to joules */
+#define CAL_to_JOULES   4.184000
+/* conversion factor from joules to calories */
+#define JOULES_to_CAL   1/4.184000
+/* conversion factor from (unified) atomic mass units to grams */
 #define AMU_to_GRAM     1.6605e-24
+/* conversion factor from angstroms to centimeters */
 #define ANG_to_CM       1e-8
+/* Avogadro's constant */
 #define AVOGNR          6.0221367e23
+/* ??? */
 #define P_CONV          1e-24 * AVOGNR * JOULES_to_CAL
 
 #define MAX_STR             1024
 #define MAX_LINE            1024
 #define MAX_TOKENS          1024
 #define MAX_TOKEN_LEN       1024
+#define MAX_ATOM_NAME_LEN   8
 
 #define MAX_ATOM_ID         100000
 #define MAX_RESTRICT        15
@@ -134,7 +159,7 @@
 
 #define MIN_CAP        50
 #define MIN_NBRS       100
-#define MIN_HENTRIES   100
+#define MIN_CM_ENTRIES 100
 #define MAX_BONDS      30
 #define MIN_BONDS      15
 #define MIN_HBONDS     25
@@ -155,369 +180,656 @@
 
 #define MASTER_NODE 0
 #define MAX_NBRS 6 //27
-#define MYSELF   13  // encoding of relative coordinate (0,0,0)
+/* encoding of relative coordinate (0,0,0) */
+#define MYSELF 13
 
 #define MAX_ITR 10
 #define RESTART 30
 
+#define MAX_RETRIES 20
+
 /* NaN IEEE 754 representation for C99 in math.h
  * Note: function choice must match REAL typedef below */
-#ifdef NAN
-#define IS_NAN_REAL(a) (isnan(a))
+#if defined(NAN)
+  #define IS_NAN_REAL(a) (isnan(a))
 #else
-#warn "No support for NaN"
-#define NAN_REAL(a) (0)
+  #warning "No support for NaN"
+  #define IS_NAN_REAL(a) (0)
 #endif
 
 /**************** RESOURCE CONSTANTS **********************/
+/* 500 MB */
+#define HOST_SCRATCH_SIZE               (1024 * 1024 * 500)
 #ifdef HAVE_CUDA
-//#define           CUDA_BLOCK_SIZE             256
-#define         SCRATCH_SIZE                    (1024 * 1024 * 20)
-#define         HOST_SCRATCH_SIZE               (1024 * 1024 * 20)
-#define         RES_SCRATCH                     0x90
+/* 500 MB */
+#define DEVICE_SCRATCH_SIZE             (1024 * 1024 * 500)
+/* ??? scratch resource tag (NOTE: 0x90 is not a size in bytes; confirm usage) */
+#define RES_SCRATCH                     0x90
 
 /* BLOCK SIZES for kernels */
-#define             HB_SYM_BLOCK_SIZE                   64
-#define             HB_KER_SYM_THREADS_PER_ATOM         16
-#define             HB_POST_PROC_BLOCK_SIZE             256
-#define             HB_POST_PROC_KER_THREADS_PER_ATOM   32
+#define HB_SYM_BLOCK_SIZE                   64
+#define HB_KER_SYM_THREADS_PER_ATOM         16
+#define HB_POST_PROC_BLOCK_SIZE             256
+#define HB_POST_PROC_KER_THREADS_PER_ATOM   32
 
 #if defined( __INIT_BLOCK_SIZE__)
-#define             DEF_BLOCK_SIZE                      __INIT_BLOCK_SIZE__    /* all utility functions and all */
-#define             CUDA_BLOCK_SIZE                     __INIT_BLOCK_SIZE__     /* init forces */
-#define             ST_BLOCK_SIZE                       __INIT_BLOCK_SIZE__
+  #define DEF_BLOCK_SIZE                      __INIT_BLOCK_SIZE__    /* all utility functions and all */
+  #define CUDA_BLOCK_SIZE                     __INIT_BLOCK_SIZE__     /* init forces */
+  #define ST_BLOCK_SIZE                       __INIT_BLOCK_SIZE__
 #else
-#define             DEF_BLOCK_SIZE                      256                     /* all utility functions and all */
-#define             CUDA_BLOCK_SIZE                     256                     /* init forces */
-#define             ST_BLOCK_SIZE                       256
+  #define DEF_BLOCK_SIZE                      256                     /* all utility functions and all */
+  #define CUDA_BLOCK_SIZE                     256                     /* init forces */
+  #define ST_BLOCK_SIZE                       256
 #endif
 
 #if defined( __NBRS_THREADS_PER_ATOM__ )
-#define             NB_KER_THREADS_PER_ATOM             __NBRS_THREADS_PER_ATOM__
+  #define NB_KER_THREADS_PER_ATOM             __NBRS_THREADS_PER_ATOM__
 #else
-#define             NB_KER_THREADS_PER_ATOM             16
+  #define NB_KER_THREADS_PER_ATOM             16
 #endif
 
 #if defined( __NBRS_BLOCK_SIZE__)
-#define             NBRS_BLOCK_SIZE                     __NBRS_BLOCK_SIZE__
+  #define NBRS_BLOCK_SIZE                     __NBRS_BLOCK_SIZE__
 #else
-#define             NBRS_BLOCK_SIZE                     256
+  #define NBRS_BLOCK_SIZE                     256
 #endif
 
 #if defined( __HB_THREADS_PER_ATOM__)
-#define             HB_KER_THREADS_PER_ATOM             __HB_THREADS_PER_ATOM__
+  #define HB_KER_THREADS_PER_ATOM             __HB_THREADS_PER_ATOM__
 #else
-#define             HB_KER_THREADS_PER_ATOM             32
+  #define HB_KER_THREADS_PER_ATOM             32
 #endif
 
 #if defined(__HB_BLOCK_SIZE__)
-#define             HB_BLOCK_SIZE                   __HB_BLOCK_SIZE__
+  #define HB_BLOCK_SIZE                   __HB_BLOCK_SIZE__
 #else
-#define             HB_BLOCK_SIZE                       256
+  #define HB_BLOCK_SIZE                       256
 #endif
 
 #if defined( __VDW_THREADS_PER_ATOM__ )
-#define             VDW_KER_THREADS_PER_ATOM            __VDW_THREADS_PER_ATOM__
+  #define VDW_KER_THREADS_PER_ATOM            __VDW_THREADS_PER_ATOM__
 #else
-#define             VDW_KER_THREADS_PER_ATOM            32
+  #define VDW_KER_THREADS_PER_ATOM            32
 #endif
 
 #if defined( __VDW_BLOCK_SIZE__)
-#define             VDW_BLOCK_SIZE                      __VDW_BLOCK_SIZE__
+  #define VDW_BLOCK_SIZE                      __VDW_BLOCK_SIZE__
 #else
-#define             VDW_BLOCK_SIZE                      256
+  #define VDW_BLOCK_SIZE                      256
 #endif
 
 #if defined( __MATVEC_THREADS_PER_ROW__ )
-#define             MATVEC_KER_THREADS_PER_ROW      __MATVEC_THREADS_PER_ROW__
+  #define MATVEC_KER_THREADS_PER_ROW      __MATVEC_THREADS_PER_ROW__
 #else
-#define             MATVEC_KER_THREADS_PER_ROW      32
+  #define MATVEC_KER_THREADS_PER_ROW      32
 #endif
 
 #if defined( __MATVEC_BLOCK_SIZE__)
-#define             MATVEC_BLOCK_SIZE                   __MATVEC_BLOCK_SIZE__
+  #define MATVEC_BLOCK_SIZE                   __MATVEC_BLOCK_SIZE__
 #else
-#define             MATVEC_BLOCK_SIZE                   512
+  #define MATVEC_BLOCK_SIZE                   512
 #endif
 
 //Validation
-#define             GPU_TOLERANCE               1e-5
+#define GPU_TOLERANCE               1e-5
 
 #endif
 
 
-
 /******************* ENUMERATIONS *************************/
-enum geo_formats { CUSTOM = 0, PDB = 1, ASCII_RESTART = 2, BINARY_RESTART = 3, GF_N = 4 };
+/* ensemble type */
+enum ensembles
+{
+    NVE = 0,
+    bNVT = 1,
+    nhNVT = 2,
+    sNPT = 3,
+    iNPT = 4,
+    NPT = 5,
+    ens_N = 6,
+};
 
-enum restart_formats { WRITE_ASCII = 0, WRITE_BINARY = 1, RF_N = 2 };
+/* interaction list type */
+enum lists
+{
+    BONDS = 0,
+    OLD_BONDS = 1,
+    THREE_BODIES = 2,
+    HBONDS = 3,
+    FAR_NBRS = 4,
+    DBOS = 5,
+    DDELTAS = 6,
+    LIST_N = 7,
+};
 
-enum ensembles { NVE = 0, bNVT = 1, nhNVT = 2, sNPT = 3, iNPT = 4, NPT = 5, ens_N = 6 };
+/* interaction type */
+enum interactions
+{
+    TYP_VOID = 0,
+    TYP_BOND = 1,
+    TYP_THREE_BODY = 2,
+    TYP_HBOND = 3,
+    TYP_FAR_NEIGHBOR = 4,
+    TYP_DBO = 5,
+    TYP_DDELTA = 6,
+    TYP_N = 7,
+};
 
-enum lists { BONDS = 0, OLD_BONDS = 1, THREE_BODIES = 2,
-             HBONDS = 3, FAR_NBRS = 4, DBOS = 5, DDELTAS = 6, LIST_N = 7
-           };
+/* MPI message tags */
+enum message_tags
+{
+    INIT = 0,
+    UPDATE = 1,
+    BNDRY = 2,
+    UPDATE_BNDRY = 3,
+    EXC_VEC1 = 4,
+    EXC_VEC2 = 5,
+    DIST_RVEC2 = 6,
+    COLL_RVEC2 = 7,
+    DIST_RVECS = 8,
+    COLL_RVECS = 9,
+    INIT_DESCS = 10,
+    ATOM_LINES = 11,
+    BOND_LINES = 12,
+    ANGLE_LINES = 13,
+    RESTART_ATOMS = 14,
+    TAGS_N = 15,
+};
 
-enum interactions { TYP_VOID = 0, TYP_BOND = 1, TYP_THREE_BODY = 2,
-                    TYP_HBOND = 3, TYP_FAR_NEIGHBOR = 4, TYP_DBO = 5, TYP_DDELTA = 6, TYP_N = 7
-                  };
+/* error codes for simulation termination */
+enum errors
+{
+    FILE_NOT_FOUND = -10,
+    UNKNOWN_ATOM_TYPE = -11,
+    CANNOT_OPEN_FILE = -12,
+    CANNOT_INITIALIZE = -13,
+    INSUFFICIENT_MEMORY = -14,
+    UNKNOWN_OPTION = -15,
+    INVALID_INPUT = -16,
+    INVALID_GEO = -17,
+    MAX_RETRIES_REACHED = -18,
+    RUNTIME_ERROR = -19,
+};
 
-enum message_tags { INIT = 0, UPDATE = 1, BNDRY = 2, UPDATE_BNDRY = 3,
-                    EXC_VEC1 = 4, EXC_VEC2 = 5, DIST_RVEC2 = 6, COLL_RVEC2 = 7,
-                    DIST_RVECS = 8, COLL_RVECS = 9, INIT_DESCS = 10, ATOM_LINES = 11,
-                    BOND_LINES = 12, ANGLE_LINES = 13, RESTART_ATOMS = 14, TAGS_N = 15
-                  };
+/* restart file format */
+enum restart_formats
+{
+    WRITE_ASCII = 0,
+    WRITE_BINARY = 1,
+    RF_N = 2,
+};
 
-enum errors { FILE_NOT_FOUND = -10, UNKNOWN_ATOM_TYPE = -11,
-              CANNOT_OPEN_FILE = -12, CANNOT_INITIALIZE = -13,
-              INSUFFICIENT_MEMORY = -14, UNKNOWN_OPTION = -15,
-              INVALID_INPUT = -16, INVALID_GEO = -17
-            };
+/* geometry file format */
+enum geo_formats
+{
+    CUSTOM = 0,
+    PDB = 1,
+    ASCII_RESTART = 2,
+    BINARY_RESTART = 3,
+    GF_N = 4,
+};
 
-enum exchanges { NONE = 0, NEAR_EXCH = 1, FULL_EXCH = 2 };
+enum charge_method
+{
+    QEQ_CM = 0,
+    EE_CM = 1,
+    ACKS2_CM = 2,
+};
+
+enum solver
+{
+    GMRES_S = 0,
+    GMRES_H_S = 1,
+    CG_S = 2,
+    SDM_S = 3,
+};
 
-enum gcell_types { NO_NBRS = 0, NEAR_ONLY = 1, HBOND_ONLY = 2, FAR_ONLY = 4,
-                   NEAR_HBOND = 3, NEAR_FAR = 5, HBOND_FAR = 6, FULL_NBRS = 7,
-                   NATIVE = 8
-                 };
+enum pre_comp
+{
+    NONE_PC = 0,
+    DIAG_PC = 1,
+    ICHOLT_PC = 2,
+    ILU_PAR_PC = 3,
+    ILUT_PAR_PC = 4,
+    ILU_SUPERLU_MT_PC = 5,
+};
+
+enum pre_app
+{
+    TRI_SOLVE_PA = 0,
+    TRI_SOLVE_LEVEL_SCHED_PA = 1,
+    TRI_SOLVE_GC_PA = 2,
+    JACOBI_ITER_PA = 3,
+};
 
-enum atoms { C_ATOM = 0, H_ATOM = 1, O_ATOM = 2, N_ATOM = 3,
-             S_ATOM = 4, SI_ATOM = 5, GE_ATOM = 6, X_ATOM = 7
-           };
+/* ??? */
+enum exchanges
+{
+    NONE = 0,
+    NEAR_EXCH = 1,
+    FULL_EXCH = 2,
+};
 
-enum traj_methods { REG_TRAJ = 0, MPI_TRAJ = 1, TF_N = 2 };
+/* grid cell neighbor types; bit flags (NEAR = 1, HBOND = 2, FAR = 4) and their bitwise ORs */
+enum gcell_types
+{
+    NO_NBRS = 0,
+    NEAR_ONLY = 1,
+    HBOND_ONLY = 2,
+    FAR_ONLY = 4,
+    NEAR_HBOND = 3,
+    NEAR_FAR = 5,
+    HBOND_FAR = 6,
+    FULL_NBRS = 7,
+    NATIVE = 8,
+};
 
-enum molecules { UNKNOWN = 0, WATER = 1 };
+/* atom types as pertains to hydrogen bonding (same encoding as p_hbond: 0 = others, 1 = H, 2 = H-bonding) */
+enum hydrogen_bonding_atom_types
+{
+    NON_H_BONDING_ATOM = 0,
+    H_ATOM = 1,
+    H_BONDING_ATOM = 2,
+};
 
-enum list_on { TYP_HOST = 0, TYP_DEVICE = 1 };
+/* trajectory file formats */
+enum traj_methods
+{
+    REG_TRAJ = 0,
+    MPI_TRAJ = 1,
+    TF_N = 2,
+};
 
+/* ??? */
+enum molecules
+{
+    UNKNOWN = 0,
+    WATER = 1,
+};
 
 
 /********************** TYPE DEFINITIONS ********************/
-typedef int  ivec[3];
+/* 3D vector, integer values */
+typedef int ivec[3];
+/* double precision floating point */
 typedef double real;
+/* 3D vector, double precision floating point values */
 typedef real rvec[3];
+/* 3D tensor, double precision floating point values */
 typedef real rtensor[3][3];
+/* 2D vector, double precision floating point values */
 typedef real rvec2[2];
+/* 4D vector, double precision floating point values */
 typedef real rvec4[4];
 
 
+/* header used in restart file */
 typedef struct
 {
-    int step, bigN;
-    real T, xi, v_xi, v_xi_old, G_xi;
+    /* current simulation time step */
+    int step;
+    /* total num. atoms in simulation */
+    int bigN;
+    /* thermostat temperature */
+    real T;
+    /* thermostat ??? */
+    real xi;
+    /* thermostat ??? */
+    real v_xi;
+    /* thermostat ??? */
+    real v_xi_old;
+    /* thermostat ??? */
+    real G_xi;
+    /* ??? */
     rtensor box;
 } restart_header;
 
+
+/* atom type used for restarting simulation */
 typedef struct
 {
-    int orig_id, type;
-    char name[8];
-    rvec x, v;
+    /* atom serial number as given in the geo file */
+    int orig_id;
+    /* non-negative integer used to indicate atom type,
+     * as identified by short element string in force field file (single
+     * body parameters section) */
+    int type;
+    /* atom name as given in the geo file */
+    char name[MAX_ATOM_NAME_LEN];
+    /* atomic position, 3D */
+    rvec x;
+    /* atomic velocity, 3D */
+    rvec v;
 } restart_atom;
 
+
+/* atom type used for MPI communications */
 typedef struct
 {
-    int  orig_id;
-    int  imprt_id;
-    int  type;
-    int  num_bonds;
-    int  num_hbonds;
-    //int  pad;  // pad to 8-byte address boundary
-    char name[8];
-    rvec x;     // position
-    rvec v;     // velocity
-    rvec f_old; // old force
-    rvec4 s, t;  // for calculating q
+    /* atom serial number as given in the geo file */
+    int orig_id;
+    /* local atom ID on neighbor processor ??? */
+    int imprt_id;
+    /* non-negative integer used to indicate atom type,
+     * as identified by short element string in force field file (single
+     * body parameters section) */
+    int type;
+    /* num. bonds associated with atom */
+    int num_bonds;
+    /* num. hydrogen bonds associated with atom */
+    int num_hbonds;
+    /* pad to 8-byte address boundary */
+    //int  pad;
+    /* atom name as given in the geo file */
+    char name[MAX_ATOM_NAME_LEN];
+    /* atomic position, 3D */
+    rvec x;
+    /* atomic velocity, 3D */
+    rvec v;
+    /* net force acting upon atom in previous time step, 3D */
+    rvec f_old;
+    /* atomic fictitious charge used during QEq to compute atomic charge,
+     * multiple entries used to hold old values for extrapolation */
+    rvec4 s;
+    /* atomic fictitious charge used during QEq to compute atomic charge,
+     * multiple entries used to hold old values for extrapolation */
+    rvec4 t;
 } mpi_atom;
 
 
+/* atom type used for MPI communications at boundary regions */
 typedef struct
 {
-    int  orig_id;
-    int  imprt_id;
-    int  type;
-    int  num_bonds;
-    int  num_hbonds;
-    //int  pad;
-    rvec x;     // position
+    /* atom serial number as given in the geo file */
+    int orig_id;
+    /* local atom ID on neighbor processor ??? */
+    int imprt_id;
+    /* non-negative integer used to indicate atom type,
+     * as identified by short element string in force field file (single
+     * body parameters section) */
+    int type;
+    /* num. bonds associated with atom */
+    int num_bonds;
+    /* num. hydrogen bonds associated with atom */
+    int num_hbonds;
+    /* pad to 8-byte address boundary */
+    //int pad;
+    /* atomic position, 3D */
+    rvec x;
 } boundary_atom;
 
 
+/**/
 typedef struct
 {
-    //int  ncells;
-    //int *cnt_by_gcell;
-
-    int  cnt;
-    //int *block;
+    /**/
+    int cnt;
+    /**/
     int *index;
-    //MPI_Datatype out_dtype;
+    /**/
     void *out_atoms;
 } mpi_out_data;
 
 
+/**/
 typedef struct
 {
-    MPI_Comm     world;
-    MPI_Comm     comm_mesh3D;
+    /**/
+    MPI_Comm world;
+    /**/
+    MPI_Comm comm_mesh3D;
 
+    /**/
     MPI_Datatype sys_info;
+    /**/
     MPI_Datatype mpi_atom_type;
+    /**/
     MPI_Datatype boundary_atom_type;
-    MPI_Datatype mpi_rvec, mpi_rvec2;
+    /**/
+    MPI_Datatype mpi_rvec;
+    /**/
+    MPI_Datatype mpi_rvec2;
+    /**/
     MPI_Datatype restart_atom_type;
 
+    /**/
     MPI_Datatype header_line;
+    /**/
     MPI_Datatype header_view;
+    /**/
     MPI_Datatype init_desc_line;
+    /**/
     MPI_Datatype init_desc_view;
+    /**/
     MPI_Datatype atom_line;
+    /**/
     MPI_Datatype atom_view;
+    /**/
     MPI_Datatype bond_line;
+    /**/
     MPI_Datatype bond_view;
+    /**/
     MPI_Datatype angle_line;
+    /**/
     MPI_Datatype angle_view;
 
-    //MPI_Request  send_req1[MAX_NBRS];
-    //MPI_Request  send_req2[MAX_NBRS];
-    //MPI_Status   send_stat1[MAX_NBRS];
-    //MPI_Status   send_stat2[MAX_NBRS];
-    //MPI_Status   recv_stat1[MAX_NBRS];
-    //MPI_Status   recv_stat2[MAX_NBRS];
-
+    /**/
     mpi_out_data out_buffers[MAX_NBRS];
+    /**/
     void *in1_buffer;
+    /**/
     void *in2_buffer;
 } mpi_datatypes;
 
 
-/* Global params mapping */
-/*
-l[0]  = p_boc1
-l[1]  = p_boc2
-l[2]  = p_coa2
-l[3]  = N/A
-l[4]  = N/A
-l[5]  = N/A
-l[6]  = p_ovun6
-l[7]  = N/A
-l[8]  = p_ovun7
-l[9]  = p_ovun8
-l[10] = N/A
-l[11] = swa
-l[12] = swb
-l[13] = N/A
-l[14] = p_val6
-l[15] = p_lp1
-l[16] = p_val9
-l[17] = p_val10
-l[18] = N/A
-l[19] = p_pen2
-l[20] = p_pen3
-l[21] = p_pen4
-l[22] = N/A
-l[23] = p_tor2
-l[24] = p_tor3
-l[25] = p_tor4
-l[26] = N/A
-l[27] = p_cot2
-l[28] = p_vdW1
-l[29] = v_par30
-l[30] = p_coa4
-l[31] = p_ovun4
-l[32] = p_ovun3
-l[33] = p_val8
-l[34] = N/A
-l[35] = N/A
-l[36] = N/A
-l[37] = version number
-l[38] = p_coa3
-*/
-
+/* Global parameters in force field parameters file, mapping:
+ *
+ * l[0]  = p_boc1
+ * l[1]  = p_boc2
+ * l[2]  = p_coa2
+ * l[3]  = N/A
+ * l[4]  = N/A
+ * l[5]  = N/A
+ * l[6]  = p_ovun6
+ * l[7]  = N/A
+ * l[8]  = p_ovun7
+ * l[9]  = p_ovun8
+ * l[10] = N/A
+ * l[11] = swa
+ * l[12] = swb
+ * l[13] = N/A
+ * l[14] = p_val6
+ * l[15] = p_lp1
+ * l[16] = p_val9
+ * l[17] = p_val10
+ * l[18] = N/A
+ * l[19] = p_pen2
+ * l[20] = p_pen3
+ * l[21] = p_pen4
+ * l[22] = N/A
+ * l[23] = p_tor2
+ * l[24] = p_tor3
+ * l[25] = p_tor4
+ * l[26] = N/A
+ * l[27] = p_cot2
+ * l[28] = p_vdW1
+ * l[29] = v_par30
+ * l[30] = p_coa4
+ * l[31] = p_ovun4
+ * l[32] = p_ovun3
+ * l[33] = p_val8
+ * l[34] = N/A
+ * l[35] = N/A
+ * l[36] = N/A
+ * l[37] = version number
+ * l[38] = p_coa3
+ * */
 typedef struct
 {
+    /* num. of global parameters, from the force field file */
     int n_global;
+    /* global parameters, see above mapping */
     real* l;
+    /* van der Waals interaction type, values:
+     * 0: none (???)
+     * 1: shielded Morse, no inner-wall
+     * 2: inner wall, no shielding
+     * 3: inner wall + shielding
+     * */
     int vdw_type;
 } global_parameters;
 
 
-
+/* single body parameters in force field parameters file */
 typedef struct
 {
     /* Line one in field file */
-    char name[15]; // Two character atom name
-
+    /* two character atom name */
+    char name[15];
+    /**/
     real r_s;
-    real valency;  // Valency of the atom
-    real mass;     // Mass of atom
+    /* valency of the atom */
+    real valency;
+    /* mass of atom */
+    real mass;
+    /**/
     real r_vdw;
+    /**/
     real epsilon;
+    /**/
     real gamma;
+    /**/
     real r_pi;
+    /**/
     real valency_e;
+    /**/
     real nlp_opt;
 
     /* Line two in field file */
+    /**/
     real alpha;
+    /**/
     real gamma_w;
+    /**/
     real valency_boc;
+    /**/
     real p_ovun5;
+    /**/
     real chi;
+    /**/
     real eta;
-    int  p_hbond; // 1 for H, 2 for hbonding atoms (O,S,P,N), 0 for others
+    /* info related to hydrogen bonding
+     * (values correspond to hydrogen_bonding_atom_types enum above):
+     *  0: non-hydrogen bonding atom
+     *  1: H atom
+     *  2: hydrogen bonding atom (e.g., O, S, P, N) */
+    int p_hbond;
 
     /* Line three in field file */
+    /**/
     real r_pi_pi;
+    /**/
     real p_lp2;
+    /**/
     real b_o_131;
+    /**/
     real b_o_132;
+    /**/
     real b_o_133;
 
     /* Line four in the field file */
+    /**/
     real p_ovun2;
+    /**/
     real p_val3;
+    /**/
     real valency_val;
+    /**/
     real p_val5;
+    /**/
     real rcore2;
+    /**/
     real ecore2;
+    /**/
     real acore2;
 } single_body_parameters;
 
 
-
-/* Two Body Parameters */
+/* 2-body parameters for a single interaction type,
+ * from the force field parameters file */
 typedef struct
 {
     /* Bond Order parameters */
-    real p_bo1, p_bo2, p_bo3, p_bo4, p_bo5, p_bo6;
-    real r_s, r_p, r_pp;  // r_o distances in BO formula
-    real p_boc3, p_boc4, p_boc5;
+    /**/
+    real p_bo1;
+    /**/
+    real p_bo2;
+    /**/
+    real p_bo3;
+    /**/
+    real p_bo4;
+    /**/
+    real p_bo5;
+    /**/
+    real p_bo6;
+    /**/
+    real r_s;
+    /**/
+    real r_p;
+    /**/
+    real r_pp;  // r_o distances in BO formula
+    /**/
+    real p_boc3;
+    /**/
+    real p_boc4;
+    /**/
+    real p_boc5;
 
     /* Bond Energy parameters */
-    real p_be1, p_be2;
-    real De_s, De_p, De_pp;
+    /**/
+    real p_be1;
+    /**/
+    real p_be2;
+    /**/
+    real De_s;
+    /**/
+    real De_p;
+    /**/
+    real De_pp;
 
     /* Over/Under coordination parameters */
+    /**/
     real p_ovun1;
 
     /* Van der Waal interaction parameters */
+    /**/
     real D;
+    /**/
     real alpha;
+    /**/
     real r_vdW;
+    /**/
     real gamma_w;
-    real rcore, ecore, acore;
-
-    /* electrostatic parameters */
-    real gamma; // note: this parameter is gamma^-3 and not gamma.
+    /**/
+    real rcore;
+    /**/
+    real ecore;
+    /**/
+    real acore;
+
+    /* electrostatic parameters,
+     * note: this parameter is gamma^-3 and not gamma */
+    real gamma;
 
-    real v13cor, ovc;
+    /**/
+    real v13cor;
+    /**/
+    real ovc;
 } two_body_parameters;
 
 
-
-/* 3-body parameters */
+/* 3-body parameters for a single interaction type,
+ * from the force field parameters file */
 typedef struct
 {
     /* valence angle */
@@ -532,26 +844,40 @@ typedef struct
 } three_body_parameters;
 
 
+/* three body interactions info. */
 typedef struct
 {
+    /* num. of three body parameters */
     int cnt;
+    /* collection of three body parameters, indexed by atomic types */
     three_body_parameters prm[MAX_3BODY_PARAM];
 } three_body_header;
 
 
-
-/* hydrogen-bond parameters */
+/* hydrogen bond parameters in force field parameters file */
 typedef struct
 {
-    real r0_hb, p_hb1, p_hb2, p_hb3;
+    /**/
+    real r0_hb;
+    /**/
+    real p_hb1;
+    /**/
+    real p_hb2;
+    /**/
+    real p_hb3;
 } hbond_parameters;
 
 
-
-/* 4-body parameters */
+/* 4-body parameters for a single interaction type,
+ * from the force field parameters file */
 typedef struct
 {
-    real V1, V2, V3;
+    /**/
+    real V1;
+    /**/
+    real V2;
+    /**/
+    real V3;
 
     /* torsion angle */
     real p_tor1;
@@ -561,344 +887,566 @@ typedef struct
 } four_body_parameters;
 
 
+/* four body interactions info. */
 typedef struct
 {
+    /* num. of four body parameters */
     int cnt;
+    /* collection of four body parameters, indexed by atomic types */
     four_body_parameters prm[MAX_4BODY_PARAM];
 } four_body_header;
 
 
+/* atomic interaction parameters */
 typedef struct
 {
+    /* num. of atom types, from force field parameters file */
     int num_atom_types;
 
-#ifndef HAVE_CUDA
-/*
-    global_parameters gp;
-    single_body_parameters *sbp;
-    two_body_parameters **tbp;
-    three_body_header ***thbp;
-    hbond_parameters ***hbp;
-    four_body_header ****fbp;*/
-
+    /* global simulation parameters, from force field parameters file */
     global_parameters gp;
+    /* simulation parameters for single body interactions */
     single_body_parameters *sbp;
+    /* simulation parameters for two body interactions */
     two_body_parameters *tbp; 
+    /* simulation parameters for three body interactions */
     three_body_header *thbp; 
+    /* simulation parameters for hydrogen bonding interactions */
     hbond_parameters *hbp; 
+    /* simulation parameters for four body interactions */
     four_body_header *fbp; 
 
-#else
-    global_parameters gp;
+#ifdef HAVE_CUDA
+    /* global simulation parameters (GPU), from force field parameters file */
     global_parameters d_gp;
-
-    single_body_parameters *sbp;
+    /* simulation parameters for single body interactions (GPU) */
     single_body_parameters *d_sbp;
-
-    two_body_parameters *tbp; //changed
-    two_body_parameters *d_tbp; //changed
-
-    three_body_header *thbp; //changed
-    three_body_header *d_thbp; //changed
-
-    hbond_parameters *hbp; //changed
-    hbond_parameters *d_hbp; //changed
-
-    four_body_header *fbp; //changed
-    four_body_header *d_fbp; //changed
+    /* simulation parameters for two body interactions (GPU) */
+    two_body_parameters *d_tbp;
+    /* simulation parameters for three body interactions (GPU) */
+    three_body_header *d_thbp;
+    /* simulation parameters for hydrogen bonding interactions (GPU) */
+    hbond_parameters *d_hbp;
+    /* simulation parameters for four body interactions (GPU) */
+    four_body_header *d_fbp;
 #endif
 } reax_interaction;
 
 
-
+/**/
 typedef struct
 {
-    int  orig_id;
-    int  imprt_id;
-    int  type;
-    char name[8];
-
-    rvec x; // position
-    rvec v; // velocity
-    rvec f; // force
+    /* atom serial number as given in the geo file */
+    int orig_id;
+    /* local atom ID on neighbor processor ??? */
+    int imprt_id;
+    /* non-negative integer used to indicate atom type,
+     * as identified by short element string in force field file (single
+     * body parameters section) */
+    int type;
+    /* atom name as given in the geo file */
+    char name[MAX_ATOM_NAME_LEN];
+
+    /* atomic position, 3D */
+    rvec x;
+    /* atomic velocity, 3D */
+    rvec v;
+    /* net force acting upon atom, 3D */
+    rvec f;
+    /* net force acting upon atom in previous time step, 3D */
     rvec f_old;
 
-    real q; // charge
-    rvec4 s; // they take part in
-    rvec4 t; // computing q
-
+    /* atomic charge, computed during coulombic interaction */
+    real q;
+    /* atomic fictitious charge used during QEq to compute atomic charge,
+     * multiple entries used to hold old values for extrapolation */
+    rvec4 s;
+    /* atomic fictitious charge used during QEq to compute atomic charge,
+     * multiple entries used to hold old values for extrapolation */
+    rvec4 t;
+
+    /* unique non-negative integer index of atom if it is a hydrogen atom,
+     * -1 otherwise */
     int Hindex;
+    /* num. bonds associated with atom */
     int num_bonds;
+    /* num. hydrogen bonds associated with atom */
     int num_hbonds;
+    /* ??? */
     int renumber;
 } reax_atom;
 
 
-
+/* Info. regarding 3D simulation space */
 typedef struct
 {
+    /* total volume */
     real V;
-    rvec min, max, box_norms;
-
-    rtensor box, box_inv;
-    rtensor trans, trans_inv;
+    /* min. coordinate of box in Angstroms, 3D */
+    rvec min;
+    /* max. coordinate of box in Angstroms, 3D */
+    rvec max;
+    /* length of each dimension of the simulation box in Angstroms, 3D */
+    rvec box_norms;
+
+    /* ??? */
+    rtensor box;
+    /* ??? */
+    rtensor box_inv;
+    /* ??? */
+    rtensor trans;
+    /* ??? */
+    rtensor trans_inv;
+    /* ??? */
     rtensor g;
 } simulation_box;
 
 
-
-struct grid_cell
+/**/
+typedef struct
 {
-#ifndef HAVE_CUDA
-/*
-    real cutoff;
-    rvec min, max;
-    ivec rel_box;
-
-    int  mark;
-    int  type;
-    int  str;
-    int  end;
-    int  top;
-    int* atoms;
-    struct grid_cell** nbrs;
-    ivec* nbrs_x;
-    rvec* nbrs_cp;
-*/
-
-   //real cutoff;
-   rvec min, max;
-   //ivec rel_box;
-
-   int  mark;
-   int  type;
-   //int  str;
-   //int  end;
-   int  top;
-   int* atoms;
-   //struct grid_cell** nbrs; //changed
-   //ivec* nbrs_x;
-   //rvec* nbrs_cp;
-
-
-#else
-    //real cutoff;
-    rvec min, max;
-    //ivec rel_box;
-
-    int  mark;
-    int  type;
-    //int  str;
-    //int  end;
-    int  top;
+    /* min. cell coordinate (top-left) */
+    rvec min;
+    /* max. cell coordinate (bottom-right) */
+    rvec max;
+ 
+    /* ??? */
+    int mark;
+    /* native or ghost cells (contains atoms only of resp. type) */
+    int type;
+    /* count of num. of atoms currently within this grid cell */
+    int top;
+    /* IDs of atoms within this grid cell */
     int* atoms;
-    //struct grid_cell** nbrs; //changed
-    //ivec* nbrs_x;
-    //rvec* nbrs_cp;
-#endif
-};
-
-typedef struct grid_cell grid_cell;
+} grid_cell;
 
 
+/* info. for 3D domain (i.e., spatial) partitioning of atoms
+ * inside the simulation box */
 typedef struct
 {
-    int  total, max_atoms, max_nbrs;
+    /* total number of grid cells (native AND ghost) */
+    int total;
+    /* max. num. of atoms which a grid cell can contain */
+    int max_atoms;
+    /**/
+    int max_nbrs;
+    /* num. of grid cells in each dimension, 3D */
     ivec ncells;
+    /* lengths of each grid cell dimension, 3D */
     rvec cell_len;
+    /* multiplicative inverses of lengths of each grid cell dimension, 3D */
     rvec inv_len;
 
+    /* bond interaction cutoff in terms of num. of grid cells in each dimension, 3D */
     ivec bond_span;
+    /* non-bonded interaction cutoff in terms of num. of grid cells in each dimension, 3D */
     ivec nonb_span;
+    /* Verlet list (i.e., neighbor list) cutoff in terms of num. of grid cells in each dimension, 3D */
     ivec vlist_span;
 
+    /* partitioning of ??? */
     ivec native_cells;
+    /**/
     ivec native_str;
+    /**/
     ivec native_end;
 
+    /**/
     real ghost_cut;
+    /**/
     ivec ghost_span;
+    /**/
     ivec ghost_nonb_span;
+    /**/
     ivec ghost_hbond_span;
+    /**/
     ivec ghost_bond_span;
 
-#ifndef HAVE_CUDA
-//    grid_cell*** cells;
-//    ivec *order;
-   
-   grid_cell* cells;
-   ivec *order;
-
-   int *str;
-   int *end;
-   real *cutoff;
-   ivec *nbrs_x;
-   rvec *nbrs_cp;
-
-   ivec *rel_box;
-
-
-
-#else
-    grid_cell* cells; //changed
+    /**/
+    grid_cell* cells;
+    /**/
     ivec *order;
-
-    //GRID
+ 
+    /**/
     int *str;
+    /**/
     int *end;
+    /**/
     real *cutoff;
+    /* rel. positions of cells which fall within neighbor cut-off of a given cell */
     ivec *nbrs_x;
+    /* corner points of cells which fall within neighbor cut-off of a given cell */
     rvec *nbrs_cp;
-
+ 
+    /**/
     ivec *rel_box;
-#endif
 } grid;
 
 
+/**/
 typedef struct
 {
-    int  rank;
-    int  est_send, est_recv;
-    int  atoms_str, atoms_cnt;
-    ivec rltv, prdc;
-    rvec bndry_min, bndry_max;
-
+    /**/
+    int rank;
+    /**/
+    int est_send;
+    /**/
+    int est_recv;
+    /**/
+    int atoms_str;
+    /**/
+    int atoms_cnt;
+    /**/
+    ivec rltv;
+    /**/
+    ivec prdc;
+    /**/
+    rvec bndry_min;
+    /**/
+    rvec bndry_max;
+
+    /**/
     int  send_type;
+    /**/
     int  recv_type;
+    /**/
     ivec str_send;
+    /**/
     ivec end_send;
+    /**/
     ivec str_recv;
+    /**/
     ivec end_recv;
 } neighbor_proc;
 
 
-
+/**/
 typedef struct
 {
+    /**/
     int N;
+    /**/
     int exc_gcells;
+    /**/
     int exc_atoms;
 } bound_estimate;
 
 
-
+/**/
 typedef struct
 {
+    /**/
     real ghost_nonb;
+    /**/
     real ghost_hbond;
+    /**/
     real ghost_bond;
+    /**/
     real ghost_cutoff;
 } boundary_cutoff;
 
 
-
+/**/
 typedef struct
 {
+    /* atomic interaction parameters */
     reax_interaction reax_param;
 
-    int              n, N, bigN, numH;
-    int              local_cap, total_cap, gcell_cap, Hcap;
-    int              est_recv, est_trans, max_recved;
-    int              wsize, my_rank, num_nbrs;
-    ivec             my_coords;
-    neighbor_proc    my_nbrs[MAX_NBRS];
-    int             *global_offset;
-
-    simulation_box   big_box, my_box, my_ext_box;
-    simulation_box   *d_big_box, *d_my_box, *d_my_ext_box;
-
-    grid             my_grid;
-    grid             d_my_grid;
-
-    boundary_cutoff  bndry_cuts;
-
-    reax_atom       *my_atoms;
-    reax_atom       *d_my_atoms;
-
-    /*CUDA-specific*/
-    int                   max_sparse_entries;
-    int               init_thblist;
-    int                   num_thbodies;
-
-    int                   max_bonds;
-    int                   max_hbonds;
+    /* num. atoms (locally owned) within spatial domain of MPI process */
+    int n;
+    /* num. atoms (locally owned AND ghost region) within spatial domain of MPI process */
+    int N;
+    /* num. atoms within simulation */
+    int bigN;
+    /* dimension of sparse charge method matrix */
+    int N_cm;
+    /* num. hydrogen atoms */
+    int numH;
+    /* num. hydrogen atoms (GPU) */
+    int *d_numH;
+    /**/
+    int local_cap;
+    /**/
+    int total_cap;
+    /**/
+    int gcell_cap;
+    /**/
+    int Hcap;
+    /**/
+    int est_recv;
+    /**/
+    int est_trans;
+    /**/
+    int max_recved;
+    /**/
+    int my_rank;
+    /**/
+    int num_nbrs;
+    /* coordinates of processor (according to rank) in MPI cartesian topology */
+    ivec my_coords;
+    /* list of neighbor processors */
+    neighbor_proc my_nbrs[MAX_NBRS];
+
+    /* global simulation box */
+    simulation_box big_box;
+    /* local simulation box of owned atoms per processor */
+    simulation_box my_box;
+    /* local simulation box of owned AND ghost atoms per processor */
+    simulation_box my_ext_box;
+    /* global simulation box (GPU) */
+    simulation_box *d_big_box;
+    /* local simulation box of owned atoms per processor (GPU) */
+    simulation_box *d_my_box;
+    /* local simulation box of owned AND ghost atoms per processor (GPU) */
+    simulation_box *d_my_ext_box;
+
+    /* grid specifying domain (i.e., spatial) decomposition
+     * of atoms within simulation box */
+    grid my_grid;
+    /* grid specifying domain (i.e., spatial) decomposition
+     * of atoms within simulation box (GPU) */
+    grid d_my_grid;
+
+    /* boundary cutoffs, in ??? */
+    boundary_cutoff bndry_cuts;
+
+    /* collection of atomic info. */
+    reax_atom *my_atoms;
+    /* collection of atomic info. (GPU) */
+    reax_atom *d_my_atoms;
+
+    /* current num. of far neighbors per atom */
+    int *far_nbrs;
+    /* current num. of far neighbors per atom (GPU) */
+    int *d_far_nbrs;
+    /* max num. of far neighbors per atom */
+    int *max_far_nbrs;
+    /* max num. of far neighbors per atom (GPU) */
+    int *d_max_far_nbrs;
+    /* total num. of (max) far neighbors across all atoms */
+    int total_far_nbrs;
+    /* total num. of (max) far neighbors across all atoms (GPU) */
+    int *d_total_far_nbrs;
+    /* TRUE if far neighbors list requires reallocation,
+     * FALSE otherwise (GPU) */
+    int *d_realloc_far_nbrs;
+
+    /* num. bonds per atom */
+    int *bonds;
+    /* num. bonds per atom (GPU) */
+    int *d_bonds;
+    /* max. num. bonds per atom */
+    int *max_bonds;
+    /* max. num. bonds per atom (GPU) */
+    int *d_max_bonds;
+    /* total num. bonds (sum over max) */
+    int total_bonds;
+    /* total num. bonds (sum over max) (GPU) */
+    int *d_total_bonds;
+    /* TRUE if bonds list requires reallocation, FALSE otherwise (GPU) */
+    int *d_realloc_bonds;
+
+    /* num. hydrogen bonds per atom */
+    int *hbonds;
+    /* num. hydrogen bonds per atom (GPU) */
+    int *d_hbonds;
+    /* max. num. hydrogen bonds per atom */
+    int max_hbonds;
+    //int *max_hbonds;
+    /* max. num. hydrogen bonds per atom (GPU) */
+    int *d_max_hbonds;
+    /* total num. hydrogen bonds (sum over max) */
+    int total_hbonds;
+    /* total num. hydrogen bonds (sum over max) (GPU) */
+    int *d_total_hbonds;
+    /* TRUE if hydrogen bonds list requires reallocation, FALSE otherwise (GPU) */
+    int *d_realloc_hbonds;
+
+    /* num. matrix entries per row (GPU) */
+    int *d_cm_entries;
+    /* max. num. matrix entries per row (GPU) */
+    int *d_max_cm_entries;
+    /* total num. matrix entries (sum over max) */
+    int total_cm_entries;
+    /* total num. matrix entries (sum over max) (GPU) */
+    int *d_total_cm_entries;
+    /* TRUE if charge matrix requires reallocation, FALSE otherwise (GPU) */
+    int *d_realloc_cm_entries;
+
+    /* total num. three body list indices */
+    int total_thbodies_indices;
+    /* total num. three body interactions */
+    int total_thbodies;
+    /* total num. three body interactions (GPU) */
+    int *d_total_thbodies;
 } reax_system;
 
 
-
 /* system control parameters */
 typedef struct
 {
+    /* simulation name, as supplied via control file */
     char sim_name[MAX_STR];
-    int  nprocs;
-    int  gpus_per_node;
+    /* number of MPI processors, as supplied via control file */
+    int nprocs;
+    /* number of GPUs per node, as supplied via control file */
+    int gpus_per_node;
+    /* MPI processors per each simulation dimension (cartesian topology),
+     * as supplied via control file */
     ivec procs_by_dim;
-    /* ensemble values:
-       0 : NVE
-       1 : bNVT (Berendsen)
-       2 : nhNVT (Nose-Hoover)
-       3 : sNPT (Parrinello-Rehman-Nose-Hoover) semiisotropic
-       4 : iNPT (Parrinello-Rehman-Nose-Hoover) isotropic
-       5 : NPT  (Parrinello-Rehman-Nose-Hoover) Anisotropic*/
-    int  ensemble;
-    int  nsteps;
+    /* ensemble type for simulation, values:
+     * 0 : NVE
+     * 1 : bNVT (Berendsen)
+     * 2 : nhNVT (Nose-Hoover)
+     * 3 : sNPT (Parrinello-Rahman-Nose-Hoover) semiisotropic
+     * 4 : iNPT (Parrinello-Rahman-Nose-Hoover) isotropic
+     * 5 : NPT  (Parrinello-Rahman-Nose-Hoover) Anisotropic */
+    int ensemble;
+    /* num. of simulation time steps */
+    int nsteps;
+    /* length of time step, in femtoseconds */
     real dt;
-    int  geo_format;
-    int  restart;
-
-    int  restrict_bonds;
-    int  remove_CoM_vel;
-    int  random_vel;
-    int  reposition_atoms;
-
-    int  reneighbor;
+    /* format of geometry input file */
+    int geo_format;
+    /* format of restart file */
+    int restart;
+
+    /**/
+    int restrict_bonds;
+    /* flag to control if center of mass velocity is removed */
+    int remove_CoM_vel;
+    /* flag to control if atomic initial velocity is randomly assigned */
+    int random_vel;
+    /* flag to control how atom repositioning is performed, values:
+     * 0: fit to periodic box
+     * 1: put center of mass to box center
+     * 2: put center of mass to box origin  */
+    int reposition_atoms;
+
+    /* flag to control the frequency (in terms of simulation time steps)
+     * at which atom reneighboring is performed */
+    int reneighbor;
+    /* far neighbor (Verlet list) interaction cutoff, in Angstroms */
     real vlist_cut;
+    /* bond interaction cutoff, in Angstroms */
     real bond_cut;
-    real nonb_cut, nonb_low;
+    /* non-bonded interaction cutoff, in Angstroms */
+    real nonb_cut;
+    /* ???, as supplied by force field parameters, in Angstroms */
+    real nonb_low;
+    /* hydrogen bond interaction cutoff, in Angstroms */
     real hbond_cut;
+    /* ghost region cutoff (user-supplied via control file), in Angstroms */
     real user_ghost_cut;
 
+    /* bond graph cutoff, as supplied by control file, in Angstroms */
     real bg_cut;
+    /* bond order cutoff, as supplied by force field parameters, in Angstroms */
     real bo_cut;
+    /* three body interaction cutoff, as supplied by control file, in Angstroms */
     real thb_cut;
 
+    /* flag to control if force computations are tabulated */
     int tabulate;
 
-    int qeq_freq;
-    real q_err;
-    int refactor;
-    real droptol;
-
-    real T_init, T_final, T;
+    /**/
+    unsigned int charge_method;
+    /* frequency (in terms of simulation time steps) at which to
+     * re-compute atomic charge distribution */
+    int charge_freq;
+    /**/
+    unsigned int cm_solver_type;
+    /**/
+    real cm_q_net;
+    /**/
+    unsigned int cm_solver_max_iters;
+    /**/
+    unsigned int cm_solver_restart;
+    /* error tolerance of solution produced by charge distribution
+     * sparse iterative linear solver */
+    real cm_solver_q_err;
+    /**/
+    real cm_domain_sparsity;
+    /**/
+    unsigned int cm_domain_sparsify_enabled;
+    /**/
+    unsigned int cm_solver_pre_comp_type;
+    /* frequency (in terms of simulation time steps) at which to recompute
+     * incomplete factorizations */
+    unsigned int cm_solver_pre_comp_refactor;
+    /* drop tolerance of incomplete factorization schemes (ILUT, ICHOLT, etc.)
+     * used for preconditioning the iterative linear solver used in charge distribution */
+    real cm_solver_pre_comp_droptol;
+    /**/
+    unsigned int cm_solver_pre_comp_sweeps;
+    /**/
+    unsigned int cm_solver_pre_app_type;
+    /**/
+    unsigned int cm_solver_pre_app_jacobi_iters;
+
+    /* initial temperature of simulation, in Kelvin */
+    real T_init;
+    /* final temperature of simulation, in Kelvin */
+    real T_final;
+    /* current temperature of simulation, in Kelvin */
+    real T;
+    /**/
     real Tau_T;
+    /**/
     int  T_mode;
-    real T_rate, T_freq;
+    /**/
+    real T_rate;
+    /**/
+    real T_freq;
 
+    /**/
     int  virial;
-    rvec P, Tau_P, Tau_PT;
-    int  press_mode;
+    /**/
+    rvec P;
+    /**/
+    rvec Tau_P;
+    /**/
+    rvec Tau_PT;
+    /**/
+    int press_mode;
+    /**/
     real compressibility;
 
-    int  molecular_analysis;
-    int  num_ignored;
-    int  ignore[MAX_ATOM_TYPES];
-
-    int  dipole_anal;
-    int  freq_dipole_anal;
-    int  diffusion_coef;
-    int  freq_diffusion_coef;
-    int  restrict_type;
-
+    /**/
+    int molecular_analysis;
+    /**/
+    int num_ignored;
+    /**/
+    int ignore[MAX_ATOM_TYPES];
+
+    /**/
+    int dipole_anal;
+    /**/
+    int freq_dipole_anal;
+    /**/
+    int diffusion_coef;
+    /**/
+    int freq_diffusion_coef;
+    /**/
+    int restrict_type;
+
+    /* control parameters (GPU) */
     void *d_control_params;
 } control_params;
 
 
 typedef struct
 {
+    /**/
     real T;
+    /**/
     real xi;
+    /**/
     real v_xi;
+    /**/
     real v_xi_old;
+    /**/
     real G_xi;
 
 } thermostat;
@@ -906,10 +1454,15 @@ typedef struct
 
 typedef struct
 {
+    /**/
     real P;
+    /**/
     real eps;
+    /**/
     real v_eps;
+    /**/
     real v_eps_old;
+    /**/
     real a_eps;
 
 } isotropic_barostat;
@@ -917,17 +1470,27 @@ typedef struct
 
 typedef struct
 {
+    /**/
     rtensor P;
+    /**/
     real P_scalar;
 
+    /**/
     real eps;
+    /**/
     real v_eps;
+    /**/
     real v_eps_old;
+    /**/
     real a_eps;
 
+    /**/
     rtensor h0;
+    /**/
     rtensor v_g0;
+    /**/
     rtensor v_g0_old;
+    /**/
     rtensor a_g0;
 
 } flexible_barostat;
@@ -935,476 +1498,862 @@ typedef struct
 
 typedef struct
 {
+    /* start time of event */
     real start;
+    /* end time of event */
     real end;
+    /* total elapsed time of event */
     real elapsed;
 
+    /* total simulation time */
     real total;
+    /* communication time */
     real comm;
+    /* neighbor (i.e., Verlet) list generation time */
     real nbrs;
+    /* force initialization time */
     real init_forces;
+    /* bonded force calculation time */
     real bonded;
+    /* non-bonded force calculation time */
     real nonb;
-    real qEq;
-    int  s_matvecs;
-    int  t_matvecs;
+    /* atomic charge distribution calculation time */
+    real cm;
+    /* num. of steps in iterative linear solver for charge distribution (QEq, first solve) */
+    int s_matvecs;
+    /* num. of steps in iterative linear solver for charge distribution (QEq, second solve) */
+    int t_matvecs;
+    /* num. of retries in main sim. loop */
+    int num_retries;
 } reax_timing;
 
 
 typedef struct
 {
+    /* total energy */
     real e_tot;
-    real e_kin;                      // Total kinetic energy
+    /* total kinetic energy */
+    real e_kin;
+    /* total potential energy */
     real e_pot;
 
-    real e_bond;                     // Total bond energy
-    real e_ov;                       // Total over coordination
-    real e_un;                       // Total under coordination energy
-    real e_lp;                       // Total under coordination energy
-    real e_ang;                      // Total valance angle energy
-    real e_pen;                      // Total penalty energy
-    real e_coa;                      // Total three body conjgation energy
-    real e_hb;                       // Total Hydrogen bond energy
-    real e_tor;                      // Total torsional energy
-    real e_con;                      // Total four body conjugation energy
-    real e_vdW;                      // Total van der Waals energy
-    real e_ele;                      // Total electrostatics energy
-    real e_pol;                      // Polarization energy
+    /* total bond energy */
+    real e_bond;
+    /* total over coordination energy */
+    real e_ov;
+    /* total under coordination energy */
+    real e_un;
+    /* total lone pair energy (NOTE(review): previous text duplicated the e_un description -- confirm) */
+    real e_lp;
+    /* total valence angle energy */
+    real e_ang;
+    /* total penalty energy */
+    real e_pen;
+    /* total three body conjugation energy */
+    real e_coa;
+    /* total Hydrogen bond energy */
+    real e_hb;
+    /* total torsional energy */
+    real e_tor;
+    /* total four body conjugation energy */
+    real e_con;
+    /* total van der Waals energy */
+    real e_vdW;
+    /* total electrostatics energy */
+    real e_ele;
+    /* polarization energy */
+    real e_pol;
 } energy_data;
 
+
+/**/
 typedef struct
 {
-    int  step;
-    int  prev_steps;
+    /**/
+    int step;
+    /**/
+    int prev_steps;
+    /**/
     real time;
 
+    /**/
     real M;              // Total Mass
+    /**/
     real inv_M;                      // 1 / Total Mass
 
+    /**/
     rvec xcm;                        // Center of mass
+    /**/
     rvec vcm;                        // Center of mass velocity
+    /**/
     rvec fcm;                        // Center of mass force
+    /**/
     rvec amcm;                       // Angular momentum of CoM
+    /**/
     rvec avcm;                       // Angular velocity of CoM
+    /**/
     real etran_cm;                   // Translational kinetic energy of CoM
+    /**/
     real erot_cm;                    // Rotational kinetic energy of CoM
 
+    /**/
     rtensor kinetic;                 // Kinetic energy tensor
+    /**/
     rtensor virial;                  // Hydrodynamic virial
 
+    /**/
     energy_data my_en;
+    /**/
     energy_data sys_en;
 
-    real               N_f;          //Number of degrees of freedom
-    rvec               t_scale;
-    rtensor            p_scale;
-    thermostat         therm;        // Used in Nose_Hoover method
+    /**/
+    real N_f;          //Number of degrees of freedom
+    /**/
+    rvec t_scale;
+    /**/
+    rtensor p_scale;
+    /**/
+    thermostat therm;        // Used in Nose_Hoover method
+    /**/
     isotropic_barostat iso_bar;
-    flexible_barostat  flex_bar;
-    real               inv_W;
+    /**/
+    flexible_barostat flex_bar;
+    /**/
+    real inv_W;
 
+    /**/
     real kin_press;
+    /**/
     rvec int_press;
+    /**/
     rvec my_ext_press;
+    /**/
     rvec ext_press;
+    /**/
     rvec tot_press;
 
+    /**/
     reax_timing timing;
+    /**/
     reax_timing d_timing;
 
+    /**/
     void *d_simulation_data;
 } simulation_data;
 
 
+/**/
 typedef struct
 {
+    /**/
     int thb;
-    int pthb; // pointer to the third body on the central atom's nbrlist
-    real theta, cos_theta;
-    rvec dcos_di, dcos_dj, dcos_dk;
+    /* pointer to the third body on the central atom's nbrlist */
+    int pthb;
+    /**/
+    real theta;
+    /**/
+    real cos_theta;
+    /**/
+    rvec dcos_di;
+    /**/
+    rvec dcos_dj;
+    /**/
+    rvec dcos_dk;
 } three_body_interaction_data;
 
 
+/* info. about a far neighbor to an atom */
 typedef struct
 {
+    /**/
     int nbr;
+    /**/
     ivec rel_box;
+    /**/
     real d;
+    /**/
     rvec dvec;
-
 } far_neighbor_data;
 
 
+/**/
 typedef struct
 {
+    /**/
     int nbr;
+    /**/
     int scl;
+    /**/
     far_neighbor_data *ptr;
 
     /*CUDA-specific*/
+    /**/
     int sym_index;
+    /**/
     rvec hb_f;
 } hbond_data;
 
 
+/**/
 typedef struct
 {
+    /**/
     int wrt;
+    /**/
     rvec dVal;
 } dDelta_data;
 
 
+/**/
 typedef struct
 {
+    /**/
     int wrt;
-    rvec dBO, dBOpi, dBOpi2;
+    /**/
+    rvec dBO;
+    /**/
+    rvec dBOpi;
+    /**/
+    rvec dBOpi2;
 } dbond_data;
 
+
+/**/
 typedef struct
 {
-    real BO, BO_s, BO_pi, BO_pi2;
-    real Cdbo, Cdbopi, Cdbopi2;
-    real C1dbo, C2dbo, C3dbo;
-    real C1dbopi, C2dbopi, C3dbopi, C4dbopi;
-    real C1dbopi2, C2dbopi2, C3dbopi2, C4dbopi2;
-    rvec dBOp, dln_BOp_s, dln_BOp_pi, dln_BOp_pi2;
+    /**/
+    real BO;
+    /**/
+    real BO_s;
+    /**/
+    real BO_pi;
+    /**/
+    real BO_pi2;
+    /**/
+    real Cdbo;
+    /**/
+    real Cdbopi;
+    /**/
+    real Cdbopi2;
+    /**/
+    real C1dbo;
+    /**/
+    real C2dbo;
+    /**/
+    real C3dbo;
+    /**/
+    real C1dbopi;
+    /**/
+    real C2dbopi;
+    /**/
+    real C3dbopi;
+    /**/
+    real C4dbopi;
+    /**/
+    real C1dbopi2;
+    /**/
+    real C2dbopi2;
+    /**/
+    real C3dbopi2;
+    /**/
+    real C4dbopi2;
+    /**/
+    rvec dBOp;
+    /**/
+    rvec dln_BOp_s;
+    /**/
+    rvec dln_BOp_pi;
+    /**/
+    rvec dln_BOp_pi2;
 } bond_order_data;
 
+
+/**/
 typedef struct
 {
+    /**/
     int nbr;
+    /**/
     int sym_index;
+    /**/
     int dbond_index;
+    /**/
     ivec rel_box;
     //  rvec ext_factor;
+    /**/
     real d;
+    /**/
     rvec dvec;
+    /**/
     bond_order_data bo_data;
 
     /*CUDA-specific*/
+    /**/
     real ae_CdDelta;
 
+    /**/
     real va_CdDelta;
+    /**/
     rvec va_f;
 
+    /**/
     real ta_CdDelta;
+    /**/
     real ta_Cdbo;
+    /**/
     rvec ta_f;
 
+    /**/
     rvec hb_f;
 
+    /**/
     rvec tf_f;
 } bond_data;
 
 
+/* Secondary structure for matrix in CRS format */
 typedef struct
 {
+    /* column index for corresponding matrix entry */
     int j;
+    /* matrix entry */
     real val;
 } sparse_matrix_entry;
 
+
+/* Matrix in compressed row storage (CRS) format,
+ * with modifications for row end pointer and max entries per row (CUDA optimizations).
+ * See, e.g.,
+ *   http://netlib.org/linalg/html_templates/node91.html#SECTION00931100000000000000
+ */
 typedef struct
 {
-    int cap, n, m;
-    int *start, *end;
+    /* number of rows */
+    int n;
+    /* number of nonzeros (NNZ) ALLOCATED */
+    int m;
+    /* row start pointer (last element contains ACTUAL NNZ) */
+    int *start;
+    /* row end pointer */
+    int *end;
+    /* secondary structure for matrix entry info */
     sparse_matrix_entry *entries;
 } sparse_matrix;
 
 
+/* used to determine if and how much space should be reallocated */
 typedef struct
 {
-    int num_far;
-    int H, Htop;
-    int hbonds, num_hbonds;
-    int bonds, num_bonds;
+    /* TRUE if far neighbor list needs
+     * to be reallocated, FALSE otherwise */
+    int far_nbrs;
+    /* TRUE if charge matrix needs
+     * to be reallocated, FALSE otherwise */
+    int cm;
+    /**/
+    int Htop;
+    /**/
+    int hbonds;
+    /**/
+    int num_hbonds;
+    /* TRUE if bonds list needs
+     * to be reallocated, FALSE otherwise */
+    int bonds;
+    /**/
+    int num_bonds;
+    /* TRUE if three body list needs
+     * to be reallocated, FALSE otherwise */
+    int thbody;
+    /**/
     int num_3body;
+    /**/
     int gcell_atoms;
 } reallocate_data;
 
 
 typedef struct
 {
+    /* 0 if struct members are NOT allocated, 1 otherwise */
     int allocated;
 
     /* communication storage */
+    /**/
     real *tmp_dbl[MAX_NBRS];
+    /**/
     rvec *tmp_rvec[MAX_NBRS];
+    /**/
     rvec2 *tmp_rvec2[MAX_NBRS];
+    /**/
     int  *within_bond_box;
 
     /* bond order related storage */
+    /**/
     real *total_bond_order;
-    real *Deltap, *Deltap_boc;
-    real *Delta, *Delta_lp, *Delta_lp_temp, *Delta_e, *Delta_boc;
-    real *dDelta_lp, *dDelta_lp_temp;
-    real *nlp, *nlp_temp, *Clp, *vlpex;
+    /**/
+    real *Deltap;
+    /**/
+    real *Deltap_boc;
+    /**/
+    real *Delta;
+    /**/
+    real *Delta_lp;
+    /**/
+    real *Delta_lp_temp;
+    /**/
+    real *Delta_e;
+    /**/
+    real *Delta_boc;
+    /**/
+    real *dDelta_lp;
+    /**/
+    real *dDelta_lp_temp;
+    /**/
+    real *nlp;
+    /**/
+    real *nlp_temp;
+    /**/
+    real *Clp;
+    /**/
+    real *vlpex;
+    /**/
     rvec *dDeltap_self;
-    int *bond_mark, *done_after;
-
-    /* QEq storage */
-#ifndef HAVE_CUDA
-//    sparse_matrix *H, *L, *U;
-    sparse_matrix H, L, U;
-#else
-    sparse_matrix H, L, U; //CHANGED
-#endif
-    real *Hdia_inv, *b_s, *b_t, *b_prc, *b_prm, *s, *t;
+    /**/
+    int *bond_mark;
+    /**/
+    int *done_after;
+
+    /* charge matrix storage */
+    /* charge matrix */
+    sparse_matrix H;
+    /* preconditioner */
+    sparse_matrix L;
+    /* preconditioner */
+    sparse_matrix U;
+    /* preconditioner */
+    real *Hdia_inv;
+    /**/
+    real *b_s;
+    /**/
+    real *b_t;
+    /**/
+    real *b_prc;
+    /**/
+    real *b_prm;
+    /**/
+    real *s;
+    /**/
+    real *t;
+    /**/
     real *droptol;
-    rvec2 *b, *x;
+    /**/
+    rvec2 *b;
+    /**/
+    rvec2 *x;
 
     /* GMRES storage */
-    real *y, *z, *g;
-    real *hc, *hs;
-#ifndef HAVE_CUDA
-    //real **h, **v;
-    real *h, *v;
-#else
-    real *h, *v; //changed
-#endif
+    /**/
+    real *y;
+    /**/
+    real *z;
+    /**/
+    real *g;
+    /**/
+    real *hc;
+    /**/
+    real *hs;
+    /**/
+    real *h;
+    /**/
+    real *v;
+
     /* CG storage */
-    real *r, *d, *q, *p;
-    rvec2 *r2, *d2, *q2, *p2;
+    /**/
+    real *r;
+    /**/
+    real *d;
+    /**/
+    real *q;
+    /**/
+    real *p;
+    /**/
+    rvec2 *r2;
+    /**/
+    rvec2 *d2;
+    /**/
+    rvec2 *q2;
+    /**/
+    rvec2 *p2;
+
     /* Taper */
-    real Tap[8]; //Tap7, Tap6, Tap5, Tap4, Tap3, Tap2, Tap1, Tap0;
-    real d_Tap;
+    /* Tap7, Tap6, Tap5, Tap4, Tap3, Tap2, Tap1, Tap0 */
+    real Tap[8];
 
     /* storage for analysis */
-    int  *mark, *old_mark;
+    /**/
+    int *mark;
+    /**/
+    int *old_mark;
+    /**/
     rvec *x_old;
 
     /* storage space for bond restrictions */
-    int  *restricted;
-#ifndef HAVE_CUDA
-    //int **restricted_list;
-    int * restricted_list;
-#else
-    int *restricted_list;   //changed
-#endif
+    /**/
+    int *restricted;
+    /**/
+    int *restricted_list;
 
     /* integrator */
+    /**/
     rvec *v_const;
 
     /* force calculations */
+    /**/
     real *CdDelta;  // coefficient of dDelta
+    /**/
     rvec *f;
 #ifdef TEST_FORCES
+    /**/
     rvec *f_ele;
+    /**/
     rvec *f_vdw;
+    /**/
     rvec *f_bo;
+    /**/
     rvec *f_be;
+    /**/
     rvec *f_lp;
+    /**/
     rvec *f_ov;
+    /**/
     rvec *f_un;
+    /**/
     rvec *f_ang;
+    /**/
     rvec *f_coa;
+    /**/
     rvec *f_pen;
+    /**/
     rvec *f_hb;
+    /**/
     rvec *f_tor;
+    /**/
     rvec *f_con;
+    /**/
     rvec *f_tot;
+    /**/
     rvec *dDelta;   // calculated on the fly in bond_orders.c together with bo'
 
+    /**/
     int  *rcounts;
+    /**/
     int  *displs;
+    /**/
     int  *id_all;
+    /**/
     rvec *f_all;
 #endif
 
+    /**/
     reallocate_data realloc;
-    //int *num_bonds;
-    /* hydrogen bonds */
-    //int   num_H, Hcap;
-    //int  *Hindex;
-    //int *num_hbonds;
-    //int *hash;
-    //int *rev_hash;
 } storage;
 
 
+/* Union used for determining interaction list type */
 typedef union
 {
-
-//#ifdef HAVE_CUDA
+    /* void type */
     void *v;
+    /* three body type */
     three_body_interaction_data *three_body_list;
-    bond_data          *bond_list;
-    dbond_data         *dbo_list;
-    dDelta_data        *dDelta_list;
-    far_neighbor_data  *far_nbr_list;
-    hbond_data         *hbond_list;
-//#endif
-
+    /* bond type */
+    bond_data *bond_list;
+    /* derivative bond order type */
+    dbond_data *dbo_list;
+    /* derivative delta type */
+    dDelta_data *dDelta_list;
+    /* far neighbor type */
+    far_neighbor_data *far_nbr_list;
+    /* hydrogen bond type */
+    hbond_data *hbond_list;
 } list_type;
 
 
+/* Interaction list */
 typedef struct
 {
+    /* 0 if struct members are NOT allocated, 1 otherwise */
     int allocated;
 
+    /* total num. of entities, each of which corresponds to one or more interactions */
     int n;
+    /* total num. of interactions */
     int num_intrs;
 
+    /* beginning position for interactions corresponding to a particular entity,
+     * where the entity ID used for indexing is an integer between 0 and n - 1, inclusive */
     int *index;
+    /* ending position for interactions corresponding to a particular entity,
+     * where the entity ID used for indexing is an integer between 0 and n - 1, inclusive */
     int *end_index;
 
+    /* interaction list type, as defined by interactions enum above */
     int type;
+    /* interaction list, made purposely non-opaque via above union to avoid typecasts */
     list_type select;
-/*
-#ifndef HAVE_CUDA
-    void *v;
-    three_body_interaction_data *three_body_list;
-    bond_data          *bond_list;
-    dbond_data         *dbo_list;
-    dDelta_data        *dDelta_list;
-    far_neighbor_data  *far_nbr_list;
-    hbond_data         *hbond_list;
-#endif
-*/
-
 } reax_list;
 
 
+/**/
 typedef struct
 {
 #if defined(PURE_REAX)
+    /**/
     MPI_File trj;
 #endif
+    /**/
     FILE *strj;
-    int   trj_offset;
-    int   atom_line_len;
-    int   bond_line_len;
-    int   angle_line_len;
-    int   write_atoms;
-    int   write_bonds;
-    int   write_angles;
+    /**/
+    int trj_offset;
+    /**/
+    int atom_line_len;
+    /**/
+    int bond_line_len;
+    /**/
+    int angle_line_len;
+    /**/
+    int write_atoms;
+    /**/
+    int write_bonds;
+    /**/
+    int write_angles;
+    /**/
     char *line;
-    int   buffer_len;
+    /**/
+    int buffer_len;
+    /**/
     char *buffer;
 
+    /**/
     FILE *out;
+    /**/
     FILE *pot;
+    /**/
     FILE *log;
-    FILE *mol, *ign;
+    /**/
+    FILE *mol;
+    /**/
+    FILE *ign;
+    /**/
     FILE *dpl;
+    /**/
     FILE *drft;
+    /**/
     FILE *pdb;
+    /**/
     FILE *prs;
 
-    int   write_steps;
-    int   traj_compress;
-    int   traj_method;
-    char  traj_title[81];
-    int   atom_info;
-    int   bond_info;
-    int   angle_info;
-
-    int   restart_format;
-    int   restart_freq;
-    int   debug_level;
-    int   energy_update_freq;
+    /**/
+    int write_steps;
+    /**/
+    int traj_compress;
+    /**/
+    int traj_method;
+    /**/
+    char traj_title[81];
+    /**/
+    int atom_info;
+    /**/
+    int bond_info;
+    /**/
+    int angle_info;
+
+    /**/
+    int restart_format;
+    /**/
+    int restart_freq;
+    /**/
+    int debug_level;
+    /**/
+    int energy_update_freq;
 
 #ifdef TEST_ENERGY
+    /**/
     FILE *ebond;
-    FILE *elp, *eov, *eun;
-    FILE *eval, *epen, *ecoa;
+    /**/
+    FILE *elp;
+    /**/
+    FILE *eov;
+    /**/
+    FILE *eun;
+    /**/
+    FILE *eval;
+    /**/
+    FILE *epen;
+    /**/
+    FILE *ecoa;
+    /**/
     FILE *ehb;
-    FILE *etor, *econ;
-    FILE *evdw, *ecou;
+    /**/
+    FILE *etor;
+    /**/
+    FILE *econ;
+    /**/
+    FILE *evdw;
+    /**/
+    FILE *ecou;
 #endif
 
 #ifdef TEST_FORCES
-    FILE *fbo, *fdbo;
+    /**/
+    FILE *fbo;
+    /**/
+    FILE *fdbo;
+    /**/
     FILE *fbond;
-    FILE *flp, *fov, *fun;
-    FILE *fang, *fcoa, *fpen;
+    /**/
+    FILE *flp;
+    /**/
+    FILE *fov;
+    /**/
+    FILE *fun;
+    /**/
+    FILE *fang;
+    /**/
+    FILE *fcoa;
+    /**/
+    FILE *fpen;
+    /**/
     FILE *fhb;
-    FILE *ftor, *fcon;
-    FILE *fvdw, *fele;
-    FILE *ftot, *fcomp;
+    /**/
+    FILE *ftor;
+    /**/
+    FILE *fcon;
+    /**/
+    FILE *fvdw;
+    /**/
+    FILE *fele;
+    /**/
+    FILE *ftot;
+    /**/
+    FILE *fcomp;
 #endif
 
 #if defined(TEST_ENERGY) || defined(TEST_FORCES)
-    FILE *flist; // far neighbor list
-    FILE *blist; // bond list
-    FILE *nlist; // near neighbor list
+    /* far neighbor list */
+    FILE *flist;
+    /* bond list */
+    FILE *blist;
+    /* near neighbor list */
+    FILE *nlist;
 #endif
 } output_controls;
 
 
+/**/
 typedef struct
 {
+    /**/
     int atom_count;
+    /**/
     int atom_list[MAX_MOLECULE_SIZE];
+    /**/
     int mtypes[MAX_ATOM_TYPES];
 } molecule;
 
 
+/**/
 typedef struct
 {
+    /**/
     real H;
-    real e_vdW, CEvd;
-    real e_ele, CEclmb;
+    /**/
+    real e_vdW;
+    /**/
+    real CEvd;
+    /**/
+    real e_ele;
+    /**/
+    real CEclmb;
 } LR_data;
 
 
+/**/
 typedef struct
 {
-    real a, b, c, d;
+    /**/
+    real a;
+    /**/
+    real b;
+    /**/
+    real c;
+    /**/
+    real d;
 } cubic_spline_coef;
 
 
-
+/**/
 typedef struct
 {
-    real xmin, xmax;
+    /**/
+    real xmin;
+    /**/
+    real xmax;
+    /**/
     int n;
-    real dx, inv_dx;
+    /**/
+    real dx;
+    /**/
+    real inv_dx;
+    /**/
     real a;
+    /**/
     real m;
+    /**/
     real c;
 
+    /**/
     LR_data *y;
+    /**/
     cubic_spline_coef *H;
-    cubic_spline_coef *vdW, *CEvd;
-    cubic_spline_coef *ele, *CEclmb;
+    /**/
+    cubic_spline_coef *vdW;
+    /**/
+    cubic_spline_coef *CEvd;
+    /**/
+    cubic_spline_coef *ele;
+    /**/
+    cubic_spline_coef *CEclmb;
 } LR_lookup_table;
 
-#ifndef HAVE_CUDA
-//extern LR_lookup_table **LR;
+
 extern LR_lookup_table *LR;
-#else
-extern LR_lookup_table *LR; //changed
-#endif
+
 
 /* function pointer defs */
-typedef void (*evolve_function)(reax_system*, control_params*,
-                                simulation_data*, storage*, reax_list**,
-                                output_controls*, mpi_datatypes* );
+typedef int (*evolve_function)(reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*, mpi_datatypes* );
 #if defined(PURE_REAX)
-extern evolve_function  Evolve;
+extern evolve_function Evolve;
 extern evolve_function Cuda_Evolve;
 #endif
 
-typedef void (*interaction_function) (reax_system*, control_params*,
-                                      simulation_data*, storage*,
-                                      reax_list**, output_controls*);
+typedef void (*interaction_function)(reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*);
 
 typedef void (*print_interaction)(reax_system*, control_params*,
-                                  simulation_data*, storage*,
-                                  reax_list**, output_controls*);
+        simulation_data*, storage*, reax_list**, output_controls*);
 
 typedef real (*lookup_function)(real);
 
-typedef void (*message_sorter) (reax_system*, int, int, int, mpi_out_data*);
-typedef void (*unpacker) ( reax_system*, int, void*, int, neighbor_proc*, int );
+typedef void (*message_sorter)(reax_system*, int, int, int, mpi_out_data*);
+typedef void (*unpacker)( reax_system*, int, void*, int, neighbor_proc*, int );
 
-typedef void (*dist_packer) (void*, mpi_out_data*);
-typedef void (*coll_unpacker) (void*, void*, mpi_out_data*);
+typedef void (*dist_packer)(void*, mpi_out_data*);
+typedef void (*coll_unpacker)(void*, void*, mpi_out_data*);
 
 /*CUDA-specific*/
 extern reax_list **dev_lists;
 extern storage *dev_workspace;
-extern storage *dev_storage;
 extern LR_lookup_table *d_LR;
 
 extern void *scratch;
diff --git a/PG-PuReMD/src/reduction.h b/PG-PuReMD/src/reduction.h
deleted file mode 100644
index 03454673cacb129e59a0d51c8782c3b425650e53..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/reduction.h
+++ /dev/null
@@ -1,30 +0,0 @@
-
-#ifndef __REDUCTION_H__
-#define __REDUCTION_H__
-
-#include "reax_types.h"
-#include "reax_types.h"
-
-#define  INITIAL  0
-#define  FINAL    1
-
-
-CUDA_GLOBAL void k_reduction (const real *, real *, const size_t );
-CUDA_GLOBAL void k_reduction_rvec (rvec *, rvec *, size_t );
-CUDA_GLOBAL void k_reduction_rvec2 (rvec2 *, rvec2 *, size_t );
-CUDA_GLOBAL void k_norm (const real *input, real *per_block_results, const size_t n, int pass);
-CUDA_GLOBAL void k_dot (const real *a, const real *b, real *per_block_results, const size_t n);
-
-CUDA_GLOBAL void k_vector_sum( real* , real , real* , real , real* , int ) ;
-CUDA_GLOBAL void k_rvec2_pbetad (rvec2 *dest, rvec2 *a,
-                                 real beta0, real beta1,
-                                 rvec2 *b, int n);
-CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int k ) ;
-CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int k ) ;
-CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results, const size_t n, int pass);
-CUDA_GLOBAL void k_dot_rvec2 (const rvec2 *a, rvec2 *b, rvec2 *res, const size_t n);
-
-
-
-
-#endif
diff --git a/PG-PuReMD/src/reset_tools.c b/PG-PuReMD/src/reset_tools.c
index e49ca306c289ee57eb20fea6b7910d403072743f..f2a24753f8748cdacf00a93160b687e14d28dfa8 100644
--- a/PG-PuReMD/src/reset_tools.c
+++ b/PG-PuReMD/src/reset_tools.c
@@ -21,24 +21,20 @@
 
 #include "reax_types.h"
 
-#include "index_utils.h"
-
-#ifdef HAVE_CUDA
-#include "cuda_reset_tools.h"
-#endif
-
 #if defined(PURE_REAX)
-#include "reset_tools.h"
-#include "list.h"
-#include "tool_box.h"
-#include "vector.h"
+  #include "reset_tools.h"
+  #include "list.h"
+  #include "tool_box.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_reset_tools.h"
-#include "reax_list.h"
-#include "reax_tool_box.h"
-#include "reax_vector.h"
+  #include "reax_reset_tools.h"
+  #include "reax_list.h"
+  #include "reax_tool_box.h"
+  #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 void Reset_Atoms( reax_system* system, control_params *control )
 {
@@ -46,15 +42,17 @@ void Reset_Atoms( reax_system* system, control_params *control )
     reax_atom *atom;
 
     system->numH = 0;
-    if ( control->hbond_cut > 0 )
+    if ( control->hbond_cut > 0.0 )
+    {
         //TODO
         for ( i = 0; i < system->N; ++i )
         {
             atom = &(system->my_atoms[i]);
-            //if( system->reax_param.sbp[ atom->type ].p_hbond == 1 )
+            //if( system->reax_param.sbp[ atom->type ].p_hbond == H_ATOM )
             atom->Hindex = system->numH++;
             //else atom->Hindex = -1;
         }
+    }
 }
 
 
@@ -114,11 +112,13 @@ void Reset_Timing( reax_timing *rt )
     rt->init_forces = 0;
     rt->bonded = 0;
     rt->nonb = 0;
-    rt->qEq = 0;
+    rt->cm = 0;
     rt->s_matvecs = 0;
     rt->t_matvecs = 0;
+    rt->num_retries = 0;
 }
 
+
 #ifdef TEST_FORCES
 void Reset_Test_Forces( reax_system *system, storage *workspace )
 {
@@ -158,7 +158,9 @@ void Reset_Grid( grid *g )
     int i, j, k;
 
     for ( i = 0; i < g->ncells[0]; i++ )
+    {
         for ( j = 0; j < g->ncells[1]; j++ )
+        {
             for ( k = 0; k < g->ncells[2]; k++ )
             {
                 /*
@@ -166,10 +168,12 @@ void Reset_Grid( grid *g )
                 g->cells[i][j][k].str = 0;
                 g->cells[i][j][k].end = 0;
                 */
-                g->cells[ index_grid_3d (i, j, k, g) ].top = 0;
-                //g->cells[ index_grid_3d (i, j, k, g) ].str = 0;
-                //g->cells[ index_grid_3d (i, j, k, g) ].end = 0;
+                g->cells[ index_grid_3d(i, j, k, g) ].top = 0;
+                //g->cells[ index_grid_3d(i, j, k, g) ].str = 0;
+                //g->cells[ index_grid_3d(i, j, k, g) ].end = 0;
             }
+        }
+    }
 }
 
 
@@ -178,7 +182,9 @@ void Reset_Out_Buffers( mpi_out_data *out_buf, int n )
     int i;
 
     for ( i = 0; i < n; ++i )
+    {
         out_buf[i].cnt = 0;
+    }
 }
 
 
@@ -202,13 +208,14 @@ void Reset_Neighbor_Lists( reax_system *system, control_params *control,
             total_bonds += system->my_atoms[i].num_bonds;
         }
 //	Print_List(*lists + BONDS);
+
         /* is reallocation needed? */
         if ( total_bonds >= bonds->num_intrs * DANGER_ZONE )
         {
             workspace->realloc.bonds = 1;
             if ( total_bonds >= bonds->num_intrs )
             {
-                fprintf(stderr,
+                fprintf( stderr,
                         "p%d: not enough space for bonds! total=%d allocated=%d\n",
                         system->my_rank, total_bonds, bonds->num_intrs );
                 MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
@@ -263,14 +270,16 @@ void Reset_Neighbor_Lists( reax_system *system, control_params *control,
 
 
 void Reset( reax_system *system, control_params *control,
-            simulation_data *data, storage *workspace, reax_list **lists )
+        simulation_data *data, storage *workspace, reax_list **lists )
 {
     Reset_Atoms( system, control );
 
     Reset_Simulation_Data( data );
 
     if ( control->virial )
+    {
         Reset_Pressures( data );
+    }
 
     Reset_Workspace( system, workspace );
 
@@ -282,26 +291,3 @@ void Reset( reax_system *system, control_params *control,
 #endif
 
 }
-
-
-#ifdef HAVE_CUDA
-void Cuda_Reset( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace, reax_list **lists )
-{
-    Cuda_Reset_Atoms (system, control);
-
-    Reset_Simulation_Data (data);
-
-    if (control->virial)
-        Reset_Pressures ( data );
-
-    Cuda_Reset_Workspace ( system, workspace );
-
-    Cuda_Reset_Neighbor_Lists ( system, control, workspace, lists );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d: reset done\n", system->my_rank, data->step );
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-}
-#endif
diff --git a/PG-PuReMD/src/reset_tools.h b/PG-PuReMD/src/reset_tools.h
index 28ca825b9f8962ffe0ed37d0bb0419096d506f64..001b7f578d33b4fdb749a558d5d40a55e3e49beb 100644
--- a/PG-PuReMD/src/reset_tools.h
+++ b/PG-PuReMD/src/reset_tools.h
@@ -24,19 +24,34 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Reset_Pressures( simulation_data* );
+
 void Reset_Simulation_Data( simulation_data* );
+
 void Reset_Timing( reax_timing* );
+
 void Reset_Workspace( reax_system*, storage* );
-void Reset_Neighbor_Lists(reax_system*, control_params*, storage*, reax_list**);
+
+void Reset_Neighbor_Lists( reax_system*, control_params*, storage*, reax_list** );
+
 void Reset_Grid( grid* );
+
 void Reset_Out_Buffers( mpi_out_data*, int );
-void Reset(reax_system*, control_params*, simulation_data*, storage*, reax_list**);
 
-//CUDA Functions
-void Cuda_Reset(reax_system*, control_params*, simulation_data*, storage*, reax_list**);
+void Reset( reax_system*, control_params*, simulation_data*, storage*, reax_list** );
 
 #ifdef TEST_FORCES
 void Reset_Test_Forces( reax_system*, storage* );
 #endif
+
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/restart.c b/PG-PuReMD/src/restart.c
index 76c8041a840321c04e5f62112b496fcaf4b6ab28..6b8ddcdffc0e257eb57e0d4a00d43cf4734f379e 100644
--- a/PG-PuReMD/src/restart.c
+++ b/PG-PuReMD/src/restart.c
@@ -19,7 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "restart.h"
+
 #include "allocate.h"
 #include "box.h"
 #include "tool_box.h"
@@ -27,10 +30,10 @@
 
 
 void Write_Binary_Restart( reax_system *system, control_params *control,
-                           simulation_data *data, output_controls *out_control,
-                           mpi_datatypes *mpi_data )
+        simulation_data *data, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
-    int  i, me, np, cnt, top;
+    int i, me, np, cnt, top;
     char fname[MAX_STR];
     FILE *fres;
     restart_header res_header;
@@ -48,7 +51,7 @@ void Write_Binary_Restart( reax_system *system, control_params *control,
         sprintf( fname, "%s.res%d", control->sim_name, data->step );
         if ( (fres = fopen( fname, "wb" )) == NULL )
         {
-            fprintf( stderr, "ERROR: can't open the restart file! terminating...\n" );
+            fprintf( stderr, "[ERROR] can't open the restart file! terminating...\n" );
             MPI_Abort( MPI_COMM_WORLD, FILE_NOT_FOUND );
         }
 
@@ -65,11 +68,13 @@ void Write_Binary_Restart( reax_system *system, control_params *control,
 
         /* master needs to allocate space for all atoms */
         buffer = (restart_atom*)
-                 scalloc( system->bigN, sizeof(restart_atom), "restart:buffer" );
+            scalloc( system->bigN, sizeof(restart_atom), "restart:buffer" );
     }
     else
+    {
         buffer = (restart_atom*)
-                 scalloc( system->n, sizeof(restart_atom), "restart:buffer" );
+            scalloc( system->n, sizeof(restart_atom), "restart:buffer" );
+    }
 
     /* fill in the buffers */
     for ( i = 0 ; i < system->n; ++i )
@@ -92,6 +97,7 @@ void Write_Binary_Restart( reax_system *system, control_params *control,
     {
         top = system->n;
         for ( i = 0; i < np; ++i )
+        {
             if ( i != MASTER_NODE )
             {
                 MPI_Recv( buffer + top, system->bigN - top, mpi_data->restart_atom_type,
@@ -99,6 +105,7 @@ void Write_Binary_Restart( reax_system *system, control_params *control,
                 MPI_Get_count( &status, mpi_data->restart_atom_type, &cnt );
                 top += cnt;
             }
+        }
     }
 
     /* master node dumps out the restart file */
@@ -108,15 +115,15 @@ void Write_Binary_Restart( reax_system *system, control_params *control,
         fclose( fres );
     }
 
-    free(buffer);
+    sfree( buffer, "Write_Binary_Restart::buffer" );
 }
 
 
 void Write_Restart( reax_system *system, control_params *control,
-                    simulation_data *data, output_controls *out_control,
-                    mpi_datatypes *mpi_data )
+        simulation_data *data, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
-    int  i, me, np, buffer_len, buffer_req, cnt;
+    int i, me, np, buffer_len, buffer_req, cnt;
     char fname[MAX_STR];
     FILE *fres;
     char *line;
@@ -134,7 +141,7 @@ void Write_Restart( reax_system *system, control_params *control,
         sprintf( fname, "%s.res%d", control->sim_name, data->step );
         if ( (fres = fopen( fname, "w" )) == NULL )
         {
-            fprintf( stderr, "ERROR: can't open the restart file! terminating...\n" );
+            fprintf( stderr, "[ERROR] can't open the restart file! terminating...\n" );
             MPI_Abort( MPI_COMM_WORLD, FILE_NOT_FOUND );
         }
 
@@ -153,7 +160,9 @@ void Write_Restart( reax_system *system, control_params *control,
         buffer_req = system->bigN * RESTART_LINE_LEN + 1;
     }
     else
+    {
         buffer_req = system->n * RESTART_LINE_LEN + 1;
+    }
 
     buffer = (char*) smalloc(sizeof(char) * buffer_req, "restart:buffer");
     line[0] = 0;
@@ -177,19 +186,21 @@ void Write_Restart( reax_system *system, control_params *control,
     if ( me != MASTER_NODE )
     {
         MPI_Send( buffer, buffer_req - 1, MPI_CHAR, MASTER_NODE,
-                  np * RESTART_LINE_LEN + me, mpi_data->world );
+                np * RESTART_LINE_LEN + me, mpi_data->world );
     }
     else
     {
         buffer_len = system->n * RESTART_LINE_LEN;
         for ( i = 0; i < np; ++i )
+        {
             if ( i != MASTER_NODE )
             {
-                MPI_Recv(buffer + buffer_len, buffer_req - buffer_len,
-                         MPI_CHAR, i, np * RESTART_LINE_LEN + i, mpi_data->world, &status);
+                MPI_Recv( buffer + buffer_len, buffer_req - buffer_len,
+                        MPI_CHAR, i, np * RESTART_LINE_LEN + i, mpi_data->world, &status );
                 MPI_Get_count( &status, MPI_CHAR, &cnt );
                 buffer_len += cnt;
             }
+        }
         buffer[buffer_len] = 0;
     }
 
@@ -199,8 +210,8 @@ void Write_Restart( reax_system *system, control_params *control,
         fprintf( fres, "%s", buffer );
         fclose( fres );
     }
-    free(buffer);
-    free(line);
+    sfree( buffer, "Write_Restart::buffer" );
+    sfree( line, "Write_Restart::line" );
 }
 
 
@@ -217,7 +228,9 @@ void Count_Binary_Restart_Atoms( FILE *fres, reax_system *system )
         /* if the point is inside my_box, add it to my lists */
         Fit_to_Periodic_Box( &(system->big_box), &(res_atom.x) );
         if ( is_Inside_Box(&(system->my_box), res_atom.x) )
+        {
             ++system->n;
+        }
     }
     system->N = system->n;
 
@@ -231,8 +244,8 @@ void Count_Binary_Restart_Atoms( FILE *fres, reax_system *system )
 
 
 void Read_Binary_Restart( char *res_file, reax_system *system,
-                          control_params *control, simulation_data *data,
-                          storage *workspace, mpi_datatypes *mpi_data )
+        control_params *control, simulation_data *data,
+        storage *workspace, mpi_datatypes *mpi_data )
 {
     int i, top;
     FILE *fres;
@@ -242,7 +255,7 @@ void Read_Binary_Restart( char *res_file, reax_system *system,
 
     if ( (fres = fopen(res_file, "rb")) == NULL )
     {
-        fprintf( stderr, "ERROR: cannot open the restart file! terminating...\n" );
+        fprintf( stderr, "[ERROR] cannot open the restart file! terminating...\n" );
         MPI_Abort( MPI_COMM_WORLD, FILE_NOT_FOUND );
     }
 
@@ -330,7 +343,9 @@ void Count_Restart_Atoms( FILE *fres, reax_system *system )
         Fit_to_Periodic_Box( &(system->big_box), &x_temp );
         /* if the point is inside my_box, add it to my lists */
         if ( is_Inside_Box(&(system->my_box), x_temp) )
+        {
             ++system->n;
+        }
     }
     system->N = system->n;
 
@@ -344,8 +359,8 @@ void Count_Restart_Atoms( FILE *fres, reax_system *system )
 
 
 void Read_Restart( char *res_file, reax_system *system,
-                   control_params *control, simulation_data *data,
-                   storage *workspace, mpi_datatypes *mpi_data )
+        control_params *control, simulation_data *data,
+        storage *workspace, mpi_datatypes *mpi_data )
 {
     int i, c, top;
     FILE *fres;
@@ -357,19 +372,25 @@ void Read_Restart( char *res_file, reax_system *system,
 
     if ( (fres = fopen(res_file, "r")) == NULL )
     {
-        fprintf( stderr, "ERROR: cannot open the restart file! terminating...\n" );
+        fprintf( stderr, "[ERROR] cannot open the restart file! terminating...\n" );
         MPI_Abort( MPI_COMM_WORLD, FILE_NOT_FOUND );
     }
 
-    s = (char*) malloc(sizeof(char) * MAX_LINE);
-    tmp = (char**) malloc(sizeof(char*)*MAX_TOKENS);
+    s = (char*) smalloc( sizeof(char) * MAX_LINE, "Read_Restart::s" );
+    tmp = (char**) smalloc( sizeof(char*) * MAX_TOKENS, "Read_Restart::tmp" );
     for (i = 0; i < MAX_TOKENS; i++)
-        tmp[i] = (char*) malloc(sizeof(char) * MAX_LINE);
-
+    {
+        tmp[i] = (char*) smalloc( sizeof(char) * MAX_LINE, "Read_Restart::tmp[i]" );
+    }
 
     //read first header lines
     fgets( s, MAX_LINE, fres );
     c = Tokenize( s, &tmp );
+    if ( c != 7 )
+    {
+        fprintf( stderr, "[ERROR] invalid format in restart file! terminating...\n" );
+        MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+    }
     data->prev_steps = atoi(tmp[0]);
     system->bigN = atoi(tmp[1]);
     data->therm.T = atof(tmp[2]);
@@ -381,16 +402,31 @@ void Read_Restart( char *res_file, reax_system *system,
     //read box lines
     fgets( s, MAX_LINE, fres );
     c = Tokenize( s, &tmp );
+    if ( c != 3 )
+    {
+        fprintf( stderr, "[ERROR] invalid format in restart file! terminating...\n" );
+        MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+    }
     box[0][0] = atof(tmp[0]);
     box[0][1] = atof(tmp[1]);
     box[0][2] = atof(tmp[2]);
     fgets( s, MAX_LINE, fres );
     c = Tokenize( s, &tmp );
+    if ( c != 3 )
+    {
+        fprintf( stderr, "[ERROR] invalid format in restart file! terminating...\n" );
+        MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+    }
     box[1][0] = atof(tmp[0]);
     box[1][1] = atof(tmp[1]);
     box[1][2] = atof(tmp[2]);
     fgets( s, MAX_LINE, fres );
     c = Tokenize( s, &tmp );
+    if ( c != 3 )
+    {
+        fprintf( stderr, "[ERROR] invalid format in restart file! terminating...\n" );
+        MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+    }
     box[2][0] = atof(tmp[0]);
     box[2][1] = atof(tmp[1]);
     box[2][2] = atof(tmp[2]);
@@ -421,7 +457,9 @@ void Read_Restart( char *res_file, reax_system *system,
     /* go back to the start of file to read actual atom info */
     rewind( fres );
     for (i = 0; i < 4; i++)
+    {
         fgets( s, MAX_LINE, fres );
+    }
 
     /*process atoms*/
     top = 0;
@@ -429,6 +467,11 @@ void Read_Restart( char *res_file, reax_system *system,
     {
         fgets( s, MAX_LINE, fres );
         c = Tokenize( s, &tmp );
+        if ( c != 9 )
+        {
+            fprintf( stderr, "[ERROR] invalid format in restart file! terminating...\n" );
+            MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+        }
         orig_id_temp = atoi(tmp[0]);
         type_temp = atoi(tmp[1]);
         strncpy(name_temp, tmp[2], 8);
@@ -452,11 +495,14 @@ void Read_Restart( char *res_file, reax_system *system,
         }
     }
     fclose( fres );
+
     /* free memory allocations at the top */
     for ( i = 0; i < MAX_TOKENS; i++ )
-        free( tmp[i] );
-    free( tmp );
-    free( s );
+    {
+        sfree( tmp[i], "Read_Restart::tmp[i]" );
+    }
+    sfree( tmp, "Read_Restart::tmp" );
+    sfree( s, "Read_Restart::s" );
 
     data->step = data->prev_steps;
     // nsteps is updated based on the number of steps in the previous run
diff --git a/PG-PuReMD/src/restart.h b/PG-PuReMD/src/restart.h
index 39a5dcd5a149208a58f5085507bb198895d7e47d..3d13a5a17256457829224a20f32c4d9f7305a06b 100644
--- a/PG-PuReMD/src/restart.h
+++ b/PG-PuReMD/src/restart.h
@@ -24,6 +24,7 @@
 
 #include "reax_types.h"
 
+
 #define RESTART_HEADER "%8d%12d%8.3f%8.3f%8.3f%8.3f%8.3f\n%15.5f%15.5f%15.5f\n%15.5f%15.5f%15.5f\n%15.5f%15.5f%15.5f\n"
 #define RESTART_HEADER_LINE_LEN 200
 /* step, system->bigN, data->therm.T, data->therm.xi,
@@ -39,16 +40,26 @@
 #define READ_RESTART_HEADER " %d %d %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf"
 #define READ_RESTART_LINE " %d %d %s %lf %lf %lf %lf %lf %lf"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Write_Binary_Restart( reax_system*, control_params*,
-                           simulation_data*, output_controls*, mpi_datatypes* );
+        simulation_data*, output_controls*, mpi_datatypes* );
 
 void Write_Restart( reax_system*, control_params*,
-                    simulation_data*, output_controls*, mpi_datatypes* );
+        simulation_data*, output_controls*, mpi_datatypes* );
 
 void Read_Binary_Restart( char*, reax_system*, control_params*,
-                          simulation_data*, storage*, mpi_datatypes* );
+        simulation_data*, storage*, mpi_datatypes* );
 
 void Read_Restart( char*, reax_system*, control_params*,
-                   simulation_data*, storage*, mpi_datatypes* );
+        simulation_data*, storage*, mpi_datatypes* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/system_props.c b/PG-PuReMD/src/system_props.c
index 4f36288e6b7090d68d50608ba45998a252891ee7..c983afc1cad8fbc324fb7977a921c96dbfbe23f2 100644
--- a/PG-PuReMD/src/system_props.c
+++ b/PG-PuReMD/src/system_props.c
@@ -20,18 +20,19 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#ifdef HAVE_CUDA
-#include "dev_system_props.h"
-#endif
 
 #if defined(PURE_REAX)
-#include "system_props.h"
-#include "tool_box.h"
-#include "vector.h"
+  #include "system_props.h"
+  #include "tool_box.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_system_props.h"
-#include "reax_tool_box.h"
-#include "reax_vector.h"
+  #include "reax_system_props.h"
+  #include "reax_tool_box.h"
+  #include "reax_vector.h"
+#endif
+
+#ifdef HAVE_CUDA
+  #include "cuda/cuda_system_props.h"
 #endif
 
 
@@ -39,28 +40,36 @@ void Temperature_Control( control_params *control, simulation_data *data )
 {
     real tmp;
 
-    if ( control->T_mode == 1 ) // step-wise temperature control
+    /* step-wise temperature control */
+    if ( control->T_mode == 1 )
     {
         if ((data->step - data->prev_steps) % ((int)(control->T_freq / control->dt)) == 0)
         {
-            if ( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) )
+            if ( FABS( control->T - control->T_final ) >= FABS( control->T_rate ) )
+            {
                 control->T += control->T_rate;
-            else control->T = control->T_final;
+            }
+            else
+            {
+                control->T = control->T_final;
+            }
         }
     }
-    else if ( control->T_mode == 2 )  // constant slope control
+    /* constant slope control */
+    else if ( control->T_mode == 2 )
     {
         tmp = control->T_rate * control->dt / control->T_freq;
 
-        if ( fabs( control->T - control->T_final ) >= fabs( tmp ) )
+        if ( FABS( control->T - control->T_final ) >= FABS( tmp ) )
+        {
             control->T += tmp;
+        }
     }
 }
 
 
-
 void Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
-                             MPI_Comm comm )
+        MPI_Comm comm )
 {
     int i;
     rvec p;
@@ -77,51 +86,28 @@ void Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
     }
 
     MPI_Allreduce( &data->my_en.e_kin,  &data->sys_en.e_kin,
-                   1, MPI_DOUBLE, MPI_SUM, comm );
-
-    data->therm.T = (2. * data->sys_en.e_kin) / (data->N_f * K_B);
-
-    // avoid T being an absolute zero, might cause F.P.E!
-    if ( fabs(data->therm.T) < ALMOST_ZERO )
-        data->therm.T = ALMOST_ZERO;
-}
-
-
-#ifdef HAVE_CUDA
-void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
-                                  MPI_Comm comm )
-{
-    int i;
-    rvec p;
-    real m;
-
-    data->my_en.e_kin = 0.0;
-
-    dev_compute_kinetic_energy (system, data, &data->my_en.e_kin);
-
-    MPI_Allreduce( &data->my_en.e_kin,  &data->sys_en.e_kin,
-                   1, MPI_DOUBLE, MPI_SUM, comm );
+            1, MPI_DOUBLE, MPI_SUM, comm );
 
     data->therm.T = (2. * data->sys_en.e_kin) / (data->N_f * K_B);
 
     // avoid T being an absolute zero, might cause F.P.E!
-    if ( fabs(data->therm.T) < ALMOST_ZERO )
+    if ( FABS(data->therm.T) < ALMOST_ZERO )
+    {
         data->therm.T = ALMOST_ZERO;
+    }
 }
-#endif
 
 
 void Compute_System_Energy( reax_system *system, simulation_data *data,
-                            MPI_Comm comm )
+        MPI_Comm comm )
 {
     real my_en[15], sys_en[15];
 
     //TODO remove this is an UGLY fix
-    my_en [13] = data->my_en.e_kin;
+    my_en[13] = data->my_en.e_kin;
 
 #ifdef HAVE_CUDA
-    //Cuda Wrapper here
-    dev_sync_simulation_data ( data );
+    dev_sync_simulation_data( data );
 #endif
 
     my_en[0] = data->my_en.e_bond;
@@ -141,11 +127,11 @@ void Compute_System_Energy( reax_system *system, simulation_data *data,
     MPI_Reduce( my_en, sys_en, 14, MPI_DOUBLE, MPI_SUM, MASTER_NODE, comm );
 
     data->my_en.e_pot = data->my_en.e_bond +
-                        data->my_en.e_ov + data->my_en.e_un  + data->my_en.e_lp +
-                        data->my_en.e_ang + data->my_en.e_pen + data->my_en.e_coa +
-                        data->my_en.e_hb +
-                        data->my_en.e_tor + data->my_en.e_con +
-                        data->my_en.e_vdW + data->my_en.e_ele + data->my_en.e_pol;
+        data->my_en.e_ov + data->my_en.e_un  + data->my_en.e_lp +
+        data->my_en.e_ang + data->my_en.e_pen + data->my_en.e_coa +
+        data->my_en.e_hb +
+        data->my_en.e_tor + data->my_en.e_con +
+        data->my_en.e_vdW + data->my_en.e_ele + data->my_en.e_pol;
 
     data->my_en.e_tot = data->my_en.e_pot + E_CONV * data->my_en.e_kin;
 
@@ -167,11 +153,11 @@ void Compute_System_Energy( reax_system *system, simulation_data *data,
         data->sys_en.e_kin = sys_en[13];
 
         data->sys_en.e_pot = data->sys_en.e_bond +
-                             data->sys_en.e_ov + data->sys_en.e_un  + data->sys_en.e_lp +
-                             data->sys_en.e_ang + data->sys_en.e_pen + data->sys_en.e_coa +
-                             data->sys_en.e_hb +
-                             data->sys_en.e_tor + data->sys_en.e_con +
-                             data->sys_en.e_vdW + data->sys_en.e_ele + data->sys_en.e_pol;
+            data->sys_en.e_ov + data->sys_en.e_un  + data->sys_en.e_lp +
+            data->sys_en.e_ang + data->sys_en.e_pen + data->sys_en.e_coa +
+            data->sys_en.e_hb +
+            data->sys_en.e_tor + data->sys_en.e_con +
+            data->sys_en.e_vdW + data->sys_en.e_ele + data->sys_en.e_pol;
 
         data->sys_en.e_tot = data->sys_en.e_pot + E_CONV * data->sys_en.e_kin;
     }
@@ -179,14 +165,16 @@ void Compute_System_Energy( reax_system *system, simulation_data *data,
 
 
 void Compute_Total_Mass( reax_system *system, simulation_data *data,
-                         MPI_Comm comm  )
+        MPI_Comm comm  )
 {
-    int  i;
+    int i;
     real tmp;
 
     tmp = 0;
     for ( i = 0; i < system->n; i++ )
+    {
         tmp += system->reax_param.sbp[ system->my_atoms[i].type ].mass;
+    }
 
     MPI_Allreduce( &tmp, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm );
 
@@ -194,25 +182,8 @@ void Compute_Total_Mass( reax_system *system, simulation_data *data,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data,
-                              MPI_Comm comm  )
-{
-    int  i;
-    real tmp;
-
-    //compute local total mass of the system
-    dev_compute_total_mass (system, &tmp);
-
-    MPI_Allreduce( &tmp, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm );
-
-    data->inv_M = 1. / data->M;
-}
-#endif
-
-
 void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
-                             mpi_datatypes *mpi_data, MPI_Comm comm )
+        mpi_datatypes *mpi_data, MPI_Comm comm )
 {
     int i;
     real m, det; //xx, xy, xz, yy, yz, zz;
@@ -250,7 +221,9 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
 
     /* Calculate and then invert the inertial tensor */
     for ( i = 0; i < 6; ++i )
+    {
         tmp_mat[i] = 0;
+    }
     //my_xx = my_xy = my_xz = my_yy = my_yz = my_zz = 0;
 
     for ( i = 0; i < system->n; ++i )
@@ -296,106 +269,13 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
         inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
 
         if ( det > ALMOST_ZERO )
+        {
             rtensor_Scale( inv, 1. / det, inv );
-        else rtensor_MakeZero( inv );
-
-        /* Compute the angular velocity about the centre of mass */
-        rtensor_MatVec( data->avcm, inv, data->amcm );
-    }
-
-    MPI_Bcast( data->avcm, 3, MPI_DOUBLE, MASTER_NODE, comm );
-
-    /* Compute the rotational energy */
-    data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
-
-#if defined(DEBUG)
-    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",
-             data->xcm[0], data->xcm[1], data->xcm[2] );
-    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n",
-             data->vcm[0], data->vcm[1], data->vcm[2] );
-    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n",
-             data->amcm[0], data->amcm[1], data->amcm[2] );
-    /* fprintf( stderr, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
-       mat[0][0], mat[0][1], mat[0][2],
-       mat[1][0], mat[1][1], mat[1][2],
-       mat[2][0], mat[2][1], mat[2][2] );
-       fprintf( stderr, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
-       inv[0][0], inv[0][1], inv[0][2],
-       inv[1][0], inv[1][1], inv[1][2],
-       inv[2][0], inv[2][1], inv[2][2] ); */
-    fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n",
-             data->avcm[0], data->avcm[1], data->avcm[2] );
-#endif
-}
-
-
-#ifdef HAVE_CUDA
-void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
-                                  mpi_datatypes *mpi_data, MPI_Comm comm )
-{
-    int i;
-    real m, det; //xx, xy, xz, yy, yz, zz;
-    real tmp_mat[6], tot_mat[6];
-    rvec my_xcm, my_vcm, my_amcm, my_avcm;
-    rvec tvec, diff;
-    rtensor mat, inv;
-
-    rvec_MakeZero( my_xcm );  // position of CoM
-    rvec_MakeZero( my_vcm );  // velocity of CoM
-    rvec_MakeZero( my_amcm ); // angular momentum of CoM
-    rvec_MakeZero( my_avcm ); // angular velocity of CoM
-
-    /* Compute the position, vel. and ang. momentum about the centre of mass */
-    dev_compute_momentum ( system, my_xcm, my_vcm, my_amcm );
-
-    MPI_Allreduce( my_xcm, data->xcm, 3, MPI_DOUBLE, MPI_SUM, comm );
-    MPI_Allreduce( my_vcm, data->vcm, 3, MPI_DOUBLE, MPI_SUM, comm );
-    MPI_Allreduce( my_amcm, data->amcm, 3, MPI_DOUBLE, MPI_SUM, comm );
-
-    rvec_Scale( data->xcm, data->inv_M, data->xcm );
-    rvec_Scale( data->vcm, data->inv_M, data->vcm );
-    rvec_Cross( tvec, data->xcm, data->vcm );
-    rvec_ScaledAdd( data->amcm, -data->M, tvec );
-    data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
-
-    /* Calculate and then invert the inertial tensor */
-    for ( i = 0; i < 6; ++i )
-        tmp_mat[i] = 0;
-
-    dev_compute_inertial_tensor( system, tmp_mat, my_xcm );
-
-    MPI_Reduce( tmp_mat, tot_mat, 6, MPI_DOUBLE, MPI_SUM, MASTER_NODE, comm );
-
-    if ( system->my_rank == MASTER_NODE )
-    {
-        mat[0][0] = tot_mat[3] + tot_mat[5];  // yy + zz;
-        mat[0][1] = mat[1][0] = -tot_mat[1];  // -xy;
-        mat[0][2] = mat[2][0] = -tot_mat[2];  // -xz;
-        mat[1][1] = tot_mat[0] + tot_mat[5];  // xx + zz;
-        mat[2][1] = mat[1][2] = -tot_mat[4];  // -yz;
-        mat[2][2] = tot_mat[0] + tot_mat[3];  // xx + yy;
-
-        /* invert the inertial tensor */
-        det = ( mat[0][0] * mat[1][1] * mat[2][2] +
-                mat[0][1] * mat[1][2] * mat[2][0] +
-                mat[0][2] * mat[1][0] * mat[2][1] ) -
-              ( mat[0][0] * mat[1][2] * mat[2][1] +
-                mat[0][1] * mat[1][0] * mat[2][2] +
-                mat[0][2] * mat[1][1] * mat[2][0] );
-
-        inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
-        inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
-        inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
-        inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
-        inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
-        inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
-        inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
-        inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
-        inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
-
-        if ( det > ALMOST_ZERO )
-            rtensor_Scale( inv, 1. / det, inv );
-        else rtensor_MakeZero( inv );
+        }
+        else
+        {
+            rtensor_MakeZero( inv );
+        }
 
         /* Compute the angular velocity about the centre of mass */
         rtensor_MatVec( data->avcm, inv, data->amcm );
@@ -425,7 +305,6 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
              data->avcm[0], data->avcm[1], data->avcm[2] );
 #endif
 }
-#endif
 
 
 /* IMPORTANT: This function assumes that current kinetic energy
@@ -436,8 +315,8 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
  *  corrections to short-range interactions present.
  *  We may want to add that for more accuracy.
  */
-void Compute_Pressure(reax_system* system, control_params *control,
-                      simulation_data* data, mpi_datatypes *mpi_data)
+void Compute_Pressure( reax_system* system, control_params *control,
+        simulation_data* data, mpi_datatypes *mpi_data )
 {
     int i;
     reax_atom *p_atom;
@@ -472,16 +351,18 @@ void Compute_Pressure(reax_system* system, control_params *control,
         }
     }
 
-    /* sum up internal and external pressure */
 #if defined(DEBUG)
     fprintf(stderr, "p%d:p_int(%10.5f %10.5f %10.5f)p_ext(%10.5f %10.5f %10.5f)\n",
             system->my_rank, int_press[0], int_press[1], int_press[2],
             data->my_ext_press[0], data->my_ext_press[1], data->my_ext_press[2] );
 #endif
+
+    /* sum up internal and external pressure */
     MPI_Allreduce( int_press, data->int_press,
-                   3, MPI_DOUBLE, MPI_SUM, mpi_data->comm_mesh3D );
+            3, MPI_DOUBLE, MPI_SUM, mpi_data->comm_mesh3D );
     MPI_Allreduce( data->my_ext_press, data->ext_press,
-                   3, MPI_DOUBLE, MPI_SUM, mpi_data->comm_mesh3D );
+            3, MPI_DOUBLE, MPI_SUM, mpi_data->comm_mesh3D );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: %10.5f %10.5f %10.5f\n",
              system->my_rank,
@@ -492,20 +373,21 @@ void Compute_Pressure(reax_system* system, control_params *control,
 #endif
 
     /* kinetic contribution */
-    data->kin_press = 2.*(E_CONV * data->sys_en.e_kin) / (3.*big_box->V * P_CONV);
+    data->kin_press = 2. * (E_CONV * data->sys_en.e_kin)
+        / (3. * big_box->V * P_CONV);
 
     /* Calculate total pressure in each direction */
     data->tot_press[0] = data->kin_press -
-                         (( data->int_press[0] + data->ext_press[0] ) /
-                          ( big_box->box_norms[1] * big_box->box_norms[2] * P_CONV ));
+        (( data->int_press[0] + data->ext_press[0] ) /
+         ( big_box->box_norms[1] * big_box->box_norms[2] * P_CONV ));
 
     data->tot_press[1] = data->kin_press -
-                         (( data->int_press[1] + data->ext_press[1] ) /
-                          ( big_box->box_norms[0] * big_box->box_norms[2] * P_CONV ));
+        (( data->int_press[1] + data->ext_press[1] ) /
+         ( big_box->box_norms[0] * big_box->box_norms[2] * P_CONV ));
 
     data->tot_press[2] = data->kin_press -
-                         (( data->int_press[2] + data->ext_press[2] ) /
-                          ( big_box->box_norms[0] * big_box->box_norms[1] * P_CONV ));
+        (( data->int_press[2] + data->ext_press[2] ) /
+         ( big_box->box_norms[0] * big_box->box_norms[1] * P_CONV ));
 
     /* Average pressure for the whole box */
     data->iso_bar.P =
@@ -513,7 +395,6 @@ void Compute_Pressure(reax_system* system, control_params *control,
 }
 
 
-
 /*
 void Compute_Pressure_Isotropic_Klein( reax_system* system,
                        simulation_data* data )
diff --git a/PG-PuReMD/src/system_props.h b/PG-PuReMD/src/system_props.h
index 5efff3c561019e4dbcad2b1de338f6d79f5d85b1..f04a9590ca20da6a020c7ad9ed0ad8023f8c3024 100644
--- a/PG-PuReMD/src/system_props.h
+++ b/PG-PuReMD/src/system_props.h
@@ -24,6 +24,11 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Temperature_Control( control_params*, simulation_data* );
 
 void Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm );
@@ -33,16 +38,16 @@ void Compute_System_Energy( reax_system*, simulation_data*, MPI_Comm );
 void Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm );
 
 void Compute_Center_of_Mass( reax_system*, simulation_data*,
-                             mpi_datatypes*, MPI_Comm );
+        mpi_datatypes*, MPI_Comm );
 
 void Compute_Pressure( reax_system*, control_params*,
-                       simulation_data*, mpi_datatypes* );
+        simulation_data*, mpi_datatypes* );
+
 //void Compute_Pressure( reax_system*, simulation_data* );
 
-//CUDA Functions
-void Cuda_Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm );
-void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm );
-void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*,
-                                  mpi_datatypes*, MPI_Comm );
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/tool_box.c b/PG-PuReMD/src/tool_box.c
index 09e44ea319f6d207842b49ba810a5211da2888f1..58772d773252982721b9ba1385ca932123632f02 100644
--- a/PG-PuReMD/src/tool_box.c
+++ b/PG-PuReMD/src/tool_box.c
@@ -20,23 +20,24 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
+
 #if defined(PURE_REAX)
-#include "tool_box.h"
+  #include "tool_box.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_tool_box.h"
+  #include "reax_tool_box.h"
 #endif
 
 
 /************** taken from comm_tools.c **************/
 int SumScan( int n, int me, int root, MPI_Comm comm )
 {
-    int  i, my_order, wsize;;
+    int i, my_order, wsize;
     int *nbuf = NULL;
 
     if ( me == root )
     {
         MPI_Comm_size( comm, &wsize );
-        nbuf = (int *) calloc( wsize, sizeof(int) );
+        nbuf = (int *) scalloc( wsize, sizeof(int), "SumScan:nbuf" );
 
         MPI_Gather( &n, 1, MPI_INT, nbuf, 1, MPI_INT, root, comm );
 
@@ -47,7 +48,7 @@ int SumScan( int n, int me, int root, MPI_Comm comm )
 
         MPI_Scatter( nbuf, 1, MPI_INT, &my_order, 1, MPI_INT, root, comm );
 
-        free( nbuf );
+        sfree( nbuf, "SumScan:nbuf" );
     }
     else
     {
@@ -61,59 +62,19 @@ int SumScan( int n, int me, int root, MPI_Comm comm )
 
 void SumScanB( int n, int me, int wsize, int root, MPI_Comm comm, int *nbuf )
 {
-    int  i;
+    int i;
 
     MPI_Gather( &n, 1, MPI_INT, nbuf, 1, MPI_INT, root, comm );
 
     if ( me == root )
     {
         for ( i = 0; i < wsize - 1; ++i )
-            nbuf[i + 1] += nbuf[i];
-    }
-
-    MPI_Bcast( nbuf, wsize, MPI_INT, root, comm );
-}
-
-
-/************** taken from box.c **************/
-void Transform( rvec x1, simulation_box *box, char flag, rvec x2 )
-{
-    int i, j;
-    real tmp;
-
-    //  printf(">x1: (%lf, %lf, %lf)\n",x1[0],x1[1],x1[2]);
-
-    if (flag > 0)
-    {
-        for (i = 0; i < 3; i++)
-        {
-            tmp = 0.0;
-            for (j = 0; j < 3; j++)
-                tmp += box->trans[i][j] * x1[j];
-            x2[i] = tmp;
-        }
-    }
-    else
-    {
-        for (i = 0; i < 3; i++)
         {
-            tmp = 0.0;
-            for (j = 0; j < 3; j++)
-                tmp += box->trans_inv[i][j] * x1[j];
-            x2[i] = tmp;
+            nbuf[i + 1] += nbuf[i];
         }
     }
-    //  printf(">x2: (%lf, %lf, %lf)\n", x2[0], x2[1], x2[2]);
-}
 
-
-void Transform_to_UnitBox( rvec x1, simulation_box *box, char flag, rvec x2 )
-{
-    Transform( x1, box, flag, x2 );
-
-    x2[0] /= box->box_norms[0];
-    x2[1] /= box->box_norms[1];
-    x2[2] /= box->box_norms[2];
+    MPI_Bcast( nbuf, wsize, MPI_INT, root, comm );
 }
 
 
@@ -128,139 +89,21 @@ void Fit_to_Periodic_Box( simulation_box *box, rvec *p )
         {
             /* handle lower coords */
             while ( (*p)[i] < box->min[i] )
+            {
                 (*p)[i] += box->box_norms[i];
+            }
         }
         else if ( (*p)[i] >= box->max[i] )
         {
             /* handle higher coords */
             while ( (*p)[i] >= box->max[i] )
+            {
                 (*p)[i] -= box->box_norms[i];
+            }
         }
     }
 }
 
-#if defined(SUDHIR)
-/* determine the touch point, tp, of a box to
-   its neighbor denoted by the relative coordinate rl */
-inline void Box_Touch_Point( simulation_box *box, ivec rl, rvec tp )
-{
-    int d;
-
-    for ( d = 0; d < 3; ++d )
-        if ( rl[d] == -1 )
-            tp[d] = box->min[d];
-        else if ( rl[d] == 0 )
-            tp[d] = NEG_INF - 1.;
-        else
-            tp[d] = box->max[d];
-}
-
-
-/* determine whether point p is inside the box */
-/* assumes orthogonal box */
-inline int is_Inside_Box( simulation_box *box, rvec p )
-{
-    if ( p[0] < box->min[0] || p[0] >= box->max[0] ||
-            p[1] < box->min[1] || p[1] >= box->max[1] ||
-            p[2] < box->min[2] || p[2] >= box->max[2] )
-        return 0;
-
-    return 1;
-}
-
-
-inline int iown_midpoint( simulation_box *box, rvec p1, rvec p2 )
-{
-    rvec midp;
-
-    midp[0] = (p1[0] + p2[0]) / 2;
-    midp[1] = (p1[1] + p2[1]) / 2;
-    midp[2] = (p1[2] + p2[2]) / 2;
-
-    if ( midp[0] < box->min[0] || midp[0] >= box->max[0] ||
-            midp[1] < box->min[1] || midp[1] >= box->max[1] ||
-            midp[2] < box->min[2] || midp[2] >= box->max[2] )
-        return 0;
-
-    return 1;
-}
-
-
-
-/**************** from grid.c ****************/
-/* finds the closest point of grid cell cj to ci.
-   no need to consider periodic boundary conditions as in the serial case
-   because the box of a process is not periodic in itself */
-inline void GridCell_Closest_Point( grid_cell *gci, grid_cell *gcj,
-                                    ivec ci, ivec cj, rvec cp )
-{
-    int  d;
-
-    for ( d = 0; d < 3; d++ )
-        if ( cj[d] > ci[d] )
-            cp[d] = gcj->min[d];
-        else if ( cj[d] == ci[d] )
-            cp[d] = NEG_INF - 1.;
-        else
-            cp[d] = gcj->max[d];
-}
-
-
-
-inline void GridCell_to_Box_Points( grid_cell *gc, ivec rl, rvec cp, rvec fp )
-{
-    int d;
-
-    for ( d = 0; d < 3; ++d )
-        if ( rl[d] == -1 )
-        {
-            cp[d] = gc->min[d];
-            fp[d] = gc->max[d];
-        }
-        else if ( rl[d] == 0 )
-        {
-            cp[d] = fp[d] = NEG_INF - 1.;
-        }
-        else
-        {
-            cp[d] = gc->max[d];
-            fp[d] = gc->min[d];
-        }
-}
-
-
-inline real DistSqr_between_Special_Points( rvec sp1, rvec sp2 )
-{
-    int  i;
-    real d_sqr = 0;
-
-    for ( i = 0; i < 3; ++i )
-        if ( sp1[i] > NEG_INF && sp2[i] > NEG_INF )
-            d_sqr += SQR( sp1[i] - sp2[i] );
-
-    return d_sqr;
-}
-
-
-inline real DistSqr_to_Special_Point( rvec cp, rvec x )
-{
-    int  i;
-    real d_sqr = 0;
-
-    for ( i = 0; i < 3; ++i )
-        if ( cp[i] > NEG_INF )
-            d_sqr += SQR( cp[i] - x[i] );
-
-    return d_sqr;
-}
-
-
-inline int Relative_Coord_Encoding( ivec c )
-{
-    return 9 * (c[0] + 1) + 3 * (c[1] + 1) + (c[2] + 1);
-}
-#endif
-
 
 /************** from geo_tools.c *****************/
 void Make_Point( real x, real y, real z, rvec* p )
@@ -271,22 +114,20 @@ void Make_Point( real x, real y, real z, rvec* p )
 }
 
 
-
 int is_Valid_Serial( storage *workspace, int serial )
 {
-    // if( workspace->map_serials[ serial ] < 0 )
-    // {
-    // fprintf( stderr, "CONECT line includes invalid pdb serial number %d.\n",
-    // serial );
-    // fprintf( stderr, "Please correct the input file.Terminating...\n" );
-    //  MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
-    // }
+//    if( workspace->map_serials[ serial ] < 0 )
+//    {
+//        fprintf( stderr, "CONECT line includes invalid pdb serial number %d.\n",
+//                serial );
+//        fprintf( stderr, "Please correct the input file.Terminating...\n" );
+//        MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+//    }
 
     return SUCCESS;
 }
 
 
-
 int Check_Input_Range( int val, int lo, int hi, char *message )
 {
     if ( val < lo || val > hi )
@@ -304,36 +145,49 @@ void Trim_Spaces( char *element )
 {
     int i, j;
 
-    for ( i = 0; element[i] == ' '; ++i ); // skip initial space chars
+    // skip leading space chars; i stops at the first non-space char (possibly the NUL)
+    for ( i = 0; element[i] == ' '; ++i );
 
     for ( j = i; j < (int)(strlen(element)) && element[j] != ' '; ++j )
-        element[j - i] = toupper( element[j] ); // make uppercase, offset to 0
-    element[j - i] = 0; // finalize the string
+    {
+        // make uppercase, offset to 0; cast because toupper() is undefined for negative char values
+        element[j - i] = toupper( (unsigned char) element[j] );
+    }
+    // finalize (NUL-terminate) the trimmed string
+    element[j - i] = 0;
 }
 
 
 /************ from system_props.c *************/
-struct timeval tim;
-real t_end;
-
 real Get_Time( )
 {
-    gettimeofday(&tim, NULL );
+    struct timeval tim;
+
+    gettimeofday( &tim, NULL );
+
     return ( tim.tv_sec + (tim.tv_usec / 1000000.0) );
 }
 
 
 real Get_Timing_Info( real t_start )
 {
+    struct timeval tim;
+    real t_end;
+
     gettimeofday(&tim, NULL );
     t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
+
     return (t_end - t_start);
 }
 
 
 void Update_Timing_Info( real *t_start, real *timing )
 {
-    gettimeofday(&tim, NULL );
+    struct timeval tim;
+    real t_end;
+
+    gettimeofday( &tim, NULL );
+
     t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
     *timing += (t_end - *t_start);
     *t_start = t_end;
@@ -346,8 +200,12 @@ int Get_Atom_Type( reax_interaction *reax_param, char *s )
     int i;
 
     for ( i = 0; i < reax_param->num_atom_types; ++i )
+    {
         if ( !strcmp( reax_param->sbp[i].name, s ) )
+        {
             return i;
+        }
+    }
 
     fprintf( stderr, "Unknown atom type %s. Terminating...\n", s );
     MPI_Abort( MPI_COMM_WORLD, UNKNOWN_ATOM_TYPE );
@@ -356,44 +214,36 @@ int Get_Atom_Type( reax_interaction *reax_param, char *s )
 }
 
 
-
 char *Get_Element( reax_system *system, int i )
 {
     return &( system->reax_param.sbp[system->my_atoms[i].type].name[0] );
 }
 
 
-
 char *Get_Atom_Name( reax_system *system, int i )
 {
     return &(system->my_atoms[i].name[0]);
 }
 
 
-
-int Allocate_Tokenizer_Space( char **line, char **backup, char ***tokens )
+void Allocate_Tokenizer_Space( char **line, char **backup, char ***tokens )
 {
     int i;
 
-    if ( (*line = (char*) malloc( sizeof(char) * MAX_LINE )) == NULL )
-        return FAILURE;
+    *line = (char*) smalloc( sizeof(char) * MAX_LINE, "Tokenizer:line" );
 
-    if ( (*backup = (char*) malloc( sizeof(char) * MAX_LINE )) == NULL )
-        return FAILURE;
+    *backup = (char*) smalloc( sizeof(char) * MAX_LINE, "Tokenizer:backup" );
 
-    if ( (*tokens = (char**) malloc( sizeof(char*) * MAX_TOKENS )) == NULL )
-        return FAILURE;
+    *tokens = (char**) smalloc( sizeof(char*) * MAX_TOKENS, "Tokenizer:tokens" );
 
     for ( i = 0; i < MAX_TOKENS; i++ )
-        if ( ((*tokens)[i] = (char*) malloc(sizeof(char) * MAX_TOKEN_LEN)) == NULL )
-            return FAILURE;
-
-    return SUCCESS;
+    {
+        (*tokens)[i] = (char*) smalloc(sizeof(char) * MAX_TOKEN_LEN, "Tokenizer:tokens[i]" );
+    }
 }
 
 
-
-int Tokenize( char* s, char*** tok )
+int Tokenize( const char* s, char*** tok )
 {
     char test[MAX_LINE];
     char *sep = "\t \n!=";
@@ -413,77 +263,155 @@ int Tokenize( char* s, char*** tok )
 
 
 /***************** taken from lammps ************************/
-/* safe malloc */
-void* smalloc( long n, char *name )
+/* Safe wrapper around libc malloc
+ *
+ * n: num. of bytes to allocate
+ * name: message with details about pointer, used for warnings/errors
+ *
+ * returns: ptr to allocated memory
+ * */
+void* smalloc( size_t n, const char *name )
 {
     void *ptr;
 
-    if ( n <= 0 )
+    if ( n == 0 )
     {
-        fprintf( stderr, "WARNING: trying to allocate %ld bytes for array %s. ",
-                 n, name );
-        fprintf( stderr, "returning NULL.\n" );
-        return NULL;
+        fprintf( stderr, "[ERROR] failed to allocate %zu bytes for array %s.\n",
+                n, name );
+        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
     }
-    //printf("requesting memory for %s \n", name);
-    //malloc( n );
-    //printf("successfuly requested memory for %s \n", name);
+
+#if defined(DEBUG)
+    fprintf( stderr, "[INFO] requesting memory for %s\n", name );
+#endif
+
     ptr = malloc( n );
-    // printf("successfuly assigned pointer for %s \n", name);
+
     if ( ptr == NULL )
     {
-        fprintf( stderr, "ERROR: failed to allocate %ld bytes for array %s",
-                 n, name );
+        fprintf( stderr, "[ERROR] failed to allocate %zu bytes for array %s.\n",
+                n, name );
         MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
     }
 
+#if defined(DEBUG)
+    fprintf( stderr, "[INFO] successfuly assigned pointer for %s\n", name );
+#endif
+
     return ptr;
 }
 
 
-/* safe calloc */
-void *scalloc( int n, int size, char *name )
+/* Safe wrapper around libc realloc
+ *
+ * n: num. of bytes to reallocate
+ * name: message with details about pointer, used for warnings/errors
+ *
+ * returns: ptr to reallocated memory
+ * */
+void* srealloc( void *ptr, size_t n, const char *name )
 {
-    void *ptr;
+    void *new_ptr;
 
-    if ( n <= 0 )
+    if ( n == 0 )
+    {
+        fprintf( stderr, "[ERROR] failed to reallocate %zu bytes for array %s.\n",
+                n, name );
+        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
+    }
+
+    if ( ptr == NULL )
     {
-        fprintf( stderr, "WARNING: trying to allocate %d elements for array %s. ",
-                 n, name );
-        fprintf( stderr, "returning NULL.\n" );
-        return NULL;
+        fprintf( stderr, "[INFO] trying to allocate %zu NEW bytes for array %s.\n",
+                n, name );
     }
 
-    if ( size <= 0 )
+#if defined(DEBUG)
+    fprintf( stderr, "[INFO] requesting memory for %s\n", name );
+#endif
+
+    new_ptr = realloc( ptr, n );
+
+    /* technically, ptr is still allocated and valid in this case,
+     * but we needed more memory, so abort */
+    if ( new_ptr == NULL )
     {
-        fprintf( stderr, "WARNING: elements size for array %s is %d. ",
-                 name, size );
-        fprintf( stderr, "returning NULL.\n" );
-        return NULL;
+        fprintf( stderr, "[ERROR] failed to reallocate %zu bytes for array %s.\n",
+                n, name );
+        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
     }
 
+#if defined(DEBUG)
+    fprintf( stderr, "[INFO] successfuly assigned pointer for %s\n", name );
+#endif
+
+    return new_ptr;
+}
+
+
+/* Safe wrapper around libc calloc; aborts via MPI_Abort on invalid request or allocation failure
+ *
+ * n: num. of elements to allocate (each of size bytes), must be > 0
+ * size: num. of bytes per element, must be > 0 (calloc's result for a zero-sized request is implementation-defined)
+ * name: message with details about pointer, used for warnings/errors
+ *
+ * returns: ptr to allocated memory, all bits initialized to zeros
+ * */
+void* scalloc( size_t n, size_t size, const char *name )
+{
+    void *ptr;
+
+    if ( n == 0 || size == 0 ) /* reject zero-sized requests up front, as the removed code did for size */
+    {
+        fprintf( stderr, "[ERROR] failed to allocate %zu bytes for array %s.\n",
+                n * size, name );
+        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "[INFO] requesting memory for %s\n", name );
+#endif
+
     ptr = calloc( n, size );
+
     if ( ptr == NULL )
     {
-        fprintf( stderr, "ERROR: failed to allocate %d bytes for array %s",
-                 n * size, name );
+        fprintf( stderr, "[ERROR] failed to allocate %zu bytes for array %s.\n",
+                n * size, name );
         MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
     }
 
+#if defined(DEBUG)
+    fprintf( stderr, "[INFO] successfuly assigned pointer for %s\n", name );
+#endif
+
     return ptr;
 }
 
 
-/* safe free */
-void sfree( void *ptr, char *name )
+/* Safe wrapper around libc free
+ *
+ * ptr: pointer to dynamically allocated memory which will be deallocated
+ * name: message with details about pointer, used for warnings/errors
+ * */
+void sfree( void *ptr, const char *name )
 {
     if ( ptr == NULL )
     {
-        fprintf( stderr, "WARNING: trying to free the already NULL pointer %s!\n",
-                 name );
+        fprintf( stderr, "[WARNING] trying to free the already NULL pointer %s!\n",
+                name );
         return;
     }
 
+#if defined(DEBUG)
+    fprintf( stderr, "[INFO] freeing memory for %s\n", name );
+#endif
+
     free( ptr );
+
+#if defined(DEBUG)
+    fprintf( stderr, "[INFO] successfuly freed memory from pointer for %s\n", name );
+#endif
+
     ptr = NULL;
 }
diff --git a/PG-PuReMD/src/tool_box.h b/PG-PuReMD/src/tool_box.h
index 451a682b3a81221b85c3f154dcacec4cb30128b8..717a8a80e490171e8b6aa4249dd432d5ad7e3da7 100644
--- a/PG-PuReMD/src/tool_box.h
+++ b/PG-PuReMD/src/tool_box.h
@@ -24,55 +24,57 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* from comm_tools.h */
 int SumScan( int, int, int, MPI_Comm );
+
 void SumScanB( int, int, int, int, MPI_Comm, int* );
 
-/* from box.h */
-void Transform_to_UnitBox( rvec, simulation_box*, char, rvec );
 void Fit_to_Periodic_Box( simulation_box*, rvec* );
-static inline void Box_Touch_Point( simulation_box*, ivec, rvec );
-static inline int  is_Inside_Box( simulation_box*, rvec );
-static inline int  iown_midpoint( simulation_box*, rvec, rvec );
-
-/* from grid.h */
-static inline void GridCell_Closest_Point( grid_cell*, grid_cell*, ivec, ivec, rvec );
-static inline void GridCell_to_Box_Points( grid_cell*, ivec, rvec, rvec );
-static inline real DistSqr_between_Special_Points( rvec, rvec );
-static inline real DistSqr_to_Special_Point( rvec, rvec );
-static inline int Relative_Coord_Encoding( ivec );
 
 /* from geo_tools.h */
 void Make_Point( real, real, real, rvec* );
+
 int is_Valid_Serial( storage*, int );
+
 int Check_Input_Range( int, int, int, char* );
+
 void Trim_Spaces( char* );
 
 /* from system_props.h */
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 real Get_Time( );
-real Get_Timing_Info( real );
-void Update_Timing_Info( real*, real* );
 
-#ifdef __cplusplus
-}
-#endif
+real Get_Timing_Info( real );
 
+void Update_Timing_Info( real*, real* );
 
 /* from io_tools.h */
-int   Get_Atom_Type( reax_interaction*, char* );
+int Get_Atom_Type( reax_interaction*, char* );
+
 char *Get_Element( reax_system*, int );
+
 char *Get_Atom_Name( reax_system*, int );
-int   Allocate_Tokenizer_Space( char**, char**, char*** );
-int   Tokenize( char*, char*** );
+
+void Allocate_Tokenizer_Space( char**, char**, char*** );
+
+int Tokenize( const char*, char*** );
 
 /* from lammps */
-void *smalloc( long, char* );
-void *scalloc( int, int, char* );
-void sfree( void*, char* );
+void *smalloc( size_t, const char* );
+
+void* srealloc( void *, size_t, const char * );
+
+void *scalloc( size_t, size_t, const char* );
+
+void sfree( void*, const char* );
+
+#ifdef __cplusplus
+}
+#endif
 
 
 #if defined(LAMMPS_REAX) || defined(PURE_REAX)
@@ -83,12 +85,20 @@ static inline void Box_Touch_Point( simulation_box *box, ivec rl, rvec tp )
     int d;
 
     for ( d = 0; d < 3; ++d )
+    {
         if ( rl[d] == -1 )
+        {
             tp[d] = box->min[d];
+        }
         else if ( rl[d] == 0 )
+        {
             tp[d] = NEG_INF - 1.;
+        }
         else
+        {
             tp[d] = box->max[d];
+        }
+    }
 }
 
 
@@ -99,9 +109,11 @@ static inline int is_Inside_Box( simulation_box *box, rvec p )
     if ( p[0] < box->min[0] || p[0] >= box->max[0] ||
             p[1] < box->min[1] || p[1] >= box->max[1] ||
             p[2] < box->min[2] || p[2] >= box->max[2] )
-        return 0;
+    {
+        return FALSE;
+    }
 
-    return 1;
+    return TRUE;
 }
 
 
@@ -116,9 +128,11 @@ static inline int iown_midpoint( simulation_box *box, rvec p1, rvec p2 )
     if ( midp[0] < box->min[0] || midp[0] >= box->max[0] ||
             midp[1] < box->min[1] || midp[1] >= box->max[1] ||
             midp[2] < box->min[2] || midp[2] >= box->max[2] )
-        return 0;
+    {
+        return FALSE;
+    }
 
-    return 1;
+    return TRUE;
 }
 
 
@@ -128,27 +142,42 @@ static inline void GridCell_Closest_Point( grid_cell *gci, grid_cell *gcj,
     int  d;
 
     for ( d = 0; d < 3; d++ )
+    {
         if ( cj[d] > ci[d] )
+        {
             cp[d] = gcj->min[d];
+        }
         else if ( cj[d] == ci[d] )
+        {
             cp[d] = NEG_INF - 1.;
+        }
         else
+        {
             cp[d] = gcj->max[d];
+        }
+    }
 }
 
 
-
 static inline void GridCell_Touch_Point( grid_cell *gc, ivec rl, rvec fp )
 {
     int d;
 
     for ( d = 0; d < 3; ++d )
+    {
         if ( rl[d] == -1 )
+        {
             fp[d] = gc->min[d];
+        }
         else if ( rl[d] == 0 )
+        {
             fp[d] = NEG_INF - 1.;
+        }
         else
+        {
             fp[d] = gc->max[d];
+        }
+    }
 }
 
 
@@ -159,8 +188,12 @@ static inline real DistSqr_to_CP( rvec cp, rvec x )
     real d_sqr = 0;
 
     for ( i = 0; i < 3; ++i )
+    {
         if ( cp[i] > NEG_INF )
+        {
             d_sqr += SQR( cp[i] - x[i] );
+        }
+    }
 
     return d_sqr;
 }
@@ -171,17 +204,68 @@ static inline int Relative_Coord_Encoding( ivec c )
     return 9 * (c[0] + 1) + 3 * (c[1] + 1) + (c[2] + 1);
 }
 
+
 static inline real DistSqr_to_Special_Point( rvec cp, rvec x )
 {
     int  i;
     real d_sqr = 0;
 
     for ( i = 0; i < 3; ++i )
+    {
         if ( cp[i] > NEG_INF )
+        {
             d_sqr += SQR( cp[i] - x[i] );
+        }
+    }
 
     return d_sqr;
 }
+
+
+/************** taken from box.c **************/
+CUDA_HOST_DEVICE static inline void Transform( rvec x1,
+        simulation_box *box, char flag, rvec x2 )
+{
+    int i, j;
+    real tmp;
+
+    if ( flag > 0 )
+    {
+        for ( i = 0; i < 3; i++ )
+        {
+            tmp = 0.0;
+            for ( j = 0; j < 3; j++ )
+            {
+                tmp += box->trans[i][j] * x1[j];
+            }
+            x2[i] = tmp;
+        }
+    }
+    else
+    {
+        for ( i = 0; i < 3; i++ )
+        {
+            tmp = 0.0;
+            for ( j = 0; j < 3; j++ )
+            {
+                tmp += box->trans_inv[i][j] * x1[j];
+            }
+            x2[i] = tmp;
+        }
+    }
+}
+
+
+CUDA_HOST_DEVICE static inline void Transform_to_UnitBox( rvec x1,
+        simulation_box *box, char flag, rvec x2 )
+{
+    Transform( x1, box, flag, x2 );
+
+    x2[0] /= box->box_norms[0];
+    x2[1] /= box->box_norms[1];
+    x2[2] /= box->box_norms[2];
+}
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/torsion_angles.c b/PG-PuReMD/src/torsion_angles.c
index 132e76a92d6ee27b953c039b611f756a87d4b452..29cfb4444a97ea59a91b6f1fac2c5ccc55877a4c 100644
--- a/PG-PuReMD/src/torsion_angles.c
+++ b/PG-PuReMD/src/torsion_angles.c
@@ -20,32 +20,31 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
-#include "torsion_angles.h"
-#include "bond_orders.h"
-#include "list.h"
-#include "tool_box.h"
-#include "vector.h"
+  #include "torsion_angles.h"
+  #include "bond_orders.h"
+  #include "list.h"
+  #include "tool_box.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_torsion_angles.h"
-#include "reax_bond_orders.h"
-#include "reax_list.h"
-#include "reax_tool_box.h"
-#include "reax_vector.h"
+  #include "reax_torsion_angles.h"
+  #include "reax_bond_orders.h"
+  #include "reax_list.h"
+  #include "reax_tool_box.h"
+  #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 #define MIN_SINE 1e-10
 
-real Calculate_Omega( rvec dvec_ij, real r_ij,
-                      rvec dvec_jk, real r_jk,
-                      rvec dvec_kl, real r_kl,
-                      rvec dvec_li, real r_li,
-                      three_body_interaction_data *p_ijk,
-                      three_body_interaction_data *p_jkl,
-                      rvec dcos_omega_di, rvec dcos_omega_dj,
-                      rvec dcos_omega_dk, rvec dcos_omega_dl,
-                      output_controls *out_control )
+
+real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
+        rvec dvec_kl, real r_kl, rvec dvec_li, real r_li,
+        three_body_interaction_data *p_ijk, three_body_interaction_data *p_jkl,
+        rvec dcos_omega_di, rvec dcos_omega_dj, rvec dcos_omega_dk, rvec dcos_omega_dl,
+        output_controls *out_control )
 {
     real unnorm_cos_omega, unnorm_sin_omega, omega;
     real sin_ijk, cos_ijk, sin_jkl, cos_jkl;
@@ -53,10 +52,10 @@ real Calculate_Omega( rvec dvec_ij, real r_ij,
     real arg, poem, tel;
     rvec cross_jk_kl;
 
-    sin_ijk = sin( p_ijk->theta );
-    cos_ijk = cos( p_ijk->theta );
-    sin_jkl = sin( p_jkl->theta );
-    cos_jkl = cos( p_jkl->theta );
+    sin_ijk = SIN( p_ijk->theta );
+    cos_ijk = COS( p_ijk->theta );
+    sin_jkl = SIN( p_jkl->theta );
+    cos_jkl = COS( p_jkl->theta );
 
     /* omega */
     unnorm_cos_omega = -rvec_Dot(dvec_ij, dvec_jk) * rvec_Dot(dvec_jk, dvec_kl) +
@@ -65,8 +64,7 @@ real Calculate_Omega( rvec dvec_ij, real r_ij,
     rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl );
     unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl );
 
-    omega = atan2( unnorm_sin_omega, unnorm_cos_omega );
-
+    omega = ATAN2( unnorm_sin_omega, unnorm_cos_omega );
 
     /* derivatives */
     /* coef for adjusments to cos_theta's */
@@ -83,18 +81,25 @@ real Calculate_Omega( rvec dvec_ij, real r_ij,
     hnhd = r_ij * r_kl * cos_ijk * sin_jkl;
     hnhe = r_ij * r_kl * sin_ijk * cos_jkl;
 
-
     poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl;
-    if ( poem < 1e-20 ) poem = 1e-20;
+    if ( poem < 1e-20 )
+    {
+        poem = 1e-20;
+    }
 
     tel  = SQR( r_ij ) + SQR( r_jk ) + SQR( r_kl ) - SQR( r_li ) -
            2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl +
                    r_jk * r_kl * cos_jkl );
 
     arg  = tel / poem;
-    if ( arg >  1.0 ) arg =  1.0;
-    if ( arg < -1.0 ) arg = -1.0;
-
+    if ( arg >  1.0 )
+    {
+        arg =  1.0;
+    }
+    if ( arg < -1.0 )
+    {
+        arg = -1.0;
+    }
 
     /* fprintf( out_control->etor,
        "%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n",
@@ -116,10 +121,22 @@ real Calculate_Omega( rvec dvec_ij, real r_ij,
        -p_jkl->dcos_dk[0]/sin_jkl, -p_jkl->dcos_dk[1]/sin_jkl,
        -p_jkl->dcos_dk[2]/sin_jkl );*/
 
-    if ( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) sin_ijk = MIN_SINE;
-    else if ( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) sin_ijk = -MIN_SINE;
-    if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) sin_jkl = MIN_SINE;
-    else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) sin_jkl = -MIN_SINE;
+    if ( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
+    {
+        sin_ijk = MIN_SINE;
+    }
+    else if ( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
+    {
+        sin_ijk = -MIN_SINE;
+    }
+    if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
+    {
+        sin_jkl = MIN_SINE;
+    }
+    else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
+    {
+        sin_jkl = -MIN_SINE;
+    }
 
     // dcos_omega_di
     rvec_ScaledSum( dcos_omega_di, (htra - arg * hnra) / r_ij, dvec_ij, -1., dvec_li );
@@ -149,19 +166,11 @@ real Calculate_Omega( rvec dvec_ij, real r_ij,
 }
 
 
-
-
-
-
-real Old_Calculate_Omega( rvec dvec_ij, real r_ij,
-                      rvec dvec_jk, real r_jk,
-                      rvec dvec_kl, real r_kl,
-                      rvec dvec_li, real r_li,
-                      three_body_interaction_data *p_ijk,
-                      three_body_interaction_data *p_jkl,
-                      rvec dcos_omega_di, rvec dcos_omega_dj,
-                      rvec dcos_omega_dk, rvec dcos_omega_dl,
-                      output_controls *out_control )
+real Old_Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
+        rvec dvec_kl, real r_kl, rvec dvec_li, real r_li,
+        three_body_interaction_data *p_ijk, three_body_interaction_data *p_jkl,
+        rvec dcos_omega_di, rvec dcos_omega_dj, rvec dcos_omega_dk, rvec dcos_omega_dl,
+        output_controls *out_control )
 {
     real unnorm_cos_omega, unnorm_sin_omega, omega;
     real sin_ijk, cos_ijk, sin_jkl, cos_jkl;
@@ -181,7 +190,7 @@ real Old_Calculate_Omega( rvec dvec_ij, real r_ij,
     rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl );
     unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl );
 
-    omega = atan2( unnorm_sin_omega, unnorm_cos_omega );
+    omega = ATAN2( unnorm_sin_omega, unnorm_cos_omega );
 
 
     /* derivatives */
@@ -201,16 +210,24 @@ real Old_Calculate_Omega( rvec dvec_ij, real r_ij,
 
 
     poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl;
-    if ( poem < 1e-20 ) poem = 1e-20;
+    if ( poem < 1e-20 )
+    {
+        poem = 1e-20;
+    }
 
     tel  = SQR( r_ij ) + SQR( r_jk ) + SQR( r_kl ) - SQR( r_li ) -
            2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl +
                    r_jk * r_kl * cos_jkl );
 
     arg  = tel / poem;
-    if ( arg >  1.0 ) arg =  1.0;
-    if ( arg < -1.0 ) arg = -1.0;
-
+    if ( arg >  1.0 )
+    {
+        arg =  1.0;
+    }
+    if ( arg < -1.0 )
+    {
+        arg = -1.0;
+    }
 
     /* fprintf( out_control->etor,
        "%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n",
@@ -232,10 +249,22 @@ real Old_Calculate_Omega( rvec dvec_ij, real r_ij,
        -p_jkl->dcos_dk[0]/sin_jkl, -p_jkl->dcos_dk[1]/sin_jkl,
        -p_jkl->dcos_dk[2]/sin_jkl );*/
 
-    if ( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) sin_ijk = MIN_SINE;
-    else if ( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) sin_ijk = -MIN_SINE;
-    if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) sin_jkl = MIN_SINE;
-    else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) sin_jkl = -MIN_SINE;
+    if ( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
+    {
+        sin_ijk = MIN_SINE;
+    }
+    else if ( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
+    {
+        sin_ijk = -MIN_SINE;
+    }
+    if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
+    {
+        sin_jkl = MIN_SINE;
+    }
+    else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
+    {
+        sin_jkl = -MIN_SINE;
+    }
 
     // dcos_omega_di
     rvec_ScaledSum( dcos_omega_di, (htra - arg * hnra) / r_ij, dvec_ij, -1., dvec_li );
@@ -264,10 +293,11 @@ real Old_Calculate_Omega( rvec dvec_ij, real r_ij,
     return omega;
 }
 
+
 // Basically a copy from PuReMD, because Old_Torsion_Angles had some issue
 void Torsion_Angles( reax_system *system, control_params *control,
-                     simulation_data *data, storage *workspace,
-                     reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
     int i, j, k, l, pi, pj, pk, pl, pij, plk, natoms;
     int type_i, type_j, type_k, type_l;
@@ -358,10 +388,10 @@ void Torsion_Angles( reax_system *system, control_params *control,
                     start_pj = Start_Index(pj, thb_intrs );
                     end_pj = End_Index(pj, thb_intrs );
 
-                    exp_tor2_jk = exp( -p_tor2 * BOA_jk );
-                    exp_cot2_jk = exp( -p_cot2 * SQR(BOA_jk - 1.5) );
-                    exp_tor3_DjDk = exp( -p_tor3 * (Delta_j + Delta_k) );
-                    exp_tor4_DjDk = exp( p_tor4  * (Delta_j + Delta_k) );
+                    exp_tor2_jk = EXP( -p_tor2 * BOA_jk );
+                    exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) );
+                    exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) );
+                    exp_tor4_DjDk = EXP( p_tor4  * (Delta_j + Delta_k) );
                     exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk);
                     f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv;
 
@@ -389,17 +419,24 @@ void Torsion_Angles( reax_system *system, control_params *control,
                             BOA_ij = bo_ij->BO - control->thb_cut;
 
                             theta_ijk = p_ijk->theta;
-                            sin_ijk = sin( theta_ijk );
-                            cos_ijk = cos( theta_ijk );
+                            sin_ijk = SIN( theta_ijk );
+                            cos_ijk = COS( theta_ijk );
                             //tan_ijk_i = 1. / tan( theta_ijk );
                             if ( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
+                            {
                                 tan_ijk_i = cos_ijk / MIN_SINE;
+                            }
                             else if ( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
+                            {
                                 tan_ijk_i = cos_ijk / -MIN_SINE;
-                            else tan_ijk_i = cos_ijk / sin_ijk;
+                            }
+                            else
+                            {
+                                tan_ijk_i = cos_ijk / sin_ijk;
+                            }
 
-                            exp_tor2_ij = exp( -p_tor2 * BOA_ij );
-                            exp_cot2_ij = exp( -p_cot2 * SQR(BOA_ij - 1.5) );
+                            exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
+                            exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij - 1.5) );
 
 
                             /* pick l up from j-k interaction where k is the central atom */
@@ -431,14 +468,21 @@ void Torsion_Angles( reax_system *system, control_params *control,
                                     BOA_kl = bo_kl->BO - control->thb_cut;
 
                                     theta_jkl = p_jkl->theta;
-                                    sin_jkl = sin( theta_jkl );
-                                    cos_jkl = cos( theta_jkl );
+                                    sin_jkl = SIN( theta_jkl );
+                                    cos_jkl = COS( theta_jkl );
                                     //tan_jkl_i = 1. / tan( theta_jkl );
                                     if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
+                                    {
                                         tan_jkl_i = cos_jkl / MIN_SINE;
+                                    }
                                     else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
+                                    {
                                         tan_jkl_i = cos_jkl / -MIN_SINE;
-                                    else tan_jkl_i = cos_jkl / sin_jkl;
+                                    }
+                                    else
+                                    {
+                                        tan_jkl_i = cos_jkl / sin_jkl;
+                                    }
 
                                     rvec_ScaledSum( dvec_li, 1., system->my_atoms[i].x,
                                                     -1., system->my_atoms[l].x );
@@ -461,10 +505,10 @@ void Torsion_Angles( reax_system *system, control_params *control,
                                     /* end omega calculations */
 
                                     /* torsion energy */
-                                    exp_tor1 = exp( fbp->p_tor1 *
+                                    exp_tor1 = EXP( fbp->p_tor1 *
                                                     SQR(2.0 - bo_jk->BO_pi - f11_DjDk) );
-                                    exp_tor2_kl = exp( -p_tor2 * BOA_kl );
-                                    exp_cot2_kl = exp( -p_cot2 * SQR(BOA_kl - 1.5) );
+                                    exp_tor2_kl = EXP( -p_tor2 * BOA_kl );
+                                    exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl - 1.5) );
                                     fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) *
                                            (1.0 - exp_tor2_kl);
 
@@ -730,17 +774,11 @@ void Torsion_Angles( reax_system *system, control_params *control,
              data->ext_press[0], data->ext_press[1], data->ext_press[2] );
 #endif
 }
-                                                                                 	
-
-
-
-
-
 
 
 void Old_Torsion_Angles( reax_system *system, control_params *control,
-                     simulation_data *data, storage *workspace,
-                     reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
     int i, j, k, l, pi, pj, pk, pl, pij, plk, natoms;
     int type_i, type_j, type_k, type_l;
@@ -857,10 +895,17 @@ void Old_Torsion_Angles( reax_system *system, control_params *control,
                             cos_ijk = COS( theta_ijk );
                             //tan_ijk_i = 1. / TAN( theta_ijk );
                             if ( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
+                            {
                                 tan_ijk_i = cos_ijk / MIN_SINE;
+                            }
                             else if ( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
+                            {
                                 tan_ijk_i = cos_ijk / -MIN_SINE;
-                            else tan_ijk_i = cos_ijk / sin_ijk;
+                            }
+                            else
+                            {
+                                tan_ijk_i = cos_ijk / sin_ijk;
+                            }
 
                             exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
                             exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij - 1.5) );
@@ -895,10 +940,17 @@ void Old_Torsion_Angles( reax_system *system, control_params *control,
                                     cos_jkl = COS( theta_jkl );
                                     //tan_jkl_i = 1. / TAN( theta_jkl );
                                     if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
+                                    {
                                         tan_jkl_i = cos_jkl / MIN_SINE;
+                                    }
                                     else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
+                                    {
                                         tan_jkl_i = cos_jkl / -MIN_SINE;
-                                    else tan_jkl_i = cos_jkl / sin_jkl;
+                                    }
+                                    else
+                                    {
+                                        tan_jkl_i = cos_jkl / sin_jkl;
+                                    }
 
                                     rvec_ScaledSum( dvec_li, 1., system->my_atoms[i].x,
                                                     -1., system->my_atoms[l].x );
diff --git a/PG-PuReMD/src/torsion_angles.h b/PG-PuReMD/src/torsion_angles.h
index d0762a4e2f693877da9e60a8bbc44ea2f5b714b0..454f06791e6ac3cd3fba970090ada1b07c515f6f 100644
--- a/PG-PuReMD/src/torsion_angles.h
+++ b/PG-PuReMD/src/torsion_angles.h
@@ -24,7 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Torsion_Angles( reax_system*, control_params*, simulation_data*,
-                     storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/traj.c b/PG-PuReMD/src/traj.c
index 311af0df2e063ea104a7fbf1c7d9a9f3f47e0beb..14560661f57d23918774095a4df4c2d3a41fae4b 100644
--- a/PG-PuReMD/src/traj.c
+++ b/PG-PuReMD/src/traj.c
@@ -20,49 +20,58 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
+
 #if defined(PURE_REAX)
-#include "traj.h"
-#include "list.h"
-#include "tool_box.h"
+  #include "traj.h"
+  #include "list.h"
+  #include "tool_box.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_traj.h"
-#include "reax_list.h"
-#include "reax_tool_box.h"
+  #include "reax_traj.h"
+  #include "reax_list.h"
+  #include "reax_tool_box.h"
+#endif
+
+#ifdef HAVE_CUDA
+  #include "cuda/cuda_copy.h"
 #endif
 
 
 int Set_My_Trajectory_View( MPI_File trj, int offset, MPI_Datatype etype,
-                            MPI_Comm comm, int my_rank, int my_n, int big_n )
+        MPI_Comm comm, int my_rank, int my_n, int big_n )
 {
     int my_disp;
     int length[3];
-    MPI_Aint line_len;
+    MPI_Aint lower_bound, extent;
     MPI_Aint disp[3];
     MPI_Datatype type[3];
     MPI_Datatype view;
 
-    /* line length inferred from etype */
-    MPI_Type_extent( etype, &line_len );
-    line_len /= sizeof(char);
+    /* get old type info */
+    MPI_Type_get_extent( etype, &lower_bound, &extent );
 
     /* determine where to start writing into the mpi file */
     my_disp = SumScan( my_n, my_rank, MASTER_NODE, comm );
     my_disp -= my_n;
+    extent /= sizeof(char);
 
-    /* create atom_info_view */
     length[0] = 1;
     length[1] = my_n;
     length[2] = 1;
     disp[0] = 0;
-    disp[1] = line_len * my_disp;
-    disp[2] = line_len * big_n;
+    disp[1] = extent * my_disp;
+    disp[2] = extent * big_n;
     type[0] = MPI_LB;
     type[1] = etype;
     type[2] = MPI_UB;
 
-    MPI_Type_struct( 3, length, disp, type, &view );
+    MPI_Type_create_struct( 3, length, disp, type, &view );
+
+    //TODO: MPI_LB/MPI_UB were removed in MPI-3.0; rebuild this view (same displacement, extent * big_n) without them, e.g. using
+//    MPI_Type_create_resized( etype, lower_bound, extent, &view );
+
     MPI_Type_commit( &view );
 
+    /* create atom_info_view */
     MPI_File_set_view( trj, offset, etype, view, "native", MPI_INFO_NULL );
 
     return my_disp;
@@ -70,13 +79,16 @@ int Set_My_Trajectory_View( MPI_File trj, int offset, MPI_Datatype etype,
 
 
 int Reallocate_Output_Buffer( output_controls *out_control, int req_space,
-                              MPI_Comm comm )
+        MPI_Comm comm )
 {
     if ( out_control->buffer_len > 0 )
-        free( out_control->buffer );
+    {
+        sfree( out_control->buffer, "Reallocate_Output_Buffer::out_control->buffer" );
+    }
 
     out_control->buffer_len = req_space * SAFE_ZONE;
-    out_control->buffer = (char*) malloc(out_control->buffer_len * sizeof(char));
+    out_control->buffer = (char*) smalloc( out_control->buffer_len * sizeof(char),
+            "Reallocate_Output_Buffer::out_control->buffer" );
     if ( out_control->buffer == NULL )
     {
         fprintf( stderr,
@@ -90,7 +102,7 @@ int Reallocate_Output_Buffer( output_controls *out_control, int req_space,
 
 
 void Write_Skip_Line( output_controls *out_control, mpi_datatypes *mpi_data,
-                      int my_rank, int skip, int num_section )
+        int my_rank, int skip, int num_section )
 {
     MPI_Status status;
 
@@ -122,33 +134,41 @@ int Write_Header( reax_system *system, control_params *control,
 {
     int  num_hdr_lines, my_hdr_lines, buffer_req;
     MPI_Status status;
-    char ensembles[ens_N][25] =  { "NVE", "NVT", "fully flexible NPT",
-                                   "semi isotropic NPT", "isotropic NPT"
-                                 };
-    char reposition[3][25] = { "fit to periodic box", "CoM to center of box",
-                               "CoM to origin"
-                             };
-    char t_regime[3][25] = { "T-coupling only", "step-wise", "constant slope" };
-
-    char traj_methods[TF_N][10] = { "custom", "xyz" };
-    char atom_formats[8][40] =  { "none", "invalid", "invalid", "invalid",
-                                  "xyz_q",
-                                  "xyz_q_fxfyfz",
-                                  "xyz_q_vxvyvz",
-                                  "detailed_atom_info"
-                                };
-    char bond_formats[3][30] = { "none",
-                                 "basic_bond_info",
-                                 "detailed_bond_info"
-                               };
-    char angle_formats[2][30] = { "none", "basic_angle_info" };
+    char ensembles[ens_N][25] = {
+        "NVE", "NVT", "fully flexible NPT",
+        "semi isotropic NPT", "isotropic NPT",
+    };
+    char reposition[3][25] = {
+        "fit to periodic box", "CoM to center of box",
+        "CoM to origin",
+    };
+    char t_regime[3][25] = {
+        "T-coupling only", "step-wise", "constant slope",
+    };
+
+    char traj_methods[TF_N][10] = {
+        "custom", "xyz",
+    };
+    char atom_formats[8][40] =  {
+        "none", "invalid", "invalid", "invalid",
+        "xyz_q", "xyz_q_fxfyfz", "xyz_q_vxvyvz",
+        "detailed_atom_info",
+    };
+    char bond_formats[3][30] = {
+        "none", "basic_bond_info", "detailed_bond_info",
+    };
+    char angle_formats[2][30] = {
+        "none", "basic_angle_info",
+    };
 
     /* set header lengths */
     num_hdr_lines = NUM_HEADER_LINES;
     my_hdr_lines = num_hdr_lines * ( system->my_rank == MASTER_NODE );
     buffer_req = my_hdr_lines * HEADER_LINE_LEN;
     if ( buffer_req > out_control->buffer_len * DANGER_ZONE )
+    {
         Reallocate_Output_Buffer( out_control, buffer_req, mpi_data->world );
+    }
 
     /* only the master node writes into trajectory header */
     if ( system->my_rank == MASTER_NODE )
@@ -253,7 +273,7 @@ int Write_Header( reax_system *system, control_params *control,
                  control->thb_cut );
         strncat( out_control->buffer, out_control->line, HEADER_LINE_LEN + 1 );
 
-        sprintf( out_control->line, SCI_LINE, "QEq_tolerance:", control->q_err );
+        sprintf( out_control->line, SCI_LINE, "QEq_tolerance:", control->cm_solver_q_err );
         strncat( out_control->buffer, out_control->line, HEADER_LINE_LEN + 1 );
 
         /* temperature controls */
@@ -330,11 +350,11 @@ int Write_Header( reax_system *system, control_params *control,
     {
         out_control->trj_offset = 0;
         Set_My_Trajectory_View( out_control->trj,
-                                out_control->trj_offset, mpi_data->header_line,
-                                mpi_data->world, system->my_rank,
-                                my_hdr_lines, num_hdr_lines );
+                out_control->trj_offset, mpi_data->header_line,
+                mpi_data->world, system->my_rank,
+                my_hdr_lines, num_hdr_lines );
         MPI_File_write_all( out_control->trj, out_control->buffer,
-                            num_hdr_lines, mpi_data->header_line, &status );
+                num_hdr_lines, mpi_data->header_line, &status );
         out_control->trj_offset = (num_hdr_lines) * HEADER_LINE_LEN;
     }
     else
@@ -348,7 +368,7 @@ int Write_Header( reax_system *system, control_params *control,
 
 
 int Write_Init_Desc( reax_system *system, control_params *control,
-                     output_controls *out_control, mpi_datatypes *mpi_data )
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
     int i, me, np, cnt, buffer_len, buffer_req;
     reax_atom *p_atom;
@@ -363,11 +383,18 @@ int Write_Init_Desc( reax_system *system, control_params *control,
                      system->bigN * INIT_DESC_LEN, system->bigN );
 
     if ( out_control->traj_method == REG_TRAJ && me == MASTER_NODE )
+    {
         buffer_req = system->bigN * INIT_DESC_LEN + 1;
-    else buffer_req = system->n * INIT_DESC_LEN + 1;
+    }
+    else
+    {
+        buffer_req = system->n * INIT_DESC_LEN + 1;
+    }
 
     if ( buffer_req > out_control->buffer_len * DANGER_ZONE )
+    {
         Reallocate_Output_Buffer( out_control, buffer_req, mpi_data->world );
+    }
 
     out_control->line[0] = 0;
     out_control->buffer[0] = 0;
@@ -384,10 +411,10 @@ int Write_Init_Desc( reax_system *system, control_params *control,
     if ( out_control->traj_method == MPI_TRAJ )
     {
         Set_My_Trajectory_View( out_control->trj, out_control->trj_offset,
-                                mpi_data->init_desc_line, mpi_data->world,
-                                me, system->n, system->bigN );
+                mpi_data->init_desc_line, mpi_data->world,
+                me, system->n, system->bigN );
         MPI_File_write( out_control->trj, out_control->buffer, system->n,
-                        mpi_data->init_desc_line, &status );
+                mpi_data->init_desc_line, &status );
         out_control->trj_offset += system->bigN * INIT_DESC_LEN;
     }
     else
@@ -416,16 +443,15 @@ int Write_Init_Desc( reax_system *system, control_params *control,
 
 
 int Init_Traj( reax_system *system, control_params *control,
-               output_controls *out_control, mpi_datatypes *mpi_data,
-               char *msg )
+        output_controls *out_control, mpi_datatypes *mpi_data, char *msg )
 {
     char fname[MAX_STR];
-    int  atom_line_len[ NR_OPT_ATOM ] = { 0, 0, 0, 0,
-                                          ATOM_BASIC_LEN, ATOM_wV_LEN,
-                                          ATOM_wF_LEN, ATOM_FULL_LEN
-                                        };
-    int  bond_line_len[ NR_OPT_BOND ] = { 0, BOND_BASIC_LEN, BOND_FULL_LEN };
-    int  angle_line_len[ NR_OPT_ANGLE ] = { 0, ANGLE_BASIC_LEN };
+    int atom_line_len[ NR_OPT_ATOM ] = { 0, 0, 0, 0, /* slots follow enum ATOM_LINE_OPTS */
+        ATOM_BASIC_LEN, ATOM_wF_LEN, /* OPT_ATOM_BASIC = 4, OPT_ATOM_wF = 5 */
+        ATOM_wV_LEN, ATOM_FULL_LEN /* OPT_ATOM_wV = 6, OPT_ATOM_FULL = 7 */
+    };
+    int bond_line_len[ NR_OPT_BOND ] = { 0, BOND_BASIC_LEN, BOND_FULL_LEN };
+    int angle_line_len[ NR_OPT_ANGLE ] = { 0, ANGLE_BASIC_LEN };
 
     /* generate trajectory name */
     sprintf( fname, "%s.trj", control->sim_name );
@@ -441,7 +467,8 @@ int Init_Traj( reax_system *system, control_params *control,
     out_control->write_angles = ( out_control->angle_line_len ? 1 : 0 );
 
     /* allocate line & buffer space */
-    out_control->line = (char*) calloc( MAX_TRJ_LINE_LEN + 1, sizeof(char) );
+    out_control->line = (char*) scalloc( MAX_TRJ_LINE_LEN + 1, sizeof(char),
+           "Init_Traj::out_control->line" );
     out_control->buffer_len = 0;
     out_control->buffer = NULL;
 
@@ -504,11 +531,15 @@ int Init_Traj( reax_system *system, control_params *control,
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: initiated trajectory\n", system->my_rank );
 #endif
+
     Write_Header( system, control, out_control, mpi_data );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: header written\n", system->my_rank );
 #endif
+
     Write_Init_Desc( system, control, out_control, mpi_data );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: atom descriptions written\n", system->my_rank );
 #endif
@@ -518,8 +549,8 @@ int Init_Traj( reax_system *system, control_params *control,
 
 
 int Write_Frame_Header( reax_system *system, control_params *control,
-                        simulation_data *data, output_controls *out_control,
-                        mpi_datatypes *mpi_data )
+        simulation_data *data, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
     int me, num_frm_hdr_lines, my_frm_hdr_lines, buffer_req;
     MPI_Status status;
@@ -530,7 +561,9 @@ int Write_Frame_Header( reax_system *system, control_params *control,
     my_frm_hdr_lines = num_frm_hdr_lines * ( me == MASTER_NODE );
     buffer_req = my_frm_hdr_lines * HEADER_LINE_LEN;
     if ( buffer_req > out_control->buffer_len * DANGER_ZONE )
+    {
         Reallocate_Output_Buffer( out_control, buffer_req, mpi_data->world );
+    }
 
     /* only the master node writes into trajectory header */
     if ( me == MASTER_NODE )
@@ -640,11 +673,11 @@ int Write_Frame_Header( reax_system *system, control_params *control,
     if ( out_control->traj_method == MPI_TRAJ )
     {
         Set_My_Trajectory_View( out_control->trj, out_control->trj_offset,
-                                mpi_data->header_line, mpi_data->world,
-                                me, my_frm_hdr_lines, num_frm_hdr_lines );
+                mpi_data->header_line, mpi_data->world,
+                me, my_frm_hdr_lines, num_frm_hdr_lines );
 
         MPI_File_write_all(out_control->trj, out_control->buffer, my_frm_hdr_lines,
-                           mpi_data->header_line, &status);
+                mpi_data->header_line, &status);
         out_control->trj_offset += (num_frm_hdr_lines) * HEADER_LINE_LEN;
     }
     else
@@ -657,27 +690,33 @@ int Write_Frame_Header( reax_system *system, control_params *control,
 }
 
 
-
 int Write_Atoms( reax_system *system, control_params *control,
-                 output_controls *out_control, mpi_datatypes *mpi_data )
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
     int i, me, np, line_len, buffer_len, buffer_req, cnt;
-    MPI_Status  status;
-    reax_atom  *p_atom;
+    MPI_Status status;
+    reax_atom *p_atom;
 
     me = system->my_rank;
     np = control->nprocs;
     line_len = out_control->atom_line_len;
 
     Write_Skip_Line( out_control, mpi_data, me,
-                     system->bigN * line_len, system->bigN );
+            system->bigN * line_len, system->bigN );
 
     if ( out_control->traj_method == REG_TRAJ && me == MASTER_NODE )
+    {
         buffer_req = system->bigN * line_len + 1;
-    else buffer_req = system->n * line_len + 1;
+    }
+    else
+    {
+        buffer_req = system->n * line_len + 1;
+    }
 
     if ( buffer_req > out_control->buffer_len * DANGER_ZONE )
+    {
         Reallocate_Output_Buffer( out_control, buffer_req, mpi_data->world );
+    }
 
     /* fill in buffer */
     out_control->line[0] = 0;
@@ -721,10 +760,10 @@ int Write_Atoms( reax_system *system, control_params *control,
     if ( out_control->traj_method == MPI_TRAJ )
     {
         Set_My_Trajectory_View( out_control->trj, out_control->trj_offset,
-                                mpi_data->atom_line, mpi_data->world,
-                                me, system->n, system->bigN );
+                mpi_data->atom_line, mpi_data->world,
+                me, system->n, system->bigN );
         MPI_File_write( out_control->trj, out_control->buffer, system->n,
-                        mpi_data->atom_line, &status );
+                mpi_data->atom_line, &status );
         out_control->trj_offset += (system->bigN) * out_control->atom_line_len;
     }
     else
@@ -752,8 +791,8 @@ int Write_Atoms( reax_system *system, control_params *control,
 }
 
 
-int Write_Bonds(reax_system *system, control_params *control, reax_list *bonds,
-                output_controls *out_control, mpi_datatypes *mpi_data)
+int Write_Bonds( reax_system *system, control_params *control, reax_list *bonds,
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
     int i, j, pj, me, np;
     int my_bonds, num_bonds;
@@ -781,11 +820,18 @@ int Write_Bonds(reax_system *system, control_params *control, reax_list *bonds,
     Write_Skip_Line( out_control, mpi_data, me, num_bonds * line_len, num_bonds );
 
     if ( out_control->traj_method == REG_TRAJ && me == MASTER_NODE )
+    {
         buffer_req = num_bonds * line_len + 1;
-    else buffer_req = my_bonds * line_len + 1;
+    }
+    else
+    {
+        buffer_req = my_bonds * line_len + 1;
+    }
 
     if ( buffer_req > out_control->buffer_len * DANGER_ZONE )
+    {
         Reallocate_Output_Buffer( out_control, buffer_req, mpi_data->world );
+    }
 
     /* fill in the buffer */
     my_bonds = 0;
@@ -827,10 +873,10 @@ int Write_Bonds(reax_system *system, control_params *control, reax_list *bonds,
     if ( out_control->traj_method == MPI_TRAJ )
     {
         Set_My_Trajectory_View( out_control->trj, out_control->trj_offset,
-                                mpi_data->bond_line, mpi_data->world,
-                                me, my_bonds, num_bonds );
+                mpi_data->bond_line, mpi_data->world,
+                me, my_bonds, num_bonds );
         MPI_File_write( out_control->trj, out_control->buffer, my_bonds,
-                        mpi_data->bond_line, &status );
+                mpi_data->bond_line, &status );
         out_control->trj_offset += num_bonds * line_len;
     }
     else
@@ -859,8 +905,8 @@ int Write_Bonds(reax_system *system, control_params *control, reax_list *bonds,
 
 
 int Write_Angles( reax_system *system, control_params *control,
-                  reax_list *bonds, reax_list *thb_intrs,
-                  output_controls *out_control, mpi_datatypes *mpi_data )
+        reax_list *bonds, reax_list *thb_intrs,
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
     int i, j, k, pi, pk, me, np;
     int my_angles, num_angles;
@@ -900,11 +946,18 @@ int Write_Angles( reax_system *system, control_params *control,
     Write_Skip_Line( out_control, mpi_data, me, num_angles * line_len, num_angles );
 
     if ( out_control->traj_method == REG_TRAJ && me == MASTER_NODE )
+    {
         buffer_req = num_angles * line_len + 1;
-    else buffer_req = my_angles * line_len + 1;
+    }
+    else
+    {
+        buffer_req = my_angles * line_len + 1;
+    }
 
     if ( buffer_req > out_control->buffer_len * DANGER_ZONE )
+    {
         Reallocate_Output_Buffer( out_control, buffer_req, mpi_data->world );
+    }
 
     /* fill in the buffer */
     my_angles = 0;
@@ -941,10 +994,10 @@ int Write_Angles( reax_system *system, control_params *control,
     if ( out_control->traj_method == MPI_TRAJ )
     {
         Set_My_Trajectory_View( out_control->trj, out_control->trj_offset,
-                                mpi_data->angle_line, mpi_data->world,
-                                me, my_angles, num_angles );
+                mpi_data->angle_line, mpi_data->world,
+                me, my_angles, num_angles );
         MPI_File_write( out_control->trj, out_control->buffer, my_angles,
-                        mpi_data->angle_line, &status );
+                mpi_data->angle_line, &status );
         out_control->trj_offset += num_angles * line_len;
     }
     else
@@ -973,8 +1026,8 @@ int Write_Angles( reax_system *system, control_params *control,
 
 
 int Append_Frame( reax_system *system, control_params *control,
-                  simulation_data *data, reax_list **lists,
-                  output_controls *out_control, mpi_datatypes *mpi_data )
+        simulation_data *data, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: appending frame %d\n", system->my_rank, data->step );
@@ -983,31 +1036,29 @@ int Append_Frame( reax_system *system, control_params *control,
 
     if ( out_control->write_atoms )
     {
-        //Sync atoms here
 #ifdef HAVE_CUDA
-        Output_Sync_Atoms ( system );
+        Output_Sync_Atoms( system );
 #endif
         Write_Atoms( system, control, out_control, mpi_data );
     }
 
     if ( out_control->write_bonds )
     {
-        //sync bonds here
 #ifdef HAVE_CUDA
-        Output_Sync_Lists ((*lists + BONDS), (*dev_lists + BONDS), TYP_BOND);
+        Output_Sync_Lists( (*lists + BONDS), (*dev_lists + BONDS), TYP_BOND );
 #endif
         Write_Bonds( system, control, (*lists + BONDS), out_control, mpi_data );
     }
 
     if ( out_control->write_angles )
     {
-        //sync three body interactions here
 #ifdef HAVE_CUDA
-        Output_Sync_Lists ((*lists + THREE_BODIES), (*dev_lists + THREE_BODIES), TYP_THREE_BODY);
+        Output_Sync_Lists( (*lists + THREE_BODIES), (*dev_lists + THREE_BODIES), TYP_THREE_BODY );
 #endif
         Write_Angles( system, control, (*lists + BONDS), (*lists + THREE_BODIES),
                       out_control, mpi_data );
     }
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: appended frame %d\n", system->my_rank, data->step );
 #endif
@@ -1019,12 +1070,16 @@ int Append_Frame( reax_system *system, control_params *control,
 int End_Traj( int my_rank, output_controls *out_control )
 {
     if ( out_control->traj_method == MPI_TRAJ )
+    {
         MPI_File_close( &(out_control->trj) );
+    }
     else if ( my_rank == MASTER_NODE )
+    {
         fclose( out_control->strj );
+    }
 
-    free( out_control->buffer );
-    free( out_control->line );
+    sfree( out_control->buffer, "End_Traj::out_control->buffer" );
+    sfree( out_control->line, "End_Traj::out_control->line" );
 
     return SUCCESS;
 }
diff --git a/PG-PuReMD/src/traj.h b/PG-PuReMD/src/traj.h
index 7ce82e0fff6572f4d343f9fd9832183fc13672c2..13435ecbfd071f6c5ff55f18f368a736967fffc9 100644
--- a/PG-PuReMD/src/traj.h
+++ b/PG-PuReMD/src/traj.h
@@ -62,16 +62,36 @@
 #define ANGLE_BASIC "%9d%9d%9d%10.3f\n" // Atom1 Atom2 Atom3 Theta
 #define ANGLE_BASIC_LEN 38
 
-enum ATOM_LINE_OPTS  { OPT_NOATOM = 0, OPT_ATOM_BASIC = 4, OPT_ATOM_wF = 5, OPT_ATOM_wV = 6, OPT_ATOM_FULL = 7, NR_OPT_ATOM = 8 };
-enum BOND_LINE_OPTS  { OPT_NOBOND, OPT_BOND_BASIC, OPT_BOND_FULL, NR_OPT_BOND };
-enum ANGLE_LINE_OPTS { OPT_NOANGLE, OPT_ANGLE_BASIC, NR_OPT_ANGLE };
 
+enum ATOM_LINE_OPTS /* explicit values; NR_OPT_ATOM is one past the largest option */
+{
+    OPT_NOATOM = 0, OPT_ATOM_BASIC = 4, OPT_ATOM_wF = 5, OPT_ATOM_wV = 6,
+    OPT_ATOM_FULL = 7, NR_OPT_ATOM = 8
+}; /* no trailing comma: also valid for C89 / C++03 consumers of this header */
+enum BOND_LINE_OPTS /* explicit values 0..2; NR_OPT_BOND equals the number of options */
+{
+    OPT_NOBOND = 0, OPT_BOND_BASIC = 1, OPT_BOND_FULL = 2, NR_OPT_BOND = 3
+}; /* no trailing comma: also valid for C89 / C++03 consumers of this header */
+enum ANGLE_LINE_OPTS /* explicit values 0..1; NR_OPT_ANGLE equals the number of options */
+{
+    OPT_NOANGLE = 0, OPT_ANGLE_BASIC = 1, NR_OPT_ANGLE = 2
+}; /* no trailing comma: also valid for C89 / C++03 consumers of this header */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int Init_Traj( reax_system*, control_params*, output_controls*, mpi_datatypes*, char* );
+
+int End_Traj( int, output_controls* );
 
-int  Init_Traj( reax_system*, control_params*, output_controls*,
-                mpi_datatypes*, char* );
-int  End_Traj( int, output_controls* );
+int Append_Frame( reax_system*, control_params*, simulation_data*, reax_list**,
+        output_controls*, mpi_datatypes* );
+
+#ifdef __cplusplus
+}
+#endif
 
-int  Append_Frame( reax_system*, control_params*, simulation_data*,
-                   reax_list**, output_controls*, mpi_datatypes* );
 
 #endif
diff --git a/PG-PuReMD/src/valence_angles.c b/PG-PuReMD/src/valence_angles.c
index 905fbbf82733dab2d9d06b5cc1fab2dbcbde584f..56a149f42e9b4caf16bea02acaf333217a600e48 100644
--- a/PG-PuReMD/src/valence_angles.c
+++ b/PG-PuReMD/src/valence_angles.c
@@ -20,42 +20,47 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
+
 #if defined(PURE_REAX)
-#include "valence_angles.h"
-#include "bond_orders.h"
-#include "list.h"
-#include "vector.h"
+  #include "valence_angles.h"
+  #include "bond_orders.h"
+  #include "list.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_valence_angles.h"
-#include "reax_bond_orders.h"
-#include "reax_list.h"
-#include "reax_vector.h"
+  #include "reax_valence_angles.h"
+  #include "reax_bond_orders.h"
+  #include "reax_list.h"
+  #include "reax_vector.h"
 #endif
 
 
 /* calculates the theta angle between i-j-k */
 void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
-                      real *theta, real *cos_theta )
+        real *theta, real *cos_theta )
 {
     (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk );
-    if ( *cos_theta > 1. ) *cos_theta  = 1.0;
-    if ( *cos_theta < -1. ) *cos_theta  = -1.0;
+    if ( *cos_theta > 1.0 )
+    {
+        *cos_theta  = 1.0;
+    }
+    if ( *cos_theta < -1.0 )
+    {
+        *cos_theta  = -1.0;
+    }
 
-    (*theta) = acos( *cos_theta );
+    (*theta) = ACOS( *cos_theta );
 }
 
 
 /* calculates the derivative of the cosine of the angle between i-j-k */
 void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
-                           rvec* dcos_theta_di,
-                           rvec* dcos_theta_dj,
-                           rvec* dcos_theta_dk )
+        rvec* dcos_theta_di, rvec* dcos_theta_dj, rvec* dcos_theta_dk )
 {
     int t;
     real sqr_d_ji = SQR(d_ji);
     real sqr_d_jk = SQR(d_jk);
     real inv_dists = 1.0 / (d_ji * d_jk);
-    real inv_dists3 = pow( inv_dists, 3 );
+    real inv_dists3 = POW( inv_dists, 3.0 );
     real dot_dvecs = Dot( dvec_ji, dvec_jk, 3 );
     real Cdot_inv3 = dot_dvecs * inv_dists3;
 
@@ -137,7 +142,7 @@ void Valence_Angles( reax_system *system, control_params *control,
             temp = SQR( bo_jt->BO );
             temp *= temp;
             temp *= temp;
-            prod_SBO *= exp( -temp );
+            prod_SBO *= EXP( -temp );
         }
 
         /* modifications to match Adri's code - 09/01/09 */
@@ -170,7 +175,7 @@ void Valence_Angles( reax_system *system, control_params *control,
         else
             SBO2 = 2, CSBO2 = 0;
 
-        expval6 = exp( p_val6 * workspace->Delta_boc[j] );
+        expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
 
         for ( pi = start_j; pi < end_j; ++pi )
         {
@@ -228,20 +233,21 @@ void Valence_Angles( reax_system *system, control_params *control,
                     p_ijk    = &( thb_intrs->select.three_body_list[num_thb_intrs] );
 
                     Calculate_Theta( pbond_ij->dvec, pbond_ij->d,
-                                     pbond_jk->dvec, pbond_jk->d,
-                                     &theta, &cos_theta );
+                            pbond_jk->dvec, pbond_jk->d, &theta, &cos_theta );
 
                     Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d,
-                                          pbond_jk->dvec, pbond_jk->d,
-                                          &(p_ijk->dcos_di), &(p_ijk->dcos_dj),
-                                          &(p_ijk->dcos_dk) );
+                            pbond_jk->dvec, pbond_jk->d,
+                            &(p_ijk->dcos_di), &(p_ijk->dcos_dj),
+                            &(p_ijk->dcos_dk) );
                     p_ijk->thb = k;
                     p_ijk->pthb = pk;
                     p_ijk->theta = theta;
 
-                    sin_theta = sin( theta );
+                    sin_theta = SIN( theta );
                     if ( sin_theta < 1.0e-5 )
+                    {
                         sin_theta = 1.0e-5;
+                    }
 
                     ++num_thb_intrs;
 
@@ -280,7 +286,7 @@ void Valence_Angles( reax_system *system, control_params *control,
                             // fprintf( out_control->eval, "%6d%6d%6d -- exists in thbp\n",
                             //          i+1, j+1, k+1 );
 
-                            if ( fabs(thbh->prm[cnt].p_val1) > 0.001 )
+                            if ( FABS(thbh->prm[cnt].p_val1) > 0.001 )
                             {
                                 thbp = &( thbh->prm[cnt] );
 
@@ -291,15 +297,15 @@ void Valence_Angles( reax_system *system, control_params *control,
                                 p_val7 = thbp->p_val7;
                                 theta_00 = thbp->theta_00;
 
-                                exp3ij = exp( -p_val3 * pow( BOA_ij, p_val4 ) );
+                                exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
                                 f7_ij = 1.0 - exp3ij;
                                 Cf7ij = p_val3 * p_val4 * pow( BOA_ij, p_val4 - 1.0 ) * exp3ij;
 
-                                exp3jk = exp( -p_val3 * pow( BOA_jk, p_val4 ) );
+                                exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
                                 f7_jk = 1.0 - exp3jk;
                                 Cf7jk = p_val3 * p_val4 * pow( BOA_jk, p_val4 - 1.0 ) * exp3jk;
 
-                                expval7 = exp( -p_val7 * workspace->Delta_boc[j] );
+                                expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
                                 trm8 = 1.0 + expval6 + expval7;
                                 f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
                                 Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
@@ -307,10 +313,10 @@ void Valence_Angles( reax_system *system, control_params *control,
                                          (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ) );
 
                                 theta_0 = 180.0 - theta_00 * (1.0 -
-                                                              exp(-p_val10 * (2.0 - SBO2)));
+                                                              EXP(-p_val10 * (2.0 - SBO2)));
                                 theta_0 = DEG2RAD( theta_0 );
 
-                                expval2theta  = exp( -p_val2 * SQR(theta_0 - theta) );
+                                expval2theta  = EXP( -p_val2 * SQR(theta_0 - theta) );
                                 if ( p_val1 >= 0 )
                                     expval12theta = p_val1 * (1.0 - expval2theta);
                                 else // To avoid linear Me-H-Me angles (6/6/06)
@@ -323,7 +329,7 @@ void Valence_Angles( reax_system *system, control_params *control,
                                          expval2theta * (theta_0 - theta);
 
                                 Ctheta_0 = p_val10 * DEG2RAD(theta_00) *
-                                           exp( -p_val10 * (2.0 - SBO2) );
+                                           EXP( -p_val10 * (2.0 - SBO2) );
 
                                 CEval5 = -CEval4 * Ctheta_0 * CSBO2;
                                 CEval6 = CEval5 * dSBO1;
@@ -341,10 +347,10 @@ void Valence_Angles( reax_system *system, control_params *control,
                                 p_pen3 = system->reax_param.gp.l[20];
                                 p_pen4 = system->reax_param.gp.l[21];
 
-                                exp_pen2ij = exp( -p_pen2 * SQR( BOA_ij - 2.0 ) );
-                                exp_pen2jk = exp( -p_pen2 * SQR( BOA_jk - 2.0 ) );
-                                exp_pen3 = exp( -p_pen3 * workspace->Delta[j] );
-                                exp_pen4 = exp(  p_pen4 * workspace->Delta[j] );
+                                exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
+                                exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
+                                exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
+                                exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
                                 trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
                                 f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
                                 Cf9j = ( -p_pen3 * exp_pen3 * trm_pen34 -
@@ -368,13 +374,13 @@ void Valence_Angles( reax_system *system, control_params *control,
                                 p_coa3 = system->reax_param.gp.l[38];
                                 p_coa4 = system->reax_param.gp.l[30];
 
-                                exp_coa2 = exp( p_coa2 * workspace->Delta_boc[j] );
+                                exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
                                 data->my_en.e_coa += e_coa =
                                                          p_coa1 / (1. + exp_coa2) *
-                                                         exp( -p_coa3 * SQR(workspace->total_bond_order[i] - BOA_ij) ) *
-                                                         exp( -p_coa3 * SQR(workspace->total_bond_order[k] - BOA_jk) ) *
-                                                         exp( -p_coa4 * SQR(BOA_ij - 1.5) ) *
-                                                         exp( -p_coa4 * SQR(BOA_jk - 1.5) );
+                                                         EXP( -p_coa3 * SQR(workspace->total_bond_order[i] - BOA_ij) ) *
+                                                         EXP( -p_coa3 * SQR(workspace->total_bond_order[k] - BOA_jk) ) *
+                                                         EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) *
+                                                         EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
 
                                 CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
                                 CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
diff --git a/PG-PuReMD/src/valence_angles.h b/PG-PuReMD/src/valence_angles.h
index 3d788e5e2287e344c93c32c2cc5e1b56fd32cafa..1958b0cbb5784d29a2e0af9024f099bcb57d3bda 100644
--- a/PG-PuReMD/src/valence_angles.h
+++ b/PG-PuReMD/src/valence_angles.h
@@ -24,11 +24,21 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Valence_Angles( reax_system*, control_params*, simulation_data*,
-                     storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
 
 void Calculate_Theta( rvec, real, rvec, real, real*, real* );
 
 void Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/validation.h b/PG-PuReMD/src/validation.h
deleted file mode 100644
index 67c32b94e93ec51bbae5e13b9d6f61967270fe71..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/validation.h
+++ /dev/null
@@ -1,47 +0,0 @@
-
-
-#ifndef __VALIDATION_H__
-#define __VALIDATION_H__
-
-#include "reax_types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int validate_neighbors (reax_system *, reax_list **lists);
-int validate_sym_dbond_indices (reax_system *system, storage *workspace, reax_list **lists);
-
-int validate_bonds (reax_system *, storage *, reax_list **);
-int validate_hbonds (reax_system *, storage *, reax_list **);
-int validate_sparse_matrix (reax_system *, storage *);
-
-int validate_grid (reax_system *);
-int validate_workspace (reax_system *, storage *);
-
-int validate_data (reax_system *, simulation_data *);
-int validate_three_bodies (reax_system *, storage *, reax_list **);
-int validate_atoms (reax_system *, reax_list **);
-
-int print_sparse_matrix (sparse_matrix *H);
-int print_sparse_matrix_host (sparse_matrix *H);
-
-int print_host_rvec2 (rvec2 *, int);
-int print_device_rvec2 (rvec2 *, int);
-
-int print_host_array (real *, int);
-int print_device_array (real *, int);
-
-void compare_rvec2( rvec2 *host, rvec2 *device, int N, char *msg);
-void compare_array (real *host, real *device, int N, char *msg);
-
-int     check_zeros_host (rvec2 *host, int n, char *);
-int     check_zeros_device (rvec2 *device, int n, char *);
-
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/vector.c b/PG-PuReMD/src/vector.c
deleted file mode 100644
index 8e01dffcfb689a41eb2d546e8e41abb8c3e9407e..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/vector.c
+++ /dev/null
@@ -1,525 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "vector.h"
-#include "random.h"
-
-#if defined(SUDHIR)
-
-#ifdef __cplusplus
-extern "C"  {
-#endif
-
-
-inline int Vector_isZero( real* v, int k )
-{
-    for ( --k; k >= 0; --k )
-        if ( fabs( v[k] ) > ALMOST_ZERO )
-            return 0;
-
-    return 1;
-}
-
-
-inline void Vector_MakeZero( real *v, int k )
-{
-    for ( --k; k >= 0; --k )
-        v[k] = 0;
-}
-
-
-inline void Vector_Copy( real* dest, real* v, int k )
-{
-    for ( --k; k >= 0; --k )
-        dest[k] = v[k];
-}
-
-
-inline void Vector_Scale( real* dest, real c, real* v, int k )
-{
-    for ( --k; k >= 0; --k )
-        dest[k] = c * v[k];
-}
-
-
-inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
-{
-    for ( --k; k >= 0; --k )
-        dest[k] = c * v[k] + d * y[k];
-}
-
-
-inline void Vector_Add( real* dest, real c, real* v, int k )
-{
-    for ( --k; k >= 0; --k )
-        dest[k] += c * v[k];
-}
-
-
-inline real Dot( real* v1, real* v2, int k )
-{
-    real ret = 0;
-
-    for ( --k; k >= 0; --k )
-        ret +=  v1[k] * v2[k];
-
-    return ret;
-}
-
-
-inline real Norm( real* v1, int k )
-{
-    real ret = 0;
-
-    for ( --k; k >= 0; --k )
-        ret +=  SQR( v1[k] );
-
-    return SQRT( ret );
-}
-
-
-inline void Vector_Print( FILE *fout, char *vname, real *v, int k )
-{
-    int i;
-
-    fprintf( fout, "%s:", vname );
-    for ( i = 0; i < k; ++i )
-        fprintf( fout, "%24.15e\n", v[i] );
-    fprintf( fout, "\n" );
-}
-
-
-void rvec_Copy( rvec dest, rvec src )
-{
-    dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
-}
-
-inline void rvec_Scale( rvec ret, real c, rvec v )
-{
-    ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2];
-}
-
-
-inline void rvec_Add( rvec ret, rvec v )
-{
-    ret[0] += v[0], ret[1] += v[1], ret[2] += v[2];
-}
-
-
-inline void rvec_ScaledAdd( rvec ret, real c, rvec v )
-{
-    ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2];
-}
-
-
-inline void rvec_Sum( rvec ret, rvec v1 , rvec v2 )
-{
-    ret[0] = v1[0] + v2[0];
-    ret[1] = v1[1] + v2[1];
-    ret[2] = v1[2] + v2[2];
-}
-
-
-inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rvec v2 )
-{
-    ret[0] = c1 * v1[0] + c2 * v2[0];
-    ret[1] = c1 * v1[1] + c2 * v2[1];
-    ret[2] = c1 * v1[2] + c2 * v2[2];
-}
-
-
-inline real rvec_Dot( rvec v1, rvec v2 )
-{
-    return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
-}
-
-
-inline real rvec_ScaledDot( real c1, rvec v1, real c2, rvec v2 )
-{
-    return (c1 * c2) * (v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]);
-}
-
-
-inline void rvec_Multiply( rvec r, rvec v1, rvec v2 )
-{
-    r[0] = v1[0] * v2[0];
-    r[1] = v1[1] * v2[1];
-    r[2] = v1[2] * v2[2];
-}
-
-
-inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
-{
-    r[0] = v1[0] * v2[0];
-    r[1] = v1[1] * v2[1];
-    r[2] = v1[2] * v2[2];
-}
-
-
-inline void rvec_Divide( rvec r, rvec v1, rvec v2 )
-{
-    r[0] = v1[0] / v2[0];
-    r[1] = v1[1] / v2[1];
-    r[2] = v1[2] / v2[2];
-}
-
-
-inline void rvec_iDivide( rvec r, rvec v1, ivec v2 )
-{
-    r[0] = v1[0] / v2[0];
-    r[1] = v1[1] / v2[1];
-    r[2] = v1[2] / v2[2];
-}
-
-
-inline void rvec_Invert( rvec r, rvec v )
-{
-    r[0] = 1. / v[0];
-    r[1] = 1. / v[1];
-    r[2] = 1. / v[2];
-}
-
-
-inline void rvec_Cross( rvec ret, rvec v1, rvec v2 )
-{
-    ret[0] = v1[1] * v2[2] - v1[2] * v2[1];
-    ret[1] = v1[2] * v2[0] - v1[0] * v2[2];
-    ret[2] = v1[0] * v2[1] - v1[1] * v2[0];
-}
-
-
-inline void rvec_OuterProduct( rtensor r, rvec v1, rvec v2 )
-{
-    int i, j;
-
-    for ( i = 0; i < 3; ++i )
-        for ( j = 0; j < 3; ++j )
-            r[i][j] = v1[i] * v2[j];
-}
-
-
-inline real rvec_Norm_Sqr( rvec v )
-{
-    return SQR(v[0]) + SQR(v[1]) + SQR(v[2]);
-}
-
-
-inline real rvec_Norm( rvec v )
-{
-    return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) );
-}
-
-
-inline int rvec_isZero( rvec v )
-{
-    if ( fabs(v[0]) > ALMOST_ZERO ||
-            fabs(v[1]) > ALMOST_ZERO ||
-            fabs(v[2]) > ALMOST_ZERO )
-        return 0;
-    return 1;
-}
-
-inline void rvec_MakeZero( rvec v )
-{
-//  v[0] = v[1] = v[2] = 0.0000000000000;
-    v[0] = v[1] = v[2] = 0.000000000000000e+00;
-}
-
-
-#if defined(PURE_REAX)
-inline void rvec_Random( rvec v )
-{
-    v[0] = Random(2.0) - 1.0;
-    v[1] = Random(2.0) - 1.0;
-    v[2] = Random(2.0) - 1.0;
-}
-#endif
-
-
-inline void rtensor_Multiply( rtensor ret, rtensor m1, rtensor m2 )
-{
-    int i, j, k;
-    rtensor temp;
-
-    // check if the result matrix is the same as one of m1, m2.
-    // if so, we cannot modify the contents of m1 or m2, so
-    // we have to use a temp matrix.
-    if ( ret == m1 || ret == m2 )
-    {
-        for ( i = 0; i < 3; ++i )
-            for ( j = 0; j < 3; ++j )
-            {
-                temp[i][j] = 0;
-                for ( k = 0; k < 3; ++k )
-                    temp[i][j] += m1[i][k] * m2[k][j];
-            }
-
-        for ( i = 0; i < 3; ++i )
-            for ( j = 0; j < 3; ++j )
-                ret[i][j] = temp[i][j];
-    }
-    else
-    {
-        for ( i = 0; i < 3; ++i )
-            for ( j = 0; j < 3; ++j )
-            {
-                ret[i][j] = 0;
-                for ( k = 0; k < 3; ++k )
-                    ret[i][j] += m1[i][k] * m2[k][j];
-            }
-    }
-}
-
-
-inline void rtensor_MatVec( rvec ret, rtensor m, rvec v )
-{
-    int i;
-    rvec temp;
-
-    // if ret is the same vector as v, we cannot modify the
-    // contents of v until all computation is finished.
-    if ( ret == v )
-    {
-        for ( i = 0; i < 3; ++i )
-            temp[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
-
-        for ( i = 0; i < 3; ++i )
-            ret[i] = temp[i];
-    }
-    else
-    {
-        for ( i = 0; i < 3; ++i )
-            ret[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
-    }
-}
-
-
-inline void rtensor_Scale( rtensor ret, real c, rtensor m )
-{
-    int i, j;
-
-    for ( i = 0; i < 3; ++i )
-        for ( j = 0; j < 3; ++j )
-            ret[i][j] = c * m[i][j];
-}
-
-
-inline void rtensor_Add( rtensor ret, rtensor t )
-{
-    int i, j;
-
-    for ( i = 0; i < 3; ++i )
-        for ( j = 0; j < 3; ++j )
-            ret[i][j] += t[i][j];
-}
-
-
-inline void rtensor_ScaledAdd( rtensor ret, real c, rtensor t )
-{
-    int i, j;
-
-    for ( i = 0; i < 3; ++i )
-        for ( j = 0; j < 3; ++j )
-            ret[i][j] += c * t[i][j];
-}
-
-
-inline void rtensor_Sum( rtensor ret, rtensor t1, rtensor t2 )
-{
-    int i, j;
-
-    for ( i = 0; i < 3; ++i )
-        for ( j = 0; j < 3; ++j )
-            ret[i][j] = t1[i][j] + t2[i][j];
-}
-
-
-inline void rtensor_ScaledSum( rtensor ret, real c1, rtensor t1,
-                               real c2, rtensor t2 )
-{
-    int i, j;
-
-    for ( i = 0; i < 3; ++i )
-        for ( j = 0; j < 3; ++j )
-            ret[i][j] = c1 * t1[i][j] + c2 * t2[i][j];
-}
-
-
-inline void rtensor_Copy( rtensor ret, rtensor t )
-{
-    int i, j;
-
-    for ( i = 0; i < 3; ++i )
-        for ( j = 0; j < 3; ++j )
-            ret[i][j] = t[i][j];
-}
-
-
-inline void rtensor_Identity( rtensor t )
-{
-    t[0][0] = t[1][1] = t[2][2] = 1;
-    t[0][1] = t[0][2] = t[1][0] = t[1][2] = t[2][0] = t[2][1] = 0;
-}
-
-
-inline void rtensor_MakeZero( rtensor t )
-{
-    t[0][0] = t[0][1] = t[0][2] = 0;
-    t[1][0] = t[1][1] = t[1][2] = 0;
-    t[2][0] = t[2][1] = t[2][2] = 0;
-}
-
-
-inline void rtensor_Transpose( rtensor ret, rtensor t )
-{
-    ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2];
-    ret[0][1] = t[1][0], ret[0][2] = t[2][0];
-    ret[1][0] = t[0][1], ret[1][2] = t[2][1];
-    ret[2][0] = t[0][2], ret[2][1] = t[1][2];
-}
-
-
-inline real rtensor_Det( rtensor t )
-{
-    return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) +
-             t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
-             t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
-}
-
-
-inline real rtensor_Trace( rtensor t )
-{
-    return (t[0][0] + t[1][1] + t[2][2]);
-}
-
-
-inline void Print_rTensor(FILE* fp, rtensor t)
-{
-    int i, j;
-
-    for (i = 0; i < 3; i++)
-    {
-        fprintf(fp, "[");
-        for (j = 0; j < 3; j++)
-            fprintf(fp, "%8.3f,\t", t[i][j]);
-        fprintf(fp, "]\n");
-    }
-}
-
-
-inline void ivec_MakeZero( ivec v )
-{
-// LGJ  v[0] = v[1] = v[2] = 0;
-    v[0] = v[1] = v[2] = 0.000000000000000e+00;
-}
-
-
-inline void ivec_Copy( ivec dest, ivec src )
-{
-    dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
-}
-
-
-inline void ivec_Scale( ivec dest, real C, ivec src )
-{
-    dest[0] = (int)(C * src[0]);
-    dest[1] = (int)(C * src[1]);
-    dest[2] = (int)(C * src[2]);
-}
-
-
-inline void ivec_rScale( ivec dest, real C, rvec src )
-{
-    dest[0] = (int)(C * src[0]);
-    dest[1] = (int)(C * src[1]);
-    dest[2] = (int)(C * src[2]);
-}
-
-
-inline int ivec_isZero( ivec v )
-{
-    if ( v[0] == 0 && v[1] == 0 && v[2] == 0 )
-        return 1;
-    return 0;
-}
-
-
-inline int ivec_isEqual( ivec v1, ivec v2 )
-{
-    if ( v1[0] == v2[0] && v1[1] == v2[1] && v1[2] == v2[2] )
-        return 1;
-    return 0;
-}
-
-
-inline void ivec_Sum( ivec dest, ivec v1, ivec v2 )
-{
-    dest[0] = v1[0] + v2[0];
-    dest[1] = v1[1] + v2[1];
-    dest[2] = v1[2] + v2[2];
-}
-
-
-inline void ivec_ScaledSum( ivec dest, int k1, ivec v1, int k2, ivec v2 )
-{
-    dest[0] = k1 * v1[0] + k2 * v2[0];
-    dest[1] = k1 * v1[1] + k2 * v2[1];
-    dest[2] = k1 * v1[2] + k2 * v2[2];
-}
-
-
-inline void ivec_Add( ivec dest, ivec v )
-{
-    dest[0] += v[0];
-    dest[1] += v[1];
-    dest[2] += v[2];
-}
-
-
-inline void ivec_ScaledAdd( ivec dest, int k, ivec v )
-{
-    dest[0] += k * v[0];
-    dest[1] += k * v[1];
-    dest[2] += k * v[2];
-}
-
-
-
-inline void ivec_Max( ivec res, ivec v1, ivec v2 )
-{
-    res[0] = MAX( v1[0], v2[0] );
-    res[1] = MAX( v1[1], v2[1] );
-    res[2] = MAX( v1[2], v2[2] );
-}
-
-
-inline void ivec_Max3( ivec res, ivec v1, ivec v2, ivec v3 )
-{
-    res[0] = MAX3( v1[0], v2[0], v3[0] );
-    res[1] = MAX3( v1[1], v2[1], v3[1] );
-    res[2] = MAX3( v1[2], v2[2], v3[2] );
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/vector.cu b/PG-PuReMD/src/vector.cu
deleted file mode 100644
index 489477f2ed2481c5a9003efe02c721a213b60e7c..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/vector.cu
+++ /dev/null
@@ -1,525 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "vector.h"
-#include "random.h"
-
-#if defined(SUDHIR)
-
-#ifdef __cplusplus
-extern "C"  {   
-#endif
-
-
-    inline int Vector_isZero( real* v, int k )
-    {
-        for( --k; k>=0; --k )
-            if( fabs( v[k] ) > ALMOST_ZERO )
-                return 0;
-
-        return 1;
-    }
-
-
-    inline void Vector_MakeZero( real *v, int k )
-    {
-        for( --k; k>=0; --k )
-            v[k] = 0;
-    }
-
-
-    inline void Vector_Copy( real* dest, real* v, int k )
-    {
-        for( --k; k>=0; --k )
-            dest[k] = v[k];
-    }
-
-
-    inline void Vector_Scale( real* dest, real c, real* v, int k )
-    {
-        for( --k; k>=0; --k )
-            dest[k] = c * v[k];
-    }
-
-
-    inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
-    {
-        for( --k; k>=0; --k )
-            dest[k] = c * v[k] + d * y[k];
-    }
-
-
-    inline void Vector_Add( real* dest, real c, real* v, int k )
-    {
-        for( --k; k>=0; --k )
-            dest[k] += c * v[k];
-    }
-
-
-    inline real Dot( real* v1, real* v2, int k )
-    {
-        real ret = 0;
-
-        for( --k; k>=0; --k )
-            ret +=  v1[k] * v2[k];
-
-        return ret;
-    }
-
-
-    inline real Norm( real* v1, int k )
-    {
-        real ret = 0;
-
-        for( --k; k>=0; --k )
-            ret +=  SQR( v1[k] );
-
-        return SQRT( ret );
-    }
-
-
-    inline void Vector_Print( FILE *fout, char *vname, real *v, int k )
-    {
-        int i;
-
-        fprintf( fout, "%s:", vname );
-        for( i = 0; i < k; ++i )
-            fprintf( fout, "%24.15e\n", v[i] );
-        fprintf( fout, "\n" );
-    }
-
-
-    void rvec_Copy( rvec dest, rvec src )
-    {
-        dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
-    }
-
-    inline void rvec_Scale( rvec ret, real c, rvec v )
-    {
-        ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2];
-    }
-
-
-    inline void rvec_Add( rvec ret, rvec v )
-    {
-        ret[0] += v[0], ret[1] += v[1], ret[2] += v[2];
-    }
-
-
-    inline void rvec_ScaledAdd( rvec ret, real c, rvec v )
-    {
-        ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2];
-    }
-
-
-    inline void rvec_Sum( rvec ret, rvec v1 ,rvec v2 )
-    {
-        ret[0] = v1[0] + v2[0];
-        ret[1] = v1[1] + v2[1];
-        ret[2] = v1[2] + v2[2];
-    }
-
-
-    inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 ,real c2, rvec v2 )
-    {
-        ret[0] = c1 * v1[0] + c2 * v2[0]; 
-        ret[1] = c1 * v1[1] + c2 * v2[1];
-        ret[2] = c1 * v1[2] + c2 * v2[2];
-    }
-
-
-    inline real rvec_Dot( rvec v1, rvec v2 )
-    {
-        return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2];
-    }
-
-
-    inline real rvec_ScaledDot( real c1, rvec v1, real c2, rvec v2 )
-    {
-        return (c1*c2) * (v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]);
-    }
-
-
-    inline void rvec_Multiply( rvec r, rvec v1, rvec v2 )
-    {
-        r[0] = v1[0] * v2[0];
-        r[1] = v1[1] * v2[1];
-        r[2] = v1[2] * v2[2];
-    }
-
-
-    inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
-    {
-        r[0] = v1[0] * v2[0];
-        r[1] = v1[1] * v2[1];
-        r[2] = v1[2] * v2[2];
-    }
-
-
-    inline void rvec_Divide( rvec r, rvec v1, rvec v2 )
-    {
-        r[0] = v1[0] / v2[0];
-        r[1] = v1[1] / v2[1];
-        r[2] = v1[2] / v2[2];
-    }
-
-
-    inline void rvec_iDivide( rvec r, rvec v1, ivec v2 )
-    {
-        r[0] = v1[0] / v2[0];
-        r[1] = v1[1] / v2[1];
-        r[2] = v1[2] / v2[2];
-    }
-
-
-    inline void rvec_Invert( rvec r, rvec v )
-    {
-        r[0] = 1. / v[0];
-        r[1] = 1. / v[1];
-        r[2] = 1. / v[2];
-    }
-
-
-    inline void rvec_Cross( rvec ret, rvec v1, rvec v2 )
-    {
-        ret[0] = v1[1] * v2[2] - v1[2] * v2[1];
-        ret[1] = v1[2] * v2[0] - v1[0] * v2[2];
-        ret[2] = v1[0] * v2[1] - v1[1] * v2[0];
-    }
-
-
-    inline void rvec_OuterProduct( rtensor r, rvec v1, rvec v2 )
-    {
-        int i, j;
-
-        for( i = 0; i < 3; ++i )
-            for( j = 0; j < 3; ++j )
-                r[i][j] = v1[i] * v2[j];
-    }
-
-
-    inline real rvec_Norm_Sqr( rvec v )
-    {
-        return SQR(v[0]) + SQR(v[1]) + SQR(v[2]);
-    }
-
-
-    inline real rvec_Norm( rvec v )
-    {
-        return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) );
-    }
-
-
-    inline int rvec_isZero( rvec v )
-    {
-        if( fabs(v[0]) > ALMOST_ZERO || 
-                fabs(v[1]) > ALMOST_ZERO || 
-                fabs(v[2]) > ALMOST_ZERO )
-            return 0;
-        return 1;
-    }
-
-    inline void rvec_MakeZero( rvec v )
-    {
-        //  v[0] = v[1] = v[2] = 0.0000000000000;
-        v[0] = v[1] = v[2] = 0.000000000000000e+00;
-    }
-
-
-#if defined(PURE_REAX)
-    inline void rvec_Random( rvec v )
-    {
-        v[0] = Random(2.0)-1.0;
-        v[1] = Random(2.0)-1.0;
-        v[2] = Random(2.0)-1.0;
-    }
-#endif
-
-
-    inline void rtensor_Multiply( rtensor ret, rtensor m1, rtensor m2 )
-    {
-        int i, j, k;
-        rtensor temp;
-
-        // check if the result matrix is the same as one of m1, m2.
-        // if so, we cannot modify the contents of m1 or m2, so 
-        // we have to use a temp matrix.
-        if( ret == m1 || ret == m2 )
-        {
-            for( i = 0; i < 3; ++i )
-                for( j = 0; j < 3; ++j )
-                {
-                    temp[i][j] = 0;        
-                    for( k = 0; k < 3; ++k )
-                        temp[i][j] += m1[i][k] * m2[k][j];
-                }
-
-            for( i = 0; i < 3; ++i )
-                for( j = 0; j < 3; ++j )
-                    ret[i][j] = temp[i][j];    
-        }
-        else
-        {
-            for( i = 0; i < 3; ++i )
-                for( j = 0; j < 3; ++j )
-                {
-                    ret[i][j] = 0;        
-                    for( k = 0; k < 3; ++k )
-                        ret[i][j] += m1[i][k] * m2[k][j];
-                }
-        }
-    }
-
-
-    inline void rtensor_MatVec( rvec ret, rtensor m, rvec v )
-    {
-        int i;
-        rvec temp;
-
-        // if ret is the same vector as v, we cannot modify the 
-        // contents of v until all computation is finished.
-        if( ret == v )
-        {
-            for( i = 0; i < 3; ++i )
-                temp[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
-
-            for( i = 0; i < 3; ++i )
-                ret[i] = temp[i];
-        }
-        else
-        {
-            for( i = 0; i < 3; ++i )
-                ret[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
-        }
-    }
-
-
-    inline void rtensor_Scale( rtensor ret, real c, rtensor m )
-    {
-        int i, j;
-
-        for( i = 0; i < 3; ++i )
-            for( j = 0; j < 3; ++j )
-                ret[i][j] = c * m[i][j];
-    }
-
-
-    inline void rtensor_Add( rtensor ret, rtensor t )
-    {
-        int i, j;
-
-        for( i = 0; i < 3; ++i )
-            for( j = 0; j < 3; ++j )
-                ret[i][j] += t[i][j];
-    }
-
-
-    inline void rtensor_ScaledAdd( rtensor ret, real c, rtensor t )
-    {
-        int i, j;
-
-        for( i = 0; i < 3; ++i )
-            for( j = 0; j < 3; ++j )
-                ret[i][j] += c * t[i][j];
-    }
-
-
-    inline void rtensor_Sum( rtensor ret, rtensor t1, rtensor t2 )
-    {
-        int i, j;
-
-        for( i = 0; i < 3; ++i )
-            for( j = 0; j < 3; ++j )
-                ret[i][j] = t1[i][j] + t2[i][j];
-    }
-
-
-    inline void rtensor_ScaledSum( rtensor ret, real c1, rtensor t1, 
-            real c2, rtensor t2 )
-    {
-        int i, j;
-
-        for( i = 0; i < 3; ++i )
-            for( j = 0; j < 3; ++j )
-                ret[i][j] = c1 * t1[i][j] + c2 * t2[i][j];
-    }
-
-
-    inline void rtensor_Copy( rtensor ret, rtensor t )
-    {
-        int i, j;
-
-        for( i = 0; i < 3; ++i )
-            for( j = 0; j < 3; ++j )
-                ret[i][j] = t[i][j];
-    }
-
-
-    inline void rtensor_Identity( rtensor t )
-    {
-        t[0][0] = t[1][1] = t[2][2] = 1;
-        t[0][1] = t[0][2] = t[1][0] = t[1][2] = t[2][0] = t[2][1] = 0;
-    }
-
-
-    inline void rtensor_MakeZero( rtensor t )
-    {
-        t[0][0] = t[0][1] = t[0][2] = 0;
-        t[1][0] = t[1][1] = t[1][2] = 0;
-        t[2][0] = t[2][1] = t[2][2] = 0;
-    }
-
-
-    inline void rtensor_Transpose( rtensor ret, rtensor t )
-    {
-        ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2];
-        ret[0][1] = t[1][0], ret[0][2] = t[2][0];
-        ret[1][0] = t[0][1], ret[1][2] = t[2][1];
-        ret[2][0] = t[0][2], ret[2][1] = t[1][2];
-    }
-
-
-    inline real rtensor_Det( rtensor t )
-    {
-        return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) +
-                t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
-                t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
-    }
-
-
-    inline real rtensor_Trace( rtensor t )
-    {
-        return (t[0][0] + t[1][1] + t[2][2]);
-    }
-
-
-    inline void Print_rTensor(FILE* fp, rtensor t)
-    {
-        int i, j;
-
-        for (i=0; i < 3; i++)
-        {
-            fprintf(fp,"[");
-            for (j=0; j < 3; j++)
-                fprintf(fp,"%8.3f,\t",t[i][j]);
-            fprintf(fp,"]\n");
-        }
-    }
-
-
-    inline void ivec_MakeZero( ivec v )
-    {
-        // LGJ  v[0] = v[1] = v[2] = 0;
-        v[0] = v[1] = v[2] = 0.000000000000000e+00;
-    }
-
-
-    inline void ivec_Copy( ivec dest, ivec src )
-    {
-        dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
-    }
-
-
-    inline void ivec_Scale( ivec dest, real C, ivec src )
-    {
-        dest[0] = (int)(C * src[0]);
-        dest[1] = (int)(C * src[1]);
-        dest[2] = (int)(C * src[2]);
-    }
-
-
-    inline void ivec_rScale( ivec dest, real C, rvec src )
-    {
-        dest[0] = (int)(C * src[0]);
-        dest[1] = (int)(C * src[1]);
-        dest[2] = (int)(C * src[2]);
-    }
-
-
-    inline int ivec_isZero( ivec v )
-    {
-        if( v[0]==0 && v[1]==0 && v[2]==0 )
-            return 1;
-        return 0;
-    }
-
-
-    inline int ivec_isEqual( ivec v1, ivec v2 )
-    {
-        if( v1[0]==v2[0] && v1[1]==v2[1] && v1[2]==v2[2] )
-            return 1;
-        return 0;
-    }
-
-
-    inline void ivec_Sum( ivec dest, ivec v1, ivec v2 )
-    {
-        dest[0] = v1[0] + v2[0];
-        dest[1] = v1[1] + v2[1];
-        dest[2] = v1[2] + v2[2];
-    }
-
-
-    inline void ivec_ScaledSum( ivec dest, int k1, ivec v1, int k2, ivec v2 )
-    {
-        dest[0] = k1*v1[0] + k2*v2[0];
-        dest[1] = k1*v1[1] + k2*v2[1];
-        dest[2] = k1*v1[2] + k2*v2[2];
-    }
-
-
-    inline void ivec_Add( ivec dest, ivec v )
-    {
-        dest[0] += v[0];
-        dest[1] += v[1];
-        dest[2] += v[2];
-    }
-
-
-    inline void ivec_ScaledAdd( ivec dest, int k, ivec v )
-    {
-        dest[0] += k * v[0];
-        dest[1] += k * v[1];
-        dest[2] += k * v[2];
-    }
-
-
-
-    inline void ivec_Max( ivec res, ivec v1, ivec v2 )
-    {
-        res[0] = MAX( v1[0], v2[0] );
-        res[1] = MAX( v1[1], v2[1] );
-        res[2] = MAX( v1[2], v2[2] );
-    }
-
-
-    inline void ivec_Max3( ivec res, ivec v1, ivec v2, ivec v3 )
-    {
-        res[0] = MAX3( v1[0], v2[0], v3[0] );
-        res[1] = MAX3( v1[1], v2[1], v3[1] );
-        res[2] = MAX3( v1[2], v2[2], v3[2] );
-    }
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/vector.h b/PG-PuReMD/src/vector.h
index 539f5db36ab88b764567e1455217eab6fef40297..1cfd7950df50ac663a583bfc874d73b9d133ed42 100644
--- a/PG-PuReMD/src/vector.h
+++ b/PG-PuReMD/src/vector.h
@@ -23,127 +23,81 @@
 #define __VECTOR_H_
 
 #include "reax_types.h"
+
 #include "random.h"
 
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-
-CUDA_HOST_DEVICE static inline int  Vector_isZero( real*, int );
-CUDA_HOST_DEVICE static inline void Vector_MakeZero( real*, int );
-CUDA_HOST_DEVICE static inline void Vector_Copy( real*, real*, int );
-CUDA_HOST_DEVICE static inline void Vector_Scale( real*, real, real*, int );
-CUDA_HOST_DEVICE static inline void Vector_Sum( real*, real, real*, real, real*, int );
-CUDA_HOST_DEVICE static inline void Vector_Add( real*, real, real*, int );
-CUDA_HOST_DEVICE static inline real Dot( real*, real*, int );
-CUDA_HOST_DEVICE static inline real Norm( real*, int );
-CUDA_HOST_DEVICE static inline void Vector_Print( FILE*, char*, real*, int );
-
-CUDA_HOST_DEVICE static inline void rvec_Copy( rvec, rvec );
-CUDA_HOST_DEVICE static inline void rvec_Scale( rvec, real, rvec );
-CUDA_HOST_DEVICE static inline void rvec_Add( rvec, rvec );
-CUDA_HOST_DEVICE static inline void rvec_ScaledAdd( rvec, real, rvec );
-CUDA_HOST_DEVICE static inline void rvec_Sum( rvec, rvec, rvec );
-CUDA_HOST_DEVICE static inline void rvec_ScaledSum( rvec, real, rvec, real, rvec );
-CUDA_HOST_DEVICE static inline real rvec_Dot( rvec, rvec );
-CUDA_HOST_DEVICE static inline real rvec_ScaledDot( real, rvec, real, rvec );
-CUDA_HOST_DEVICE static inline void rvec_Multiply( rvec, rvec, rvec );
-CUDA_HOST_DEVICE static inline void rvec_iMultiply( rvec, ivec, rvec );
-CUDA_HOST_DEVICE static inline void rvec_Divide( rvec, rvec, rvec );
-CUDA_HOST_DEVICE static inline void rvec_iDivide( rvec, rvec, ivec );
-CUDA_HOST_DEVICE static inline void rvec_Invert( rvec, rvec );
-CUDA_HOST_DEVICE static inline void rvec_Cross( rvec, rvec, rvec );
-CUDA_HOST_DEVICE static inline void rvec_OuterProduct( rtensor, rvec, rvec );
-CUDA_HOST_DEVICE static inline real rvec_Norm_Sqr( rvec );
-CUDA_HOST_DEVICE static inline real rvec_Norm( rvec );
-CUDA_HOST_DEVICE static inline int  rvec_isZero( rvec );
-CUDA_HOST_DEVICE static inline void rvec_MakeZero( rvec );
-CUDA_HOST_DEVICE static inline void rvec_Random( rvec );
-
-CUDA_HOST_DEVICE static inline void rtensor_MakeZero( rtensor );
-CUDA_HOST_DEVICE static inline void rtensor_Multiply( rtensor, rtensor, rtensor );
-CUDA_HOST_DEVICE static inline void rtensor_MatVec( rvec, rtensor, rvec );
-CUDA_HOST_DEVICE static inline void rtensor_Scale( rtensor, real, rtensor );
-CUDA_HOST_DEVICE static inline void rtensor_Add( rtensor, rtensor );
-CUDA_HOST_DEVICE static inline void rtensor_ScaledAdd( rtensor, real, rtensor );
-CUDA_HOST_DEVICE static inline void rtensor_Sum( rtensor, rtensor, rtensor );
-CUDA_HOST_DEVICE static inline void rtensor_ScaledSum( rtensor, real, rtensor, real, rtensor );
-CUDA_HOST_DEVICE static inline void rtensor_Scale( rtensor, real, rtensor );
-CUDA_HOST_DEVICE static inline void rtensor_Copy( rtensor, rtensor );
-CUDA_HOST_DEVICE static inline void rtensor_Identity( rtensor );
-CUDA_HOST_DEVICE static inline void rtensor_Transpose( rtensor, rtensor );
-CUDA_HOST_DEVICE static inline real rtensor_Det( rtensor );
-CUDA_HOST_DEVICE static inline real rtensor_Trace( rtensor );
-
-CUDA_HOST_DEVICE static inline void Print_rTensor(FILE*, rtensor);
-
-CUDA_HOST_DEVICE static inline int  ivec_isZero( ivec );
-CUDA_HOST_DEVICE static inline int  ivec_isEqual( ivec, ivec );
-CUDA_HOST_DEVICE static inline void ivec_MakeZero( ivec );
-CUDA_HOST_DEVICE static inline void ivec_Copy( ivec, ivec );
-CUDA_HOST_DEVICE static inline void ivec_Scale( ivec, real, ivec );
-CUDA_HOST_DEVICE static inline void ivec_rScale( ivec, real, rvec );
-CUDA_HOST_DEVICE static inline void ivec_Sum( ivec, ivec, ivec );
-CUDA_HOST_DEVICE static inline void ivec_ScaledSum( ivec, int, ivec, int, ivec );
-CUDA_HOST_DEVICE static inline void ivec_Add( ivec, ivec );
-CUDA_HOST_DEVICE static inline void ivec_ScaledAdd( ivec, int, ivec );
-CUDA_HOST_DEVICE static inline void ivec_Max( ivec, ivec, ivec );
-CUDA_HOST_DEVICE static inline void ivec_Max3( ivec, ivec, ivec, ivec );
-
-
 #if defined(LAMMPS_REAX) || defined(PURE_REAX)
 CUDA_HOST_DEVICE static inline int Vector_isZero( real* v, int k )
 {
     for ( --k; k >= 0; --k )
-        if ( fabs( v[k] ) > ALMOST_ZERO )
-            return 0;
+    {
+        if ( FABS( v[k] ) > ALMOST_ZERO )
+        {
+            return FALSE;
+        }
+    }
 
-    return 1;
+    return TRUE;
 }
 
 
 CUDA_HOST_DEVICE static inline void Vector_MakeZero( real *v, int k )
 {
     for ( --k; k >= 0; --k )
+    {
         v[k] = 0;
+    }
 }
 
 
 CUDA_HOST_DEVICE static inline void Vector_Copy( real* dest, real* v, int k )
 {
     for ( --k; k >= 0; --k )
+    {
         dest[k] = v[k];
+    }
 }
 
 
 CUDA_HOST_DEVICE static inline void Vector_Scale( real* dest, real c, real* v, int k )
 {
     for ( --k; k >= 0; --k )
+    {
         dest[k] = c * v[k];
+    }
 }
 
 
 CUDA_HOST_DEVICE static inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
 {
     for ( --k; k >= 0; --k )
+    {
         dest[k] = c * v[k] + d * y[k];
+    }
 }
 
 
 CUDA_HOST_DEVICE static inline void Vector_Add( real* dest, real c, real* v, int k )
 {
     for ( --k; k >= 0; --k )
+    {
         dest[k] += c * v[k];
+    }
 }
 
 
 CUDA_HOST_DEVICE static inline real Dot( real* v1, real* v2, int k )
 {
-    real ret = 0;
+    real ret = 0.0;
 
     for ( --k; k >= 0; --k )
+    {
         ret +=  v1[k] * v2[k];
+    }
 
     return ret;
 }
@@ -151,10 +105,12 @@ CUDA_HOST_DEVICE static inline real Dot( real* v1, real* v2, int k )
 
 CUDA_HOST_DEVICE static inline real Norm( real* v1, int k )
 {
-    real ret = 0;
+    real ret = 0.0;
 
     for ( --k; k >= 0; --k )
+    {
         ret +=  SQR( v1[k] );
+    }
 
     return SQRT( ret );
 }
@@ -166,31 +122,42 @@ CUDA_HOST_DEVICE static inline void Vector_Print( FILE *fout, char *vname, real
 
     fprintf( fout, "%s:", vname );
     for ( i = 0; i < k; ++i )
+    {
         fprintf( fout, "%8.3f\n", v[i] );
+    }
     fprintf( fout, "\n" );
 }
 
 
 CUDA_HOST_DEVICE static inline void rvec_Copy( rvec dest, rvec src )
 {
-    dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
+    dest[0] = src[0];
+    dest[1] = src[1];
+    dest[2] = src[2];
 }
 
+
 CUDA_HOST_DEVICE static inline void rvec_Scale( rvec ret, real c, rvec v )
 {
-    ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2];
+    ret[0] = c * v[0];
+    ret[1] = c * v[1];
+    ret[2] = c * v[2];
 }
 
 
 CUDA_HOST_DEVICE static inline void rvec_Add( rvec ret, rvec v )
 {
-    ret[0] += v[0], ret[1] += v[1], ret[2] += v[2];
+    ret[0] += v[0];
+    ret[1] += v[1];
+    ret[2] += v[2];
 }
 
 
 CUDA_HOST_DEVICE static inline void rvec_ScaledAdd( rvec ret, real c, rvec v )
 {
-    ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2];
+    ret[0] += c * v[0];
+    ret[1] += c * v[1];
+    ret[2] += c * v[2];
 }
 
 
@@ -256,9 +223,9 @@ CUDA_HOST_DEVICE static inline void rvec_iDivide( rvec r, rvec v1, ivec v2 )
 
 CUDA_HOST_DEVICE static inline void rvec_Invert( rvec r, rvec v )
 {
-    r[0] = 1. / v[0];
-    r[1] = 1. / v[1];
-    r[2] = 1. / v[2];
+    r[0] = 1.0 / v[0];
+    r[1] = 1.0 / v[1];
+    r[2] = 1.0 / v[2];
 }
 
 
@@ -275,8 +242,12 @@ CUDA_HOST_DEVICE static inline void rvec_OuterProduct( rtensor r, rvec v1, rvec
     int i, j;
 
     for ( i = 0; i < 3; ++i )
+    {
         for ( j = 0; j < 3; ++j )
+        {
             r[i][j] = v1[i] * v2[j];
+        }
+    }
 }
 
 
@@ -294,26 +265,31 @@ CUDA_HOST_DEVICE static inline real rvec_Norm( rvec v )
 
 CUDA_HOST_DEVICE static inline int rvec_isZero( rvec v )
 {
-    if ( fabs(v[0]) > ALMOST_ZERO ||
-            fabs(v[1]) > ALMOST_ZERO ||
-            fabs(v[2]) > ALMOST_ZERO )
-        return 0;
-    return 1;
+    if ( FABS(v[0]) > ALMOST_ZERO ||
+            FABS(v[1]) > ALMOST_ZERO ||
+            FABS(v[2]) > ALMOST_ZERO )
+    {
+        return FALSE;
+    }
+
+    return TRUE;
 }
 
 
 CUDA_HOST_DEVICE static inline void rvec_MakeZero( rvec v )
 {
-    v[0] = v[1] = v[2] = 0.0000000000000;
+    v[0] = 0.0000000000000;
+    v[1] = 0.0000000000000;
+    v[2] = 0.0000000000000;
 }
 
 
 #if defined(PURE_REAX)
-CUDA_HOST_DEVICE static inline void rvec_Random( rvec v )
+static inline void rvec_Random( rvec v )
 {
-    v[0] = Random(2.0) - 1.0;
-    v[1] = Random(2.0) - 1.0;
-    v[2] = Random(2.0) - 1.0;
+    v[0] = Random( 2.0 ) - 1.0;
+    v[1] = Random( 2.0 ) - 1.0;
+    v[2] = Random( 2.0 ) - 1.0;
 }
 #endif
 
@@ -329,26 +305,36 @@ CUDA_HOST_DEVICE static inline void rtensor_Multiply( rtensor ret, rtensor m1, r
     if ( ret == m1 || ret == m2 )
     {
         for ( i = 0; i < 3; ++i )
+        {
             for ( j = 0; j < 3; ++j )
             {
                 temp[i][j] = 0;
                 for ( k = 0; k < 3; ++k )
                     temp[i][j] += m1[i][k] * m2[k][j];
             }
+        }
 
         for ( i = 0; i < 3; ++i )
+        {
             for ( j = 0; j < 3; ++j )
+            {
                 ret[i][j] = temp[i][j];
+            }
+        }
     }
     else
     {
         for ( i = 0; i < 3; ++i )
+        {
             for ( j = 0; j < 3; ++j )
             {
                 ret[i][j] = 0;
                 for ( k = 0; k < 3; ++k )
+                {
                     ret[i][j] += m1[i][k] * m2[k][j];
+                }
             }
+        }
     }
 }
 
@@ -363,15 +349,21 @@ CUDA_HOST_DEVICE static inline void rtensor_MatVec( rvec ret, rtensor m, rvec v
     if ( ret == v )
     {
         for ( i = 0; i < 3; ++i )
+        {
             temp[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
+        }
 
         for ( i = 0; i < 3; ++i )
+        {
             ret[i] = temp[i];
+        }
     }
     else
     {
         for ( i = 0; i < 3; ++i )
+        {
             ret[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
+        }
     }
 }
 
@@ -381,8 +373,12 @@ CUDA_HOST_DEVICE static inline void rtensor_Scale( rtensor ret, real c, rtensor
     int i, j;
 
     for ( i = 0; i < 3; ++i )
+    {
         for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] = c * m[i][j];
+        }
+    }
 }
 
 
@@ -391,8 +387,12 @@ CUDA_HOST_DEVICE static inline void rtensor_Add( rtensor ret, rtensor t )
     int i, j;
 
     for ( i = 0; i < 3; ++i )
+    {
         for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] += t[i][j];
+        }
+    }
 }
 
 
@@ -401,8 +401,12 @@ CUDA_HOST_DEVICE static inline void rtensor_ScaledAdd( rtensor ret, real c, rten
     int i, j;
 
     for ( i = 0; i < 3; ++i )
+    {
         for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] += c * t[i][j];
+        }
+    }
 }
 
 
@@ -411,8 +415,12 @@ CUDA_HOST_DEVICE static inline void rtensor_Sum( rtensor ret, rtensor t1, rtenso
     int i, j;
 
     for ( i = 0; i < 3; ++i )
+    {
         for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] = t1[i][j] + t2[i][j];
+        }
+    }
 }
 
 
@@ -422,8 +430,12 @@ CUDA_HOST_DEVICE static inline void rtensor_ScaledSum( rtensor ret, real c1, rte
     int i, j;
 
     for ( i = 0; i < 3; ++i )
+    {
         for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] = c1 * t1[i][j] + c2 * t2[i][j];
+        }
+    }
 }
 
 
@@ -432,40 +444,62 @@ CUDA_HOST_DEVICE static inline void rtensor_Copy( rtensor ret, rtensor t )
     int i, j;
 
     for ( i = 0; i < 3; ++i )
+    {
         for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] = t[i][j];
+        }
+    }
 }
 
 
 CUDA_HOST_DEVICE static inline void rtensor_Identity( rtensor t )
 {
-    t[0][0] = t[1][1] = t[2][2] = 1;
-    t[0][1] = t[0][2] = t[1][0] = t[1][2] = t[2][0] = t[2][1] = 0;
+    t[0][0] = 1.0;
+    t[1][1] = 1.0;
+    t[2][2] = 1.0;
+    t[0][1] = 0.0;
+    t[0][2] = 0.0;
+    t[1][0] = 0.0;
+    t[1][2] = 0.0;
+    t[2][0] = 0.0;
+    t[2][1] = 0.0;
 }
 
 
 CUDA_HOST_DEVICE static inline void rtensor_MakeZero( rtensor t )
 {
-    t[0][0] = t[0][1] = t[0][2] = 0;
-    t[1][0] = t[1][1] = t[1][2] = 0;
-    t[2][0] = t[2][1] = t[2][2] = 0;
+    t[0][0] = 0.0;
+    t[0][1] = 0.0;
+    t[0][2] = 0.0;
+    t[1][0] = 0.0;
+    t[1][1] = 0.0;
+    t[1][2] = 0.0;
+    t[2][0] = 0.0;
+    t[2][1] = 0.0;
+    t[2][2] = 0.0;
 }
 
 
 CUDA_HOST_DEVICE static inline void rtensor_Transpose( rtensor ret, rtensor t )
 {
-    ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2];
-    ret[0][1] = t[1][0], ret[0][2] = t[2][0];
-    ret[1][0] = t[0][1], ret[1][2] = t[2][1];
-    ret[2][0] = t[0][2], ret[2][1] = t[1][2];
+    ret[0][0] = t[0][0];
+    ret[1][1] = t[1][1];
+    ret[2][2] = t[2][2];
+    ret[0][1] = t[1][0];
+    ret[0][2] = t[2][0];
+    ret[1][0] = t[0][1];
+    ret[1][2] = t[2][1];
+    ret[2][0] = t[0][2];
+    ret[2][1] = t[1][2];
 }
 
 
 CUDA_HOST_DEVICE static inline real rtensor_Det( rtensor t )
 {
     return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) +
-             t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
-             t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
+            t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
+            t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
 }
 
 
@@ -483,7 +517,9 @@ CUDA_HOST_DEVICE static inline void Print_rTensor(FILE* fp, rtensor t)
     {
         fprintf(fp, "[");
         for (j = 0; j < 3; j++)
+        {
             fprintf(fp, "%8.3f,\t", t[i][j]);
+        }
         fprintf(fp, "]\n");
     }
 }
@@ -491,13 +527,17 @@ CUDA_HOST_DEVICE static inline void Print_rTensor(FILE* fp, rtensor t)
 
 CUDA_HOST_DEVICE static inline void ivec_MakeZero( ivec v )
 {
-    v[0] = v[1] = v[2] = 0;
+    v[0] = 0;
+    v[1] = 0;
+    v[2] = 0;
 }
 
 
 CUDA_HOST_DEVICE static inline void ivec_Copy( ivec dest, ivec src )
 {
-    dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
+    dest[0] = src[0];
+    dest[1] = src[1];
+    dest[2] = src[2];
 }
 
 
@@ -520,16 +560,22 @@ CUDA_HOST_DEVICE static inline void ivec_rScale( ivec dest, real C, rvec src )
 CUDA_HOST_DEVICE static inline int ivec_isZero( ivec v )
 {
     if ( v[0] == 0 && v[1] == 0 && v[2] == 0 )
-        return 1;
-    return 0;
+    {
+        return TRUE;
+    }
+
+    return FALSE;
 }
 
 
 CUDA_HOST_DEVICE static inline int ivec_isEqual( ivec v1, ivec v2 )
 {
     if ( v1[0] == v2[0] && v1[1] == v2[1] && v1[2] == v2[2] )
-        return 1;
-    return 0;
+    {
+        return TRUE;
+    }
+
+    return FALSE;
 }
 
 
diff --git a/PuReMD-GPU/Makefile.am b/PuReMD-GPU/Makefile.am
index 3ab7bdba493aa8a5d56c64a100239eeff99d4563..f57bed636093bf0d5a822a4c77d5861524705e26 100644
--- a/PuReMD-GPU/Makefile.am
+++ b/PuReMD-GPU/Makefile.am
@@ -1,7 +1,9 @@
 ACLOCAL_AMFLAGS = -I ../m4
 
+if USE_CUDA
 SUFFIXES = .cu
 include ../cuda.am
+endif
 
 AM_CFLAGS = -Wall -O3 -funroll-loops -fstrict-aliasing -m64
 AM_CPPFLAGS =
@@ -18,21 +20,35 @@ NVCCFLAGS += --compiler-options "$(DEFS) -D__SM_35__ -O3 -funroll-loops -fstrict
 #NVCCFLAGS += -Xcompiler -fPIC -dc
 #NVCCFLAGS += --ptxas-options -v
 
-bin_PROGRAMS = bin/puremd-gpu
-bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c \
-	src/restart.c src/param.c src/pdb_tools.c src/box.c \
-	src/lin_alg.c src/QEq.c src/allocate.c src/bond_orders.c \
+bin_PROGRAMS = bin/spuremd
+bin_spuremd_SOURCES = src/analyze.c src/print_utils.c \
+	src/restart.c src/tool_box.c src/control.c src/ffield.c \
+	src/geo_tools.c src/box.c \
+	src/lin_alg.c src/qeq.c src/allocate.c src/bond_orders.c \
 	src/forces.c src/four_body_interactions.c \
 	src/grid.c src/init_md.c src/integrate.c src/list.c \
 	src/lookup.c src/neighbors.c \
 	src/reset_utils.c src/single_body_interactions.c \
 	src/system_props.c src/three_body_interactions.c \
 	src/traj.c src/two_body_interactions.c src/vector.c \
-	src/testmd.c \
-	src/cuda_utils.cu src/cuda_copy.cu src/cuda_init.cu src/cuda_reduction.cu \
-	src/cuda_center_mass.cu src/cuda_box.cu src/validation.cu \
+	src/testmd.c
+include_HEADERS = src/mytypes.h src/analyze.h src/print_utils.h \
+	src/restart.h src/tool_box.h src/control.h src/ffield.h \
+	src/geo_tools.h src/box.h \
+	src/lin_alg.h src/qeq.h src/allocate.h src/bond_orders.h \
+	src/forces.h src/four_body_interactions.h \
+	src/grid.h src/init_md.h src/integrate.h src/list.h \
+	src/lookup.h src/neighbors.h \
+	src/reset_utils.h src/single_body_interactions.h \
+	src/system_props.h src/three_body_interactions.h \
+	src/traj.h src/two_body_interactions.h src/vector.h
+
+if USE_CUDA
+bin_spuremd_SOURCES += src/cuda_utils.cu src/cuda_copy.cu \
+	src/cuda_init.cu src/cuda_reduction.cu \
+	src/cuda_center_mass.cu src/cuda_box.cu src/cuda_validation.cu \
         src/cuda_allocate.cu src/cuda_bond_orders.cu \
-	src/cuda_lin_alg.cu src/cuda_QEq.cu \
+	src/cuda_lin_alg.cu src/cuda_qeq.cu \
         src/cuda_forces.cu src/cuda_four_body_interactions.cu \
 	src/cuda_grid.cu src/cuda_init_md.cu src/cuda_integrate.cu src/cuda_list.cu \
 	src/cuda_lookup.cu src/cuda_neighbors.cu \
@@ -40,19 +56,11 @@ bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c \
         src/cuda_system_props.cu src/cuda_three_body_interactions.cu \
 	src/cuda_two_body_interactions.cu src/cuda_environment.cu \
 	src/cuda_post_evolve.cu
-include_HEADERS = src/mytypes.h src/analyze.h src/print_utils.h \
-        src/restart.h src/param.h src/pdb_tools.h src/box.h \
-	src/lin_alg.h src/QEq.h src/allocate.h src/bond_orders.h \
-	src/forces.h src/four_body_interactions.h \
-	src/grid.h src/init_md.h src/integrate.h src/list.h \
-	src/lookup.h src/neighbors.h \
-	src/reset_utils.h src/single_body_interactions.h \
-	src/system_props.h src/three_body_interactions.h \
-	src/traj.h src/two_body_interactions.h src/vector.h \
-	src/cuda_utils.h src/cuda_copy.h src/cuda_init.h src/cuda_reduction.h \
-	src/cuda_center_mass.h src/cuda_box.h src/validation.h \
+include_HEADERS += src/cuda_utils.h src/cuda_copy.h \
+	src/cuda_init.h src/cuda_reduction.h \
+	src/cuda_center_mass.h src/cuda_box.h src/cuda_validation.h \
         src/cuda_allocate.h src/cuda_bond_orders.h \
-	src/cuda_lin_alg.h src/cuda_QEq.h \
+	src/cuda_lin_alg.h src/cuda_qeq.h \
         src/cuda_forces.h src/cuda_four_body_interactions.h \
 	src/cuda_grid.h src/cuda_init_md.h src/cuda_integrate.h src/cuda_list.h \
 	src/cuda_lookup.h src/cuda_neighbors.h \
@@ -62,8 +70,16 @@ include_HEADERS = src/mytypes.h src/analyze.h src/print_utils.h \
 	src/cuda_post_evolve.h
 
 # dummy source to cause C linking
-nodist_EXTRA_bin_puremd_gpu_SOURCES = src/dummy.c
+nodist_EXTRA_bin_spuremd_SOURCES = src/dummy.c
+
+endif
+
+
+bin_spuremd_CFLAGS = $(AM_CFLAGS) $(CFLAGS)
+bin_spuremd_CPPFLAGS = $(AM_CPPFLAGS) $(CPPFLAGS)
+bin_spuremd_LDFLAGS = $(AM_LDFLAGS) $(LDFLAGS)
 
-bin_puremd_gpu_CFLAGS = $(AM_CFLAGS) $(CFLAGS) $(CUDA_CFLAGS)
-bin_puremd_gpu_CPPFLAGS = $(AM_CPPFLAGS) $(CPPFLAGS)
-bin_puremd_gpu_LDFLAGS = $(AM_LDFLAGS) $(LDFLAGS) $(CUDA_LIBS)
+if USE_CUDA
+bin_spuremd_CFLAGS += $(CUDA_CFLAGS)
+bin_spuremd_LDFLAGS += $(CUDA_LIBS)
+endif
diff --git a/PuReMD-GPU/aclocal.m4 b/PuReMD-GPU/aclocal.m4
index 2e1d098d2159d3a3069bc44cc5e0942cb9e86070..d6bf5baa543b24a4c3d8f9fc06ee2020a1d3f9bf 100644
--- a/PuReMD-GPU/aclocal.m4
+++ b/PuReMD-GPU/aclocal.m4
@@ -1150,4 +1150,5 @@ AC_SUBST([am__tar])
 AC_SUBST([am__untar])
 ]) # _AM_PROG_TAR
 
+m4_include([../m4/ax_compiler_vendor.m4])
 m4_include([../m4/ax_cuda.m4])
diff --git a/PuReMD-GPU/configure.ac b/PuReMD-GPU/configure.ac
index 38c7cf737e44056c612a3b48f55708486ab78af4..1fb73efde614b91be4f3bbb7a780a0ebba961821 100644
--- a/PuReMD-GPU/configure.ac
+++ b/PuReMD-GPU/configure.ac
@@ -53,49 +53,139 @@ AC_CHECK_TYPES([gzFile], [],
 # Checks for library functions.
 AC_FUNC_MALLOC
 AC_FUNC_STRTOD
-AC_CHECK_FUNCS([memset pow sqrt])
+AC_CHECK_FUNCS([gettimeofday memset pow sqrt])
+
+# Check for compiler vendor
+AX_COMPILER_VENDOR
+if test "x$ax_cv_c_compiler_vendor" = "xgnu"; then
+	if test "x$BUILD_DEBUG" = "x"; then
+		CFLAGS="$CFLAGS -Wall -O3 -funroll-loops -fstrict-aliasing"
+	else
+		CFLAGS="$CFLAGS -Wall"
+	fi
+fi
+if test "x$ax_cv_c_compiler_vendor" = "xintel"; then
+	if test "x$BUILD_DEBUG" = "x"; then
+		CFLAGS="$CFLAGS -fast"
+	fi
+fi
 
-# Check for CUDA support.
-CONFIGURE_HEADLINE([ CUDA support ])
-AX_CUDA
-NVCCFLAGS=
-if test "BUILD_DEBUG" = "true"
+# Check for OpenMP support.
+if test "x$BUILD_OPENMP" = "xyes"; then
+	AC_OPENMP
+	if test "x${OPENMP_CFLAGS}" = "x"; then
+		AC_MSG_WARN([
+	  -----------------------------------------------
+	   Unable to find OpenMP support on this system.
+	   Building a single-threaded version.
+	  -----------------------------------------------])
+	else
+		# bug due to recent Intel compiler change (?)
+		if test "x$ax_cv_c_compiler_vendor" = "xintel"; then
+			OPENMP_CFLAGS="-qopenmp"
+		fi
+		AC_SUBST(CFLAGS, "$CFLAGS $OPENMP_CFLAGS") # append: AC_SUBST with a value assigns it
+		AC_SUBST(CPPFLAGS, "$CPPFLAGS $OPENMP_CFLAGS") # append: keep user-supplied CPPFLAGS
+	fi
+fi
+
+if test "x$BUILD_SUPERLU_MT" != "x"
 then
-	NVCCFLAGS+=" -g -G"
+	CPPFLAGS="${CPPFLAGS} -I${BUILD_SUPERLU_MT}/include"
+	LDFLAGS="${LDFLAGS} -L${BUILD_SUPERLU_MT}/lib"
+	#TODO: implement better BLAS detection
+	LIBS="${LIBS} -lblas"
+#	BLAS_FOUND_LIBS="yes"
+#	AC_SEARCH_LIBS([dtrsv_], [blas blas_OPENMP],
+#		        [], [BLAS_FOUND_LIBS="no"], [])
+#	AS_IF([test "x${BLAS_FOUND_LIBS}" != "xyes"],
+#	      [AC_MSG_ERROR([Unable to find BLAS library.])])
+	AC_CHECK_HEADERS([slu_mt_ddefs.h], [SUPERLU_MT_FOUND_HEADERS="yes"])
+	AS_IF([test "x${SUPERLU_MT_FOUND_HEADERS}" != "xyes"],
+	      [AC_MSG_ERROR([Unable to find SuperLU MT headers.])])
+	SUPERLU_MT_FOUND_LIBS="yes"
+	#TODO: fix issue where multiple -l flags added, one for each call below
+	AC_SEARCH_LIBS([intMalloc], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([get_perm_c], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([pdgstrf_init], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp -lblas -lblas_OPENMP])
+	AC_SEARCH_LIBS([pdgstrf], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp -lblas -lblas_OPENMP])
+	AC_SEARCH_LIBS([pxgstrf_finalize], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp -lblas -lblas_OPENMP])
+	AC_SEARCH_LIBS([StatAlloc], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([StatInit], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([StatFree], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([Destroy_SuperNode_SCP], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([Destroy_CompCol_NCP], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AS_IF([test "x${SUPERLU_MT_FOUND_LIBS}" != "xyes"],
+	      [AC_MSG_ERROR([Unable to find SuperLU MT library.])])
+	AC_DEFINE([HAVE_SUPERLU_MT], [1], [Define to 1 if you have SuperLU_MT support enabled.])
 fi
-AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.])
-
-AC_CHECK_LIB([cuda], [cuGetErrorString])
-AC_CHECK_LIB([cudart], [cudaMalloc])
-AC_CHECK_LIB([cublas], [cublasDnrm2])
-AC_CHECK_LIB([cusparse], [cusparseCreateMatDescr])
-#AC_SEARCH_LIBS([cublasDaxpy], [cublas])
-#AC_SEARCH_LIBS([cublasDscal], [cublas])
-#AC_SEARCH_LIBS([cublasDdot], [cublas])
-#AC_SEARCH_LIBS([cudaThreadSynchronize], [cudart])
-#AC_SEARCH_LIBS([cudaGetLastError], [cudart])
-#AC_SEARCH_LIBS([cusparseCreateMatDescr], [cusparse])
-#AC_SEARCH_LIBS([cusparseSetMatType], [cusparse])
-#AC_SEARCH_LIBS([cusparseSetMatIndexBase], [cusparse])
+
+# Check for CUDA support.
+if test "x$BUILD_GPU" = "xyes"
+then
+	CONFIGURE_HEADLINE([ CUDA support ])
+	AX_CUDA
+	NVCCFLAGS=
+	if test "x$BUILD_DEBUG" = "xyes"
+	then
+		NVCCFLAGS="${NVCCFLAGS} -g -G"
+	fi
+	AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.])
+
+	AC_CHECK_LIB([cuda], [cuGetErrorString])
+	AC_CHECK_LIB([cudart], [cudaMalloc])
+	AC_CHECK_LIB([cublas], [cublasDnrm2])
+	AC_CHECK_LIB([cusparse], [cusparseCreateMatDescr])
+#	AC_SEARCH_LIBS([cublasDaxpy], [cublas])
+#	AC_SEARCH_LIBS([cublasDscal], [cublas])
+#	AC_SEARCH_LIBS([cublasDdot], [cublas])
+#	AC_SEARCH_LIBS([cudaThreadSynchronize], [cudart])
+#	AC_SEARCH_LIBS([cudaGetLastError], [cudart])
+#	AC_SEARCH_LIBS([cusparseCreateMatDescr], [cusparse])
+#	AC_SEARCH_LIBS([cusparseSetMatType], [cusparse])
+#	AC_SEARCH_LIBS([cusparseSetMatIndexBase], [cusparse])
 #
-#AC_SEARCH_LIBS([cublasDnrm2], [cublas],
-#	[CUBLAS_FOUND_LIBS="yes"], [CUBLAS_FOUND_LIBS="no"], [-lcublas])
-#AS_IF([test "x${CUBLAS_FOUND_LIBS}" != "xyes"],
-#	[AC_MSG_ERROR([Unable to find CUBLAS library.])])
+#	AC_SEARCH_LIBS([cublasDnrm2], [cublas],
+#		[CUBLAS_FOUND_LIBS="yes"], [CUBLAS_FOUND_LIBS="no"], [-lcublas])
+#	AS_IF([test "x${CUBLAS_FOUND_LIBS}" != "xyes"],
+#		[AC_MSG_ERROR([Unable to find CUBLAS library.])])
 #
-#AC_SEARCH_LIBS([cusparseSetMatType], [cusparse],
-#	[CUSPARSE_FOUND_LIBS="yes"], [CUSPARSE_FOUND_LIBS="no"], [-lcusparse])
-#AS_IF([test "x${CUSPARSE_FOUND_LIBS}" != "xyes"],
-#	[AC_MSG_ERROR([Unable to find CUSPARSE library.])])
+#	AC_SEARCH_LIBS([cusparseSetMatType], [cusparse],
+#		[CUSPARSE_FOUND_LIBS="yes"], [CUSPARSE_FOUND_LIBS="no"], [-lcusparse])
+#	AS_IF([test "x${CUSPARSE_FOUND_LIBS}" != "xyes"],
+#		[AC_MSG_ERROR([Unable to find CUSPARSE library.])])
 #
-#AC_CHECK_TYPES([cublasHandle_t], [], 
-#	       [AC_MSG_FAILURE([cublasHandle_t type not found in cublas.h], [1])], [#include<cublas_v2.h>])
-#AC_CHECK_TYPES([cusparseHandle_t], [], 
-#	       [AC_MSG_FAILURE([cusparseHandle_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
-#AC_CHECK_TYPES([cusparseMatDescr_t], [], 
-#	       [AC_MSG_FAILURE([cusparseMatDescr_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
-
-if test "BUILD_PROF" = "true"
+#	AC_CHECK_TYPES([cublasHandle_t], [], 
+#		       [AC_MSG_FAILURE([cublasHandle_t type not found in cublas.h], [1])], [#include<cublas_v2.h>])
+#	AC_CHECK_TYPES([cusparseHandle_t], [], 
+#		       [AC_MSG_FAILURE([cusparseHandle_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
+#	AC_CHECK_TYPES([cusparseMatDescr_t], [], 
+#		       [AC_MSG_FAILURE([cusparseMatDescr_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
+else
+	AM_CONDITIONAL(USE_CUDA, test "x" = "xyes")
+fi
+
+if test "x$BUILD_DEBUG" = "xyes"
+then
+	CFLAGS="${CFLAGS} ${DEBUG_FLAGS}"
+fi
+
+if test "x$BUILD_GPROF" = "xyes"
+then
+	CFLAGS="${CFLAGS} ${GPROF_FLAGS}"
+fi
+
+if test "x$BUILD_PROF" = "xtrue"
 then
 	NVCCFLAGS+=" --compiler-options ${gprof_flags}"
 fi
diff --git a/PuReMD-GPU/src/QEq.c b/PuReMD-GPU/src/QEq.c
deleted file mode 100644
index 8cc638ea90dcc25f86d33f275b162c8e531d82bb..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/QEq.c
+++ /dev/null
@@ -1,396 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
-
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "QEq.h"
-
-#include "allocate.h"
-#include "lin_alg.h"
-#include "list.h"
-#include "print_utils.h"
-#include "index_utils.h"
-#include "system_props.h"
-
-#include "sort.h"
-
-
-int compare_matrix_entry(const void *v1, const void *v2)
-{
-    return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
-}
-
-
-void Sort_Matrix_Rows( sparse_matrix *A )
-{
-    int i, si, ei;
-
-    for( i = 0; i < A->n; ++i ) {
-        si = A->start[i];
-        ei = A->start[i+1];
-        qsort( &(A->entries[si]), ei - si, 
-                sizeof(sparse_matrix_entry), compare_matrix_entry );
-    }
-}
-
-
-void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol )
-{
-    int i, j, k;
-    real val;
-
-    /* init droptol to 0 */
-    for( i = 0; i < A->n; ++i )
-        droptol[i] = 0;
-
-    /* calculate sqaure of the norm of each row */
-    for( i = 0; i < A->n; ++i ) {
-        for( k = A->start[i]; k < A->start[i+1]-1; ++k ) {
-            j = A->entries[k].j;
-            val = A->entries[k].val;
-
-            droptol[i] += val*val;
-            droptol[j] += val*val;
-        }
-
-        val = A->entries[k].val; // diagonal entry
-        droptol[i] += val*val;
-    }
-
-    /* calculate local droptol for each row */
-    //fprintf( stderr, "droptol: " );
-    for( i = 0; i < A->n; ++i ) {
-        //fprintf( stderr, "%f-->", droptol[i] );
-        droptol[i] = SQRT( droptol[i] ) * dtol;
-        //fprintf( stderr, "%f  ", droptol[i] );
-    }
-    //fprintf( stderr, "\n" );
-}
-
-
-int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
-{
-    int i, j, pj;
-    int fillin;
-    real val;
-
-    fillin = 0;
-
-    //fprintf( stderr, "n: %d\n", A->n );
-    for( i = 0; i < A->n; ++i )
-        for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){
-            j = A->entries[pj].j;
-            val = A->entries[pj].val;
-            //fprintf( stderr, "i: %d, j: %d", i, j );
-
-            if( fabs(val) > droptol[i] )
-                ++fillin;
-        }
-
-    return fillin + A->n;
-}
-
-
-void ICHOLT( sparse_matrix *A, real *droptol, 
-        sparse_matrix *L, sparse_matrix *U )
-{
-    sparse_matrix_entry tmp[1000];
-    int i, j, pj, k1, k2, tmptop, Ltop;
-    real val;
-    int *Utop;
-
-    Utop = (int*) malloc((A->n+1) * sizeof(int));
-
-    // clear variables
-    Ltop = 0;
-    tmptop = 0;
-    for( i = 0; i <= A->n; ++i )
-        L->start[i] = U->start[i] = 0;
-
-    for( i = 0; i < A->n; ++i )
-        Utop[i] = 0;
-
-    //fprintf( stderr, "n: %d\n", A->n );
-    for( i = 0; i < A->n; ++i ){
-        L->start[i] = Ltop;
-        tmptop = 0;
-
-        for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){
-            j = A->entries[pj].j;
-            val = A->entries[pj].val;
-            //fprintf( stderr, "i: %d, j: %d", i, j );
-
-            if( fabs(val) > droptol[i] ){
-                k1 = 0;
-                k2 = L->start[j];
-                while( k1 < tmptop && k2 < L->start[j+1] ){
-                    if( tmp[k1].j < L->entries[k2].j )
-                        ++k1;
-                    else if( tmp[k1].j > L->entries[k2].j )
-                        ++k2;
-                    else
-                        val -= (tmp[k1++].val * L->entries[k2++].val);
-                }
-
-                // L matrix is lower triangular, 
-                // so right before the start of next row comes jth diagonal
-                val /= L->entries[L->start[j+1]-1].val;
-
-                tmp[tmptop].j = j;
-                tmp[tmptop].val = val;
-                ++tmptop;
-            }
-            //fprintf( stderr, " -- done\n" );
-        }
-
-        // compute the ith diagonal in L
-        // sanity check
-        if( A->entries[pj].j != i ) {
-            fprintf( stderr, "i=%d, badly built A matrix!\n", i );
-            exit(999);
-        }
-
-        val = A->entries[pj].val;
-        for( k1 = 0; k1 < tmptop; ++k1 )
-            val -= (tmp[k1].val * tmp[k1].val);
-
-        tmp[tmptop].j = i;
-        tmp[tmptop].val = SQRT(val);
-
-        // apply the dropping rule once again
-        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
-        //for( k1 = 0; k1<= tmptop; ++k1 )
-        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
-        //fprintf( stderr, "\n" );
-        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
-        for( k1 = 0; k1 < tmptop; ++k1 )
-            if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){
-                L->entries[Ltop].j = tmp[k1].j;
-                L->entries[Ltop].val = tmp[k1].val;
-                U->start[tmp[k1].j+1]++;
-                ++Ltop;
-                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
-            }
-        // keep the diagonal in any case
-        L->entries[Ltop].j = tmp[k1].j;
-        L->entries[Ltop].val = tmp[k1].val;
-        ++Ltop;
-        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
-    }
-
-    L->start[i] = Ltop;
-    //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
-
-    for( i = 1; i <= U->n; ++i )
-        Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
-
-    for( i = 0; i < L->n; ++i )
-        for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
-            j = L->entries[pj].j;
-            U->entries[Utop[j]].j = i;
-            U->entries[Utop[j]].val = L->entries[pj].val;
-            Utop[j]++;
-        }
-
-    //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
-}
-
-
-void Init_MatVec( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list *far_nbrs )
-{
-    int i, fillin;
-    real s_tmp, t_tmp;
-    //char fname[100];
-
-    if(control->refactor > 0 && 
-            ((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL))
-    {
-        //Print_Linear_System( system, control, workspace, data->step );
-        Sort_Matrix_Rows( &workspace->H );
-
-        //fprintf( stderr, "H matrix sorted\n" );
-
-        Calculate_Droptol( &workspace->H, workspace->droptol, control->droptol ); 
-        //fprintf( stderr, "drop tolerances calculated\n" );
-
-        if( workspace->L.entries == NULL )
-        {
-            fillin = Estimate_LU_Fill( &workspace->H, workspace->droptol );
-
-#ifdef __DEBUG_CUDA__
-            fprintf( stderr, "fillin = %d\n", fillin );
-#endif
-
-            if( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 ||
-                    Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 )
-            {
-                fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
-                exit(INSUFFICIENT_SPACE);
-            }
-
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "fillin = %d\n", fillin );
-            fprintf( stderr, "allocated memory: L = U = %ldMB\n",
-                    fillin * sizeof(sparse_matrix_entry) / (1024*1024) );
-#endif
-        }
-
-        ICHOLT( &workspace->H, workspace->droptol, &workspace->L, &workspace->U );
-
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "icholt-" );
-        //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
-        //Print_Sparse_Matrix2( workspace->L, fname );
-        //Print_Sparse_Matrix( U );
-#endif
-    }
-
-    /* extrapolation for s & t */
-    for( i = 0; i < system->N; ++i ) {
-        // no extrapolation
-        //s_tmp = workspace->s[0][i];
-        //t_tmp = workspace->t[0][i];
-
-        // linear
-        //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i];
-        //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i];
-
-        // quadratic
-        //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]);
-        t_tmp = workspace->t[index_wkspace_sys(2,i,system->N)] + 3*(workspace->t[index_wkspace_sys(0,i,system->N)]-workspace->t[index_wkspace_sys(1,i,system->N)]);
-
-        // cubic
-        s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,system->N)] + workspace->s[index_wkspace_sys(2,i,system->N)]) - 
-            (6 * workspace->s[index_wkspace_sys(1,i,system->N)] + workspace->s[index_wkspace_sys(3,i,system->N)] );
-        //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - 
-        //  (6 * workspace->t[1][i] + workspace->t[3][i] );
-
-        // 4th order
-        //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + 
-        //  10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i];
-        //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + 
-        //  10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i];
-
-        workspace->s[index_wkspace_sys(4,i,system->N)] = workspace->s[index_wkspace_sys(3,i,system->N)];
-        workspace->s[index_wkspace_sys(3,i,system->N)] = workspace->s[index_wkspace_sys(2,i,system->N)]; 
-        workspace->s[index_wkspace_sys(2,i,system->N)] = workspace->s[index_wkspace_sys(1,i,system->N)];
-        workspace->s[index_wkspace_sys(1,i,system->N)] = workspace->s[index_wkspace_sys(0,i,system->N)];
-        workspace->s[index_wkspace_sys(0,i,system->N)] = s_tmp;
-
-        workspace->t[index_wkspace_sys(4,i,system->N)] = workspace->t[index_wkspace_sys(3,i,system->N)];
-        workspace->t[index_wkspace_sys(3,i,system->N)] = workspace->t[index_wkspace_sys(2,i,system->N)]; 
-        workspace->t[index_wkspace_sys(2,i,system->N)] = workspace->t[index_wkspace_sys(1,i,system->N)];
-        workspace->t[index_wkspace_sys(1,i,system->N)] = workspace->t[index_wkspace_sys(0,i,system->N)];
-        workspace->t[index_wkspace_sys(0,i,system->N)] = t_tmp;
-    }
-}
-
-
-void Calculate_Charges( reax_system *system, static_storage *workspace )
-{
-    int i;
-    real u, s_sum, t_sum;
-
-    s_sum = t_sum = 0.;
-    for( i = 0; i < system->N; ++i ) {
-        s_sum += workspace->s[index_wkspace_sys(0,i,system->N)];
-        t_sum += workspace->t[index_wkspace_sys(0,i,system->N)];
-    }
-
-    u = s_sum / t_sum;
-
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, "Host --->s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u );
-#endif
-
-    for( i = 0; i < system->N; ++i )
-    {
-        system->atoms[i].q = workspace->s[index_wkspace_sys(0,i,system->N)] - u * workspace->t[index_wkspace_sys(0,i,system->N)];
-    }
-}
-
-
-void QEq( reax_system *system, control_params *control, simulation_data *data, 
-        static_storage *workspace, list *far_nbrs, 
-        output_controls *out_control )
-{
-    int matvecs;
-
-    //real t_start, t_elapsed;
-
-    //t_start = Get_Time ();
-    Init_MatVec( system, control, data, workspace, far_nbrs );
-    //t_elapsed = Get_Timing_Info ( t_start );
-
-    //fprintf (stderr, " CPU Init_MatVec timing ----> %f \n", t_elapsed );
-
-    //if( data->step % 10 == 0 )
-    //  Print_Linear_System( system, control, workspace, far_nbrs, data->step );
-
-    //t_start = Get_Time ( );
-    matvecs = GMRES( workspace, &workspace->H, 
-            workspace->b_s, control->q_err, &workspace->s[0], out_control->log, system );
-    matvecs += GMRES( workspace, &workspace->H, 
-            workspace->b_t, control->q_err, &workspace->t[0], out_control->log, system );
-    //t_elapsed = Get_Timing_Info ( t_start );
-
-    //fprintf (stderr, " CPU GMRES timing ---> %f \n", t_elapsed );
-
-    //matvecs = GMRES_HouseHolder( workspace, workspace->H, 
-    //    workspace->b_s, control->q_err, workspace->s[0], out_control->log );
-    //matvecs += GMRES_HouseHolder( workspace, workspace->H,  
-    //    workspace->b_t, control->q_err, workspace->t[0], out_control->log );
-
-    //matvecs = PGMRES( workspace, &workspace->H, workspace->b_s, control->q_err,
-    //  &workspace->L, &workspace->U, &workspace->s[index_wkspace_sys(0,0,system->N)], out_control->log, system );
-    //matvecs += PGMRES( workspace, &workspace->H, workspace->b_t, control->q_err,
-    //  &workspace->L, &workspace->U, &workspace->t[index_wkspace_sys(0,0,system->N)], out_control->log, system );
-
-    //matvecs=PCG( workspace, workspace->H, workspace->b_s, control->q_err, 
-    //      workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1;
-    ///matvecs+=PCG( workspace, workspace->H, workspace->b_t, control->q_err, 
-    //     workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1;
-
-    //matvecs = CG( workspace, workspace->H, 
-    // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
-    //matvecs += CG( workspace, workspace->H, 
-    // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
-
-    //matvecs = SDM( workspace, workspace->H, 
-    // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
-    //matvecs += SDM( workspace, workspace->H, 
-    // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
-
-    //fprintf (stderr, " GMRES done with iterations %d \n", matvecs );
-
-    data->timing.matvecs += matvecs;
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "linsolve-" );
-#endif
-
-    Calculate_Charges( system, workspace );
-    //fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n", 
-    //   data->step, 
-    //   workspace->s[0][0], workspace->t[0][0], 
-    //   workspace->s[0][1], workspace->t[0][1], 
-    //   workspace->s[0][2], workspace->t[0][2] );
-    // if( data->step == control->nsteps )
-    //Print_Charges( system, control, workspace, data->step );
-}
diff --git a/PuReMD-GPU/src/QEq.h b/PuReMD-GPU/src/QEq.h
deleted file mode 100644
index 31dfbf61ba05ec79d32313c3ab648eb259f183f2..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/QEq.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
-
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#ifndef __QEq_H_
-#define __QEq_H_
-
-#include "mytypes.h"
-
-
-void QEq( reax_system*, control_params*, simulation_data*, static_storage*,
-        list*, output_controls* );
-
-
-static inline HOST_DEVICE void swap(sparse_matrix_entry *array, int index1, int index2) 
-{
-    sparse_matrix_entry temp = array[index1];
-    array[index1] = array[index2];
-    array[index2] = temp;
-}
-
-
-static inline HOST_DEVICE void quick_sort(sparse_matrix_entry *array, int start, int end)
-{
-    int i = start;
-    int k = end; 
-
-    if (end - start >= 1)  
-    {  
-        int pivot = array[start].j;
-
-        while (k > i) 
-        {  
-            while ((array[i].j <= pivot) && (i <= end) && (k > i)) i++;
-            while ((array[k].j > pivot) && (k >= start) && (k >= i)) k--;
-            if (k > i) swap(array, i, k);
-        }  
-        swap(array, start, k);
-        quick_sort(array, start, k - 1);
-        quick_sort(array, k + 1, end);
-    }  
-}
-
-
-#endif
diff --git a/PuReMD-GPU/src/allocate.c b/PuReMD-GPU/src/allocate.c
index 65f0eb2a872673259d508f17fc0da43530a7426f..44c761ccc4640ac26d710f3f4f4f4c6787dd1015 100644
--- a/PuReMD-GPU/src/allocate.c
+++ b/PuReMD-GPU/src/allocate.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -21,103 +22,148 @@
 #include "allocate.h"
 
 #include "list.h"
+#include "tool_box.h"
+
+/* Allocate the per-atom storage that is sized directly by system->N:
+ * the atom array, the original-id map and, when control->restrict_bonds
+ * is set, the bond-restriction tables.  Always returns SUCCESS. */
+int PreAllocate_Space( reax_system *system, control_params *control,
+        static_storage *workspace )
+{
+    system->atoms = (reax_atom*) scalloc( system->N,
+            sizeof(reax_atom), "atoms" );
+    workspace->orig_id = (int*) scalloc( system->N,
+            sizeof(int), "orig_id" );
+
+    /* space for keeping restriction info, if any */
+    if ( control->restrict_bonds )
+    {
+        workspace->restricted = (int*) scalloc( system->N,
+                sizeof(int), "restricted_atoms" );
+
+        /* restricted_list is a single flat table of MAX_RESTRICT * N ints;
+         * allocate it exactly once -- assigning a smaller N-int buffer to
+         * the same pointer first would only leak that buffer */
+        workspace->restricted_list = (int*) scalloc( MAX_RESTRICT * system->N,
+                sizeof(int), "restricted_list[i]" );
+    }
+
+    return SUCCESS;
+}
 
 
 void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
 {
     Delete_List( far_nbrs );
-    if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs ))
+
+    if (!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs ))
     {
         fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-        exit( INIT_ERR );
+        exit( CANNOT_INITIALIZE );
     }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n",
-            num_intrs, far_nbrs->num_intrs );  
-    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
-            num_intrs * sizeof(far_neighbor_data) / (1024*1024) );
+             num_intrs, far_nbrs->num_intrs );
+    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n",
+             num_intrs * sizeof(far_neighbor_data) / (1024 * 1024) );
 #endif
 }
 
 
-HOST int Allocate_Matrix( sparse_matrix *H, int n, int m )
+int Allocate_Matrix( sparse_matrix **pH, int n, int m )
 {
+    sparse_matrix *H;
+
+    if ( (*pH = (sparse_matrix*) malloc( sizeof(sparse_matrix)) ) == NULL )
+    {
+        return FAILURE;
+    }
+
+    H = *pH;
     H->n = n;
     H->m = m;
-    if( (H->start = (int*) malloc(sizeof(int) * n+1)) == NULL )
-        return 0;
-
-    if( (H->end = (int*) malloc(sizeof(int) * n+1)) == NULL )
-        return 0;
 
-    if( (H->entries = 
-                (sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL )
-        return 0;
+    if ( (H->start = (unsigned int*) malloc(sizeof(int) * (n + 1))) == NULL
+            || (H->j = (unsigned int*) malloc(sizeof(int) * m)) == NULL
+            || (H->val = (real*) malloc(sizeof(real) * m)) == NULL )
+    {
+        return FAILURE;
+    }
 
-    return 1;
+    return SUCCESS;
 }
 
 
 void Deallocate_Matrix( sparse_matrix *H )
 {
     free(H->start);
-    free(H->entries);
-    free(H->end);
+    free(H->j);
+    free(H->val);
+    free(H);
 }
 
 
-int Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
+int Reallocate_Matrix( sparse_matrix **H, int n, int m, char *name )
 {
-    Deallocate_Matrix( H );
-    if( !Allocate_Matrix( H, n, m ) ) {
+    Deallocate_Matrix( *H );
+
+    if ( Allocate_Matrix( H, n, m ) == FAILURE )
+    {
         fprintf(stderr, "not enough space for %s matrix. terminating!\n", name);
-        exit( 1 );
+        exit( INSUFFICIENT_MEMORY );
     }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n",
-            name, n, m );
-    fprintf( stderr, "memory allocated: %s = %ldMB\n", 
-            name, m * sizeof(sparse_matrix_entry) / (1024*1024) );
+             name, n, m );
+    fprintf( stderr, "memory allocated: %s = %ldMB\n",
+             name, m * sizeof(sparse_matrix_entry) / (1024 * 1024) );
 #endif
-    return 1;
+
+    return SUCCESS;
 }
 
 
-int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, 
-        list *hbonds )
+int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top,
+                         list *hbonds )
 {
     int i, num_hbonds;
 
     num_hbonds = 0;
     /* find starting indexes for each H and the total number of hbonds */
-    for( i = 1; i < n; ++i )
-        hb_top[i] += hb_top[i-1];
-    num_hbonds = hb_top[n-1];
+    for ( i = 1; i < n; ++i )
+    {
+        hb_top[i] += hb_top[i - 1];
+    }
+    num_hbonds = hb_top[n - 1];
 
-    if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) )
+    if ( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) )
     {
         fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
-        exit( INIT_ERR );
+        exit( CANNOT_INITIALIZE );
     }
 
-    for( i = 0; i < n; ++i )
-        if( h_index[i] == 0 ){
-            Set_Start_Index( 0, 0, hbonds ); 
-            Set_End_Index( 0, 0, hbonds ); 
+    for ( i = 0; i < n; ++i )
+    {
+        if ( h_index[i] == 0 )
+        {
+            Set_Start_Index( 0, 0, hbonds );
+            Set_End_Index( 0, 0, hbonds );
         }
-        else if( h_index[i] > 0 ){
-            Set_Start_Index( h_index[i], hb_top[i-1], hbonds ); 
-            Set_End_Index( h_index[i], hb_top[i-1], hbonds ); 
+        else if ( h_index[i] > 0 )
+        {
+            Set_Start_Index( h_index[i], hb_top[i - 1], hbonds );
+            Set_End_Index( h_index[i], hb_top[i - 1], hbonds );
         }
+    }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds );
-    fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
-            num_hbonds * sizeof(hbond_data) / (1024*1024) );
+    fprintf( stderr, "memory allocated: hbonds = %ldMB\n",
+             num_hbonds * sizeof(hbond_data) / (1024 * 1024) );
 #endif
-    return 1;
+    return SUCCESS;
 }
 
 
@@ -129,10 +175,14 @@ int Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "reallocating hbonds\n" );
 #endif
-    hb_top = (int *)calloc( n, sizeof(int) );
-    for( i = 0; i < n; ++i )
-        if( h_index[i] >= 0 )
-            hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS);
+    hb_top = calloc( n, sizeof(int) );
+    for ( i = 0; i < n; ++i )
+    {
+        if ( h_index[i] >= 0 )
+        {
+            hb_top[i] = MAX(Num_Entries(h_index[i], hbonds) * SAFE_HBONDS, MIN_HBONDS);
+        }
+    }
 
     Delete_List( hbonds );
 
@@ -140,7 +190,7 @@ int Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
 
     free( hb_top );
 
-    return 1;
+    return SUCCESS;
 }
 
 
@@ -150,29 +200,32 @@ int Allocate_Bond_List( int n, int *bond_top, list *bonds )
 
     num_bonds = 0;
     /* find starting indexes for each atom and the total number of bonds */
-    for( i = 1; i < n; ++i )
-        bond_top[i] += bond_top[i-1];
-    num_bonds = bond_top[n-1];
+    for ( i = 1; i < n; ++i )
+    {
+        bond_top[i] += bond_top[i - 1];
+    }
+    num_bonds = bond_top[n - 1];
 
-    if( !Make_List(n, num_bonds, TYP_BOND, bonds ) )
+    if ( !Make_List(n, num_bonds, TYP_BOND, bonds ) )
     {
         fprintf( stderr, "not enough space for bonds list. terminating!\n" );
-        exit( INIT_ERR );
+        exit( CANNOT_INITIALIZE );
     }
 
-    Set_Start_Index( 0, 0, bonds ); 
-    Set_End_Index( 0, 0, bonds ); 
-    for( i = 1; i < n; ++i ) {
-        Set_Start_Index( i, bond_top[i-1], bonds ); 
-        Set_End_Index( i, bond_top[i-1], bonds ); 
+    Set_Start_Index( 0, 0, bonds );
+    Set_End_Index( 0, 0, bonds );
+    for ( i = 1; i < n; ++i )
+    {
+        Set_Start_Index( i, bond_top[i - 1], bonds );
+        Set_End_Index( i, bond_top[i - 1], bonds );
     }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds );
-    fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
-            num_bonds * sizeof(bond_data) / (1024*1024) );
+    fprintf( stderr, "memory allocated: bonds = %ldMB\n",
+             num_bonds * sizeof(bond_data) / (1024 * 1024) );
 #endif
-    return 1;
+    return SUCCESS;
 }
 
 
@@ -184,9 +237,10 @@ int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body )
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "reallocating bonds\n" );
 #endif
-    bond_top = (int *)calloc( n, sizeof(int) );
+    bond_top = calloc( n, sizeof(int) );
     *est_3body = 0;
-    for( i = 0; i < n; ++i ){
+    for ( i = 0; i < n; ++i )
+    {
         *est_3body += SQR( Num_Entries( i, bonds ) );
         bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS );
     }
@@ -194,17 +248,18 @@ int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body )
     Delete_List( bonds );
 
     Allocate_Bond_List( n, bond_top, bonds );
-    *num_bonds = bond_top[n-1];
+    *num_bonds = bond_top[n - 1];
 
     free( bond_top );
 
-    return 1;
+    return SUCCESS;
 }
 
 
-void Reallocate( reax_system *system, static_storage *workspace, list **lists, 
-        int nbr_flag )
+void Reallocate( reax_system *system, static_storage *workspace, list **lists,
+                 int nbr_flag )
 {
+    int i, j, k;
     int num_bonds, est_3body;
     reallocate_data *realloc;
     grid *g;
@@ -212,70 +267,75 @@ void Reallocate( reax_system *system, static_storage *workspace, list **lists,
     realloc = &(workspace->realloc);
     g = &(system->g);
 
-    if( realloc->num_far > 0 && nbr_flag ) {
-        fprintf (stderr, " Reallocating neighbors \n");
-        Reallocate_Neighbor_List( (*lists)+FAR_NBRS, 
-                system->N, realloc->num_far * SAFE_ZONE );
+    if ( realloc->num_far > 0 && nbr_flag )
+    {
+        Reallocate_Neighbor_List( (*lists) + FAR_NBRS,
+                                  system->N, realloc->num_far * SAFE_ZONE );
         realloc->num_far = -1;
     }
 
-    if( realloc->Htop > 0 ){
-        fprintf (stderr, " Reallocating Matrix \n");
-        Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H");
+    if ( realloc->Htop > 0 )
+    {
+        Reallocate_Matrix( &(workspace->H), system->N, realloc->Htop * SAFE_ZONE, "H" );
         realloc->Htop = -1;
 
-        Deallocate_Matrix( &workspace->L );
-        Deallocate_Matrix( &workspace->U );
+        Deallocate_Matrix( workspace->L );
+        Deallocate_Matrix( workspace->U );
+        workspace->L = NULL;
+        workspace->U = NULL;
     }
 
-    if( realloc->hbonds > 0 ){
-        fprintf (stderr, " Reallocating hbonds \n");
+    if ( realloc->hbonds > 0 )
+    {
         Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index,
-                (*lists)+HBONDS );
+                               (*lists) + HBONDS );
         realloc->hbonds = -1;
     }
 
     num_bonds = est_3body = -1;
-    if( realloc->bonds > 0 ){
-        fprintf (stderr, " Reallocating bonds \n");
-        Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body );
+    if ( realloc->bonds > 0 )
+    {
+        Reallocate_Bonds_List( system->N, (*lists) + BONDS, &num_bonds, &est_3body );
         realloc->bonds = -1;
         realloc->num_3body = MAX( realloc->num_3body, est_3body );
     }
 
-    if( realloc->num_3body > 0 ) {
-        fprintf (stderr, " Reallocating 3Body \n");
-        Delete_List( (*lists)+THREE_BODIES );
+    if ( realloc->num_3body > 0 )
+    {
+        Delete_List( (*lists) + THREE_BODIES );
 
-        if( num_bonds == -1 )
-            num_bonds = ((*lists)+BONDS)->num_intrs;
+        if ( num_bonds == -1 )
+            num_bonds = ((*lists) + BONDS)->num_intrs;
         realloc->num_3body *= SAFE_ZONE;
 
-        if( !Make_List( num_bonds, realloc->num_3body,
-                    TYP_THREE_BODY, (*lists)+THREE_BODIES ) )
+        if ( !Make_List( num_bonds, realloc->num_3body,
+                         TYP_THREE_BODY, (*lists) + THREE_BODIES ) )
         {
             fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-            exit( INIT_ERR );
+            exit( CANNOT_INITIALIZE );
         }
         realloc->num_3body = -1;
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "reallocating 3 bodies\n" );
         fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds );
         fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body );
-        fprintf( stderr, "reallocated 3body memory: %ldMB\n", 
-                realloc->num_3body*sizeof(three_body_interaction_data)/
-                (1024*1024) );
+        fprintf( stderr, "reallocated 3body memory: %ldMB\n",
+                 realloc->num_3body * sizeof(three_body_interaction_data) /
+                 (1024 * 1024) );
 #endif
     }
 
-    if( realloc->gcell_atoms > -1 ){
+    if ( realloc->gcell_atoms > -1 )
+    {
 #if defined(DEBUG_FOCUS)
         fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
 #endif
 
         free (g->atoms);
-        g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2],
-                sizeof (int) * workspace->realloc.gcell_atoms);
+        /* zero-filled replacement buffer: ncell[0]*ncell[1]*ncell[2] cells */
+        g->atoms = (int *) calloc( g->ncell[0]*g->ncell[1]*g->ncell[2],
+                sizeof(int) * workspace->realloc.gcell_atoms );
+
         realloc->gcell_atoms = -1;
     }
 }
diff --git a/PuReMD-GPU/src/allocate.h b/PuReMD-GPU/src/allocate.h
index b03ed80b34f153b9929ccaa80bc5c27fbf6ce540..fc77b5e50b83b3e999db2e7f3668f17893d7bbf1 100644
--- a/PuReMD-GPU/src/allocate.h
+++ b/PuReMD-GPU/src/allocate.h
@@ -28,9 +28,12 @@
 extern "C"  {
 #endif
 
+int PreAllocate_Space( reax_system*, control_params*, static_storage* );
+
 void Reallocate( reax_system*, static_storage*, list**, int );
 
-int Allocate_Matrix( sparse_matrix*, int, int );
+int Allocate_Matrix( sparse_matrix**, int, int );
+
 void Deallocate_Matrix( sparse_matrix *);
 
 int Allocate_HBond_List( int, int, int*, int*, list* );
diff --git a/PuReMD-GPU/src/analyze.c b/PuReMD-GPU/src/analyze.c
index 8eef938a372add29eb044846b12c77eddd47aad0..014eea8f46093e1a381e0a5e44241cfdc3d7b719 100644
--- a/PuReMD-GPU/src/analyze.c
+++ b/PuReMD-GPU/src/analyze.c
@@ -772,17 +772,16 @@ void Calculate_Drift( reax_system *system, control_params *control,
                                 &(system->box), driftvec );
 
             if ( fabs( driftvec[0] ) >= system->box.box_norms[0] / 2.0 - 2.0 ||
-                    fabs( driftvec[0] ) >= system->box.box_norms[0] / 2.0 - 2.0 ||
-                    fabs( driftvec[0] ) >= system->box.box_norms[0] / 2.0 - 2.0 )
+                    fabs( driftvec[1] ) >= system->box.box_norms[1] / 2.0 - 2.0 ||
+                    fabs( driftvec[2] ) >= system->box.box_norms[2] / 2.0 - 2.0 )
             {
                 /* the atom has moved almost half the box size.
                    exclude it from further drift computations as it might have an
                    improper contribution due to periodic boudnaries. */
+                workspace->x_old[i][0] = -999999999.0;
+                workspace->x_old[i][1] = -999999999.0;
+                workspace->x_old[i][2] = -999999999.0;
 
-                //TODO -- check this one. may be not initializing this properly
-                //workspace->x_old[i][0] = workspace->x_old[i][2] = workspace->x_old[i][2] = -999999999999.0;
-                workspace->x_old[i][0] = workspace->x_old[i][2] = workspace->x_old[i][2] = -999999999.0;
-                //TODO -- check this one. may be not initializing this properly
                 continue;
             }
 
diff --git a/PuReMD-GPU/src/box.c b/PuReMD-GPU/src/box.c
index e42395c5556042493c0f707879772d84e9d18658..a7911fda49b1dccbbfccf24ae2fa445c608e0d5e 100644
--- a/PuReMD-GPU/src/box.c
+++ b/PuReMD-GPU/src/box.c
@@ -1,58 +1,189 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPu - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
 #include "box.h"
+
+#include "tool_box.h"
 #include "vector.h"
 
 
-void Init_Box_From_CRYST(real a, real b, real c, 
-        real alpha, real beta, real gamma, 
+void Make_Consistent( simulation_box* box )
+{
+    real one_vol;
+    /* Recomputes every derived field of *box from box->box; volume is det(box). */
+    box->volume =
+        box->box[0][0] * (box->box[1][1] * box->box[2][2] -
+                          box->box[1][2] * box->box[2][1]) + /* same minor as box_inv[0][0] */
+        box->box[0][1] * (box->box[2][0] * box->box[1][2] -
+                          box->box[1][0] * box->box[2][2]) +
+        box->box[0][2] * (box->box[1][0] * box->box[2][1] -
+                          box->box[2][0] * box->box[1][1]);
+
+    one_vol = 1.0 / box->volume;
+    /* box_inv = adjugate(box) / det(box) */
+    box->box_inv[0][0] = (box->box[1][1] * box->box[2][2] -
+                          box->box[1][2] * box->box[2][1]) * one_vol;
+    box->box_inv[0][1] = (box->box[0][2] * box->box[2][1] -
+                          box->box[0][1] * box->box[2][2]) * one_vol;
+    box->box_inv[0][2] = (box->box[0][1] * box->box[1][2] -
+                          box->box[0][2] * box->box[1][1]) * one_vol;
+
+    box->box_inv[1][0] = (box->box[1][2] * box->box[2][0] -
+                          box->box[1][0] * box->box[2][2]) * one_vol;
+    box->box_inv[1][1] = (box->box[0][0] * box->box[2][2] -
+                          box->box[0][2] * box->box[2][0]) * one_vol;
+    box->box_inv[1][2] = (box->box[0][2] * box->box[1][0] -
+                          box->box[0][0] * box->box[1][2]) * one_vol;
+
+    box->box_inv[2][0] = (box->box[1][0] * box->box[2][1] -
+                          box->box[1][1] * box->box[2][0]) * one_vol;
+    box->box_inv[2][1] = (box->box[0][1] * box->box[2][0] -
+                          box->box[0][0] * box->box[2][1]) * one_vol;
+    box->box_inv[2][2] = (box->box[0][0] * box->box[1][1] -
+                          box->box[0][1] * box->box[1][0]) * one_vol;
+    /* box_norms[i]: Euclidean length of row i of box */
+    box->box_norms[0] = SQRT( SQR(box->box[0][0]) +
+                              SQR(box->box[0][1]) +
+                              SQR(box->box[0][2]) );
+    box->box_norms[1] = SQRT( SQR(box->box[1][0]) +
+                              SQR(box->box[1][1]) +
+                              SQR(box->box[1][2]) );
+    box->box_norms[2] = SQRT( SQR(box->box[2][0]) +
+                              SQR(box->box[2][1]) +
+                              SQR(box->box[2][2]) );
+    /* trans[i][j] = box[j][i] / box_norms[i] */
+    box->trans[0][0] = box->box[0][0] / box->box_norms[0];
+    box->trans[0][1] = box->box[1][0] / box->box_norms[0];
+    box->trans[0][2] = box->box[2][0] / box->box_norms[0];
+
+    box->trans[1][0] = box->box[0][1] / box->box_norms[1];
+    box->trans[1][1] = box->box[1][1] / box->box_norms[1];
+    box->trans[1][2] = box->box[2][1] / box->box_norms[1];
+
+    box->trans[2][0] = box->box[0][2] / box->box_norms[2];
+    box->trans[2][1] = box->box[1][2] / box->box_norms[2];
+    box->trans[2][2] = box->box[2][2] / box->box_norms[2];
+    /* det(trans) = det(box) / (product of the norms), so rescale 1/det to match */
+    one_vol = box->box_norms[0] * box->box_norms[1] * box->box_norms[2] * one_vol;
+    /* trans_inv = adjugate(trans) / det(trans) */
+    box->trans_inv[0][0] = (box->trans[1][1] * box->trans[2][2] -
+                            box->trans[1][2] * box->trans[2][1]) * one_vol;
+    box->trans_inv[0][1] = (box->trans[0][2] * box->trans[2][1] -
+                            box->trans[0][1] * box->trans[2][2]) * one_vol;
+    box->trans_inv[0][2] = (box->trans[0][1] * box->trans[1][2] -
+                            box->trans[0][2] * box->trans[1][1]) * one_vol;
+
+    box->trans_inv[1][0] = (box->trans[1][2] * box->trans[2][0] -
+                            box->trans[1][0] * box->trans[2][2]) * one_vol;
+    box->trans_inv[1][1] = (box->trans[0][0] * box->trans[2][2] -
+                            box->trans[0][2] * box->trans[2][0]) * one_vol;
+    box->trans_inv[1][2] = (box->trans[0][2] * box->trans[1][0] -
+                            box->trans[0][0] * box->trans[1][2]) * one_vol;
+
+    box->trans_inv[2][0] = (box->trans[1][0] * box->trans[2][1] -
+                            box->trans[1][1] * box->trans[2][0]) * one_vol;
+    box->trans_inv[2][1] = (box->trans[0][1] * box->trans[2][0] -
+                            box->trans[0][0] * box->trans[2][1]) * one_vol;
+    box->trans_inv[2][2] = (box->trans[0][0] * box->trans[1][1] -
+                            box->trans[0][1] * box->trans[1][0]) * one_vol;
+
+//   for (i=0; i < 3; i++)
+//     {
+//       for (j=0; j < 3; j++)
+//  fprintf(stderr,"%lf\t",box->trans[i][j]);
+//       fprintf(stderr,"\n");
+//     }
+//   fprintf(stderr,"\n");
+//   for (i=0; i < 3; i++)
+//     {
+//       for (j=0; j < 3; j++)
+//  fprintf(stderr,"%lf\t",box->trans_inv[i][j]);
+//       fprintf(stderr,"\n");
+//     }
+    /* g[i][j] = dot product of rows i and j of box (symmetric) */
+    box->g[0][0] = box->box[0][0] * box->box[0][0] +
+                   box->box[0][1] * box->box[0][1] +
+                   box->box[0][2] * box->box[0][2];
+    box->g[1][0] =
+        box->g[0][1] = box->box[0][0] * box->box[1][0] +
+                       box->box[0][1] * box->box[1][1] +
+                       box->box[0][2] * box->box[1][2];
+    box->g[2][0] =
+        box->g[0][2] = box->box[0][0] * box->box[2][0] +
+                       box->box[0][1] * box->box[2][1] +
+                       box->box[0][2] * box->box[2][2];
+
+    box->g[1][1] = box->box[1][0] * box->box[1][0] +
+                   box->box[1][1] * box->box[1][1] +
+                   box->box[1][2] * box->box[1][2];
+    box->g[1][2] =
+        box->g[2][1] = box->box[1][0] * box->box[2][0] +
+                       box->box[1][1] * box->box[2][1] +
+                       box->box[1][2] * box->box[2][2];
+
+    box->g[2][2] = box->box[2][0] * box->box[2][0] +
+                   box->box[2][1] * box->box[2][1] +
+                   box->box[2][2] * box->box[2][2];
+
+    // These proportions are only used for isotropic_NPT!
+    box->side_prop[0] = box->box[0][0] / box->box[0][0];
+    box->side_prop[1] = box->box[1][1] / box->box[0][0];
+    box->side_prop[2] = box->box[2][2] / box->box[0][0];
+}
+
+
+/* setup the simulation box */
+void Setup_Box( real a, real b, real c, real alpha, real beta, real gamma,
         simulation_box* box )
 {
     double c_alpha, c_beta, c_gamma, s_gamma, zi;
 
-    c_alpha = cos(DEG2RAD(alpha));
-    c_beta  = cos(DEG2RAD(beta));
-    c_gamma = cos(DEG2RAD(gamma));
-    s_gamma = sin(DEG2RAD(gamma));
+    if ( IS_NAN_REAL(a) || IS_NAN_REAL(b) || IS_NAN_REAL(c)
+            || IS_NAN_REAL(alpha) || IS_NAN_REAL(beta) || IS_NAN_REAL(gamma) )
+    {
+        fprintf( stderr, "Invalid simulation box boundaries for big box (NaN). Terminating...\n" );
+        exit( INVALID_INPUT );
+    }
 
-    zi = (c_alpha - c_beta * c_gamma)/s_gamma; 
+    c_alpha = COS(DEG2RAD(alpha));
+    c_beta  = COS(DEG2RAD(beta));
+    c_gamma = COS(DEG2RAD(gamma));
+    s_gamma = SIN(DEG2RAD(gamma));
+    zi = (c_alpha - c_beta * c_gamma) / s_gamma;
 
-    box->box[0][0] = a; 
-    box->box[0][1] = 0.0; 
+    box->box[0][0] = a;
+    box->box[0][1] = 0.0;
     box->box[0][2] = 0.0;
-
-    box->box[1][0] = b * c_gamma; 
-    box->box[1][1] = b * s_gamma; 
-    box->box[1][2] = 0.0; 
-
+    box->box[1][0] = b * c_gamma;
+    box->box[1][1] = b * s_gamma;
+    box->box[1][2] = 0.0;
     box->box[2][0] = c * c_beta;
     box->box[2][1] = c * zi;
     box->box[2][2] = c * SQRT(1.0 - SQR(c_beta) - SQR(zi));
+#if defined(DEBUG)
+    fprintf( stderr, "box is %8.2f x %8.2f x %8.2f\n",
+             box->box[0][0], box->box[1][1], box->box[2][2] );
+#endif
 
     Make_Consistent( box );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "box is %8.2f x %8.2f x %8.2f\n", 
-            box->box[0][0], box->box[1][1], box->box[2][2] );
-#endif
 }
 
 
@@ -60,8 +191,8 @@ void Update_Box( rtensor box_tensor, simulation_box* box )
 {
     int i, j;
 
-    for (i=0; i < 3; i++)
-        for (j=0; j < 3; j++)
+    for (i = 0; i < 3; i++)
+        for (j = 0; j < 3; j++)
             box->box[i][j] = box_tensor[i][j];
 
     Make_Consistent( box );
@@ -70,200 +201,37 @@ void Update_Box( rtensor box_tensor, simulation_box* box )
 
 void Update_Box_Isotropic( simulation_box *box, real mu )
 {
-    /*box->box[0][0] = 
+    /*box->box[0][0] =
       POW( V_new / ( box->side_prop[1] * box->side_prop[2] ), 1.0/3.0 );
-      box->box[1][1] = box->box[0][0] * box->side_prop[1];
-      box->box[2][2] = box->box[0][0] * box->side_prop[2]; 
-     */
+    box->box[1][1] = box->box[0][0] * box->side_prop[1];
+    box->box[2][2] = box->box[0][0] * box->side_prop[2];
+    */
     rtensor_Copy( box->old_box, box->box );
     box->box[0][0] *= mu;
     box->box[1][1] *= mu;
     box->box[2][2] *= mu;
 
-    box->volume = box->box[0][0]*box->box[1][1]*box->box[2][2];
+    box->volume = box->box[0][0] * box->box[1][1] * box->box[2][2];
     Make_Consistent(box/*, periodic*/);
 }
 
 
 void Update_Box_SemiIsotropic( simulation_box *box, rvec mu )
 {
-    /*box->box[0][0] = 
+    /*box->box[0][0] =
       POW( V_new / ( box->side_prop[1] * box->side_prop[2] ), 1.0/3.0 );
-      box->box[1][1] = box->box[0][0] * box->side_prop[1];
-      box->box[2][2] = box->box[0][0] * box->side_prop[2]; */
+    box->box[1][1] = box->box[0][0] * box->side_prop[1];
+    box->box[2][2] = box->box[0][0] * box->side_prop[2]; */
     rtensor_Copy( box->old_box, box->box );
     box->box[0][0] *= mu[0];
     box->box[1][1] *= mu[1];
     box->box[2][2] *= mu[2];
 
-    box->volume = box->box[0][0]*box->box[1][1]*box->box[2][2];
+    box->volume = box->box[0][0] * box->box[1][1] * box->box[2][2];
     Make_Consistent(box);
 }
 
 
-void Make_Consistent(simulation_box* box)
-{
-    real one_vol;
-
-    box->volume = 
-        box->box[0][0] * (box->box[1][1]*box->box[2][2] - 
-                box->box[2][1]*box->box[2][1]) +
-        box->box[0][1] * (box->box[2][0]*box->box[1][2] -
-                box->box[1][0]*box->box[2][2]) +
-        box->box[0][2] * (box->box[1][0]*box->box[2][1] -
-                box->box[2][0]*box->box[1][1]);
-
-    one_vol = 1.0/box->volume;
-
-    box->box_inv[0][0] = (box->box[1][1]*box->box[2][2] -
-            box->box[1][2]*box->box[2][1]) * one_vol;
-    box->box_inv[0][1] = (box->box[0][2]*box->box[2][1] -
-            box->box[0][1]*box->box[2][2]) * one_vol;
-    box->box_inv[0][2] = (box->box[0][1]*box->box[1][2] -
-            box->box[0][2]*box->box[1][1]) * one_vol;
-
-    box->box_inv[1][0] = (box->box[1][2]*box->box[2][0] -
-            box->box[1][0]*box->box[2][2]) * one_vol;
-    box->box_inv[1][1] = (box->box[0][0]*box->box[2][2] -
-            box->box[0][2]*box->box[2][0]) * one_vol;
-    box->box_inv[1][2] = (box->box[0][2]*box->box[1][0] -
-            box->box[0][0]*box->box[1][2]) * one_vol;
-
-    box->box_inv[2][0] = (box->box[1][0]*box->box[2][1] -
-            box->box[1][1]*box->box[2][0]) * one_vol;
-    box->box_inv[2][1] = (box->box[0][1]*box->box[2][0] -
-            box->box[0][0]*box->box[2][1]) * one_vol;
-    box->box_inv[2][2] = (box->box[0][0]*box->box[1][1] -
-            box->box[0][1]*box->box[1][0]) * one_vol;
-
-    box->box_norms[0] = SQRT( SQR(box->box[0][0]) +
-            SQR(box->box[0][1]) +
-            SQR(box->box[0][2]) );
-    box->box_norms[1] = SQRT( SQR(box->box[1][0]) +
-            SQR(box->box[1][1]) +
-            SQR(box->box[1][2]) );
-    box->box_norms[2] = SQRT( SQR(box->box[2][0]) +
-            SQR(box->box[2][1]) +
-            SQR(box->box[2][2]) );
-
-    box->trans[0][0] = box->box[0][0]/box->box_norms[0]; 
-    box->trans[0][1] = box->box[1][0]/box->box_norms[0];
-    box->trans[0][2] = box->box[2][0]/box->box_norms[0];
-
-    box->trans[1][0] = box->box[0][1]/box->box_norms[1]; 
-    box->trans[1][1] = box->box[1][1]/box->box_norms[1];
-    box->trans[1][2] = box->box[2][1]/box->box_norms[1];
-
-    box->trans[2][0] = box->box[0][2]/box->box_norms[2]; 
-    box->trans[2][1] = box->box[1][2]/box->box_norms[2];
-    box->trans[2][2] = box->box[2][2]/box->box_norms[2];
-
-    one_vol = box->box_norms[0]*box->box_norms[1]*box->box_norms[2]*one_vol;
-
-    box->trans_inv[0][0] = (box->trans[1][1]*box->trans[2][2] -
-            box->trans[1][2]*box->trans[2][1]) * one_vol;
-    box->trans_inv[0][1] = (box->trans[0][2]*box->trans[2][1] -
-            box->trans[0][1]*box->trans[2][2]) * one_vol;
-    box->trans_inv[0][2] = (box->trans[0][1]*box->trans[1][2] -
-            box->trans[0][2]*box->trans[1][1]) * one_vol;
-
-    box->trans_inv[1][0] = (box->trans[1][2]*box->trans[2][0] -
-            box->trans[1][0]*box->trans[2][2]) * one_vol;
-    box->trans_inv[1][1] = (box->trans[0][0]*box->trans[2][2] -
-            box->trans[0][2]*box->trans[2][0]) * one_vol;
-    box->trans_inv[1][2] = (box->trans[0][2]*box->trans[1][0] -
-            box->trans[0][0]*box->trans[1][2]) * one_vol;
-
-    box->trans_inv[2][0] = (box->trans[1][0]*box->trans[2][1] -
-            box->trans[1][1]*box->trans[2][0]) * one_vol;
-    box->trans_inv[2][1] = (box->trans[0][1]*box->trans[2][0] -
-            box->trans[0][0]*box->trans[2][1]) * one_vol;
-    box->trans_inv[2][2] = (box->trans[0][0]*box->trans[1][1] -
-            box->trans[0][1]*box->trans[1][0]) * one_vol;
-
-    //   for (i=0; i < 3; i++)
-    //     {
-    //       for (j=0; j < 3; j++)
-    //     fprintf(stderr,"%lf\t",box->trans[i][j]);
-    //       fprintf(stderr,"\n");
-    //     }
-    //   fprintf(stderr,"\n");
-    //   for (i=0; i < 3; i++)
-    //     {
-    //       for (j=0; j < 3; j++)
-    //     fprintf(stderr,"%lf\t",box->trans_inv[i][j]);
-    //       fprintf(stderr,"\n");
-    //     }
-
-
-    box->g[0][0] = box->box[0][0] * box->box[0][0] +
-        box->box[0][1] * box->box[0][1] +
-        box->box[0][2] * box->box[0][2];
-    box->g[1][0] = 
-        box->g[0][1] = box->box[0][0] * box->box[1][0] +
-        box->box[0][1] * box->box[1][1] +
-        box->box[0][2] * box->box[1][2];
-    box->g[2][0] =
-        box->g[0][2] = box->box[0][0] * box->box[2][0] +
-        box->box[0][1] * box->box[2][1] +
-        box->box[0][2] * box->box[2][2];
-
-    box->g[1][1] = box->box[1][0] * box->box[1][0] +
-        box->box[1][1] * box->box[1][1] +
-        box->box[1][2] * box->box[1][2];
-    box->g[1][2] =
-        box->g[2][1] = box->box[1][0] * box->box[2][0] +
-        box->box[1][1] * box->box[2][1] +
-        box->box[1][2] * box->box[2][2];
-
-    box->g[2][2] = box->box[2][0] * box->box[2][0] +
-        box->box[2][1] * box->box[2][1] +
-        box->box[2][2] * box->box[2][2];
-
-    // These proportions are only used for isotropic_NPT!
-    box->side_prop[0] = box->box[0][0] / box->box[0][0];
-    box->side_prop[1] = box->box[1][1] / box->box[0][0];
-    box->side_prop[2] = box->box[2][2] / box->box[0][0];
-}
-
-
-void Transform( rvec x1, simulation_box *box, char flag, rvec x2 )
-{
-    int i, j;
-    real tmp;
-
-    //  printf(">x1: (%lf, %lf, %lf)\n",x1[0],x1[1],x1[2]);
-
-    if (flag > 0) {
-        for (i=0; i < 3; i++) {
-            tmp = 0.0;
-            for (j=0; j < 3; j++)
-                tmp += box->trans[i][j]*x1[j]; 
-            x2[i] = tmp;
-        }
-    }
-    else {
-        for (i=0; i < 3; i++) {
-            tmp = 0.0;
-            for (j=0; j < 3; j++)
-                tmp += box->trans_inv[i][j]*x1[j]; 
-            x2[i] = tmp;
-        }
-    }
-    //  printf(">x2: (%lf, %lf, %lf)\n", x2[0], x2[1], x2[2]);  
-}
-
-
-void Transform_to_UnitBox( rvec x1, simulation_box *box, char flag, rvec x2 )
-{
-    Transform( x1, box, flag, x2 );
-
-    x2[0] /= box->box_norms[0];
-    x2[1] /= box->box_norms[1];
-    x2[2] /= box->box_norms[2];
-}
-
-
 void Distance_on_T3_Gen( rvec x1, rvec x2, simulation_box* box, rvec r )
 {
     rvec xa, xb, ra;
@@ -301,12 +269,12 @@ void Inc_on_T3_Gen( rvec x, rvec dx, simulation_box* box )
 real Metric_Product( rvec x1, rvec x2, simulation_box* box )
 {
     int i, j;
-    real dist=0.0, tmp;
+    real dist = 0.0, tmp;
 
-    for( i = 0; i < 3; i++ )
+    for ( i = 0; i < 3; i++ )
     {
         tmp = 0.0;
-        for( j = 0; j < 3; j++ )
+        for ( j = 0; j < 3; j++ )
             tmp += box->g[i][j] * x2[j];
         dist += x1[i] * tmp;
     }
@@ -315,12 +283,59 @@ real Metric_Product( rvec x1, rvec x2, simulation_box* box )
 }
 
 
-/* Determines if the distance between x1 and x2 is < vlist_cut. 
+int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
+                       real cutoff, far_neighbor_data *data )
+{
+    real norm_sqr, d, tmp;
+    int i;
+    /* Returns TRUE iff the wrapped x1->x2 distance is <= cutoff; fills *data. */
+    norm_sqr = 0;
+    /* dvec and rel_box are always written; d only when TRUE is returned */
+    for ( i = 0; i < 3; i++ )
+    {
+        d = x2[i] - x1[i];
+        tmp = SQR(d);
+        /* component is at least half a box length: shift by one box_norms[i] */
+        if ( tmp >= SQR( box->box_norms[i] / 2.0 ) )
+        {
+            if ( x2[i] > x1[i] )
+            {
+                d -= box->box_norms[i];
+                data->rel_box[i] = -1;
+            }
+            else
+            {
+                d += box->box_norms[i];
+                data->rel_box[i] = +1;
+            }
+
+            data->dvec[i] = d;
+            norm_sqr += SQR(d);
+        }
+        else
+        {
+            data->dvec[i] = d;
+            norm_sqr += tmp;
+            data->rel_box[i] = 0;
+        }
+    }
+
+    if ( norm_sqr <= SQR(cutoff) )
+    {
+        data->d = SQRT(norm_sqr);
+        return TRUE;
+    }
+    /* outside the cutoff: data->d is left untouched */
+    return FALSE;
+}
+
+
+/* Determines if the distance between x1 and x2 is < vlist_cut.
    If so, this neighborhood is added to the list of far neighbors.
    Periodic boundary conditions do not apply. */
-void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, 
-        control_params *control, 
-        far_neighbor_data *new_nbrs, int *count )
+void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
+                                    control_params *control,
+                                    far_neighbor_data *new_nbrs, int *count )
 {
     real norm_sqr;
 
@@ -328,7 +343,8 @@ void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
 
     norm_sqr = rvec_Norm_Sqr( new_nbrs[0].dvec );
 
-    if( norm_sqr <= SQR( control->vlist_cut ) ) {
+    if ( norm_sqr <= SQR( control->vlist_cut ) )
+    {
         *count = 1;
         new_nbrs[0].d = SQRT( norm_sqr );
 
@@ -341,11 +357,11 @@ void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
 
 /* Finds periodic neighbors in a 'big_box'. Here 'big_box' means:
    the current simulation box has all dimensions > 2 *vlist_cut.
-   If the periodic distance between x1 and x2 is than vlist_cut, this 
+   If the periodic distance between x1 and x2 is within vlist_cut, this
    neighborhood is added to the list of far neighbors. */
-void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box, 
-        control_params *control, 
-        far_neighbor_data *periodic_nbrs, 
+void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box,
+        control_params *control,
+        far_neighbor_data *periodic_nbrs,
         int *count )
 {
     real norm_sqr, d, tmp;
@@ -353,19 +369,23 @@ void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box,
 
     norm_sqr = 0;
 
-    for( i = 0; i < 3; i++ ) {
+    for ( i = 0; i < 3; i++ )
+    {
         d = x2[i] - x1[i];
         tmp = SQR(d);
         // fprintf(out,"Inside Sq_Distance_on_T3, %d, %lf, %lf\n",
         // i,tmp,SQR(box->box_norms[i]/2.0));
 
-        if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) {    
-            if( x2[i] > x1[i] ) {
+        if ( tmp >= SQR( box->box_norms[i] / 2.0 ) )
+        {
+            if ( x2[i] > x1[i] )
+            {
                 d -= box->box_norms[i];
                 periodic_nbrs[0].rel_box[i] = -1;
                 // periodic_nbrs[0].ext_factor[i] = +1;
             }
-            else {
+            else
+            {
                 d += box->box_norms[i];
                 periodic_nbrs[0].rel_box[i] = +1;
                 // periodic_nbrs[0].ext_factor[i] = -1;
@@ -374,15 +394,17 @@ void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box,
             periodic_nbrs[0].dvec[i] = d;
             norm_sqr += SQR(d);
         }
-        else {
+        else
+        {
             periodic_nbrs[0].dvec[i] = d;
             norm_sqr += tmp;
             periodic_nbrs[0].rel_box[i]   = 0;
             // periodic_nbrs[0].ext_factor[i] = 0;
-        } 
+        }
     }
 
-    if( norm_sqr <= SQR( control->vlist_cut ) ) {
+    if ( norm_sqr <= SQR( control->vlist_cut ) )
+    {
         *count = 1;
         periodic_nbrs[0].d = SQRT( norm_sqr );
     }
@@ -390,16 +412,16 @@ void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box,
 }
 
 
-/* Finds all periodic far neighborhoods between x1 and x2 
+/* Finds all periodic far neighborhoods between x1 and x2
    ((dist(x1, x2') < vlist_cut, periodic images of x2 are also considered).
    Here the box is 'small' meaning that at least one dimension is < 2*vlist_cut.
-IMPORTANT: This part might need some improvement. In NPT, the simulation box 
-might get too small (such as <5 A!). In this case we have to consider the 
-periodic images of x2 that are two boxs away!!!
- */
+   IMPORTANT: This part might need some improvement. In NPT, the simulation box
+   might get too small (such as <5 A!). In this case we have to consider the
+   periodic images of x2 that are two boxes away!!!
+*/
 void Get_Periodic_Far_Neighbors_Small_Box( rvec x1, rvec x2, simulation_box *box,
-        control_params *control, 
-        far_neighbor_data *periodic_nbrs, 
+        control_params *control,
+        far_neighbor_data *periodic_nbrs,
         int *count )
 {
     int i, j, k;
@@ -418,14 +440,18 @@ void Get_Periodic_Far_Neighbors_Small_Box( rvec x1, rvec x2, simulation_box *box
       imax, jmax, kmax ); */
 
 
-    for( i = -imax; i <= imax; ++i )
-        if(fabs(d_i=((x2[0]+i*box->box_norms[0])-x1[0]))<=control->vlist_cut) {
-            for( j = -jmax; j <= jmax; ++j )
-                if(fabs(d_j=((x2[1]+j*box->box_norms[1])-x1[1]))<=control->vlist_cut) {
-                    for( k = -kmax; k <= kmax; ++k )
-                        if(fabs(d_k=((x2[2]+k*box->box_norms[2])-x1[2]))<=control->vlist_cut) {
+    for ( i = -imax; i <= imax; ++i )
+        if (fabs(d_i = ((x2[0] + i * box->box_norms[0]) - x1[0])) <= control->vlist_cut)
+        {
+            for ( j = -jmax; j <= jmax; ++j )
+                if (fabs(d_j = ((x2[1] + j * box->box_norms[1]) - x1[1])) <= control->vlist_cut)
+                {
+                    for ( k = -kmax; k <= kmax; ++k )
+                        if (fabs(d_k = ((x2[2] + k * box->box_norms[2]) - x1[2])) <= control->vlist_cut)
+                        {
                             sqr_norm = SQR(d_i) + SQR(d_j) + SQR(d_k);
-                            if( sqr_norm <= SQR(control->vlist_cut) ) {
+                            if ( sqr_norm <= SQR(control->vlist_cut) )
+                            {
                                 periodic_nbrs[ *count ].d = SQRT( sqr_norm );
 
                                 periodic_nbrs[ *count ].dvec[0] = d_i;
@@ -466,21 +492,21 @@ void Get_Periodic_Far_Neighbors_Small_Box( rvec x1, rvec x2, simulation_box *box
 
 /* Returns the mapping for the neighbor box pointed by (ix,iy,iz) */
 /*int Get_Nbr_Box( simulation_box *box, int ix, int iy, int iz )
-  {
-  return (9 * ix + 3 * iy + iz + 13);  
-// 13 is to handle negative indexes properly
+{
+  return (9 * ix + 3 * iy + iz + 13);
+  // 13 is to handle negative indexes properly
 }*/
 
 
 /* Returns total pressure vector for the neighbor box pointed by (ix,iy,iz) */
 /*rvec Get_Nbr_Box_Press( simulation_box *box, int ix, int iy, int iz )
-  {
+{
   int map;
 
-  map = 9 * ix + 3 * iy + iz + 13;  
-// 13 is to adjust -1,-1,-1 correspond to index 0
+  map = 9 * ix + 3 * iy + iz + 13;
+  // 13 is to adjust -1,-1,-1 correspond to index 0
 
-return box->nbr_box_press[map];
+  return box->nbr_box_press[map];
 }*/
 
 
@@ -489,53 +515,53 @@ return box->nbr_box_press[map];
   {
   int map;
 
-  map = 9 * ix + 3 * iy + iz + 13;  
-// 13 is to adjust -1,-1,-1 correspond to index 0
+  map = 9 * ix + 3 * iy + iz + 13;
+  // 13 is to adjust -1,-1,-1 correspond to index 0
 
-rvec_Add( box->nbr_box_press[map], v );
+  rvec_Add( box->nbr_box_press[map], v );
 }*/
 
 
 /* Increments the total pressure vector for the neighbor box mapped to 'map' */
 /*void Inc_Nbr_Box_Press( simulation_box *box, int map, rvec v )
-  {
+{
   rvec_Add( box->nbr_box_press[map], v );
-  }*/
+}*/
 
 
-void Print_Box_Information( simulation_box* box, FILE *out )
+void Print_Box( simulation_box* box, FILE *out )
 {
     int i, j;
 
     fprintf( out, "box: {" );
-    for( i = 0; i < 3; ++i )
+    for ( i = 0; i < 3; ++i )
     {
         fprintf( out, "{" );
-        for( j = 0; j < 3; ++j )
+        for ( j = 0; j < 3; ++j )
             fprintf( out, "%8.3f ", box->box[i][j] );
         fprintf( out, "}" );
     }
     fprintf( out, "}\n" );
 
-    fprintf( out, "V: %8.3f\tdims: {%8.3f, %8.3f, %8.3f}\n", 
-            box->volume, 
-            box->box_norms[0], box->box_norms[1], box->box_norms[2] );
+    fprintf( out, "V: %8.3f\tdims: {%8.3f, %8.3f, %8.3f}\n",
+             box->volume,
+             box->box_norms[0], box->box_norms[1], box->box_norms[2] );
 
     fprintf( out, "box_trans: {" );
-    for( i = 0; i < 3; ++i )
+    for ( i = 0; i < 3; ++i )
     {
         fprintf( out, "{" );
-        for( j = 0; j < 3; ++j )
+        for ( j = 0; j < 3; ++j )
             fprintf( out, "%8.3f ", box->trans[i][j] );
         fprintf( out, "}" );
     }
     fprintf( out, "}\n" );
 
     fprintf( out, "box_trinv: {" );
-    for( i = 0; i < 3; ++i )
+    for ( i = 0; i < 3; ++i )
     {
         fprintf( out, "{" );
-        for( j = 0; j < 3; ++j )
+        for ( j = 0; j < 3; ++j )
             fprintf( out, "%8.3f ", box->trans_inv[i][j] );
         fprintf( out, "}" );
     }
diff --git a/PuReMD-GPU/src/box.h b/PuReMD-GPU/src/box.h
index 418aa6208a81fb05ee56ff09afc6ff76751f75c9..84f8371becafbc86180e75b7b8a189c299c82493 100644
--- a/PuReMD-GPU/src/box.h
+++ b/PuReMD-GPU/src/box.h
@@ -25,9 +25,7 @@
 #include "mytypes.h"
 
 
-/* Initializes box from CRYST1 line of PDB */
-void Init_Box_From_CRYST(real, real, real, real, real, real,
-        simulation_box*/*, int*/);
+void Setup_Box( real, real, real, real, real, real, simulation_box* );
 
 /* Initializes box from box rtensor */
 void Update_Box(rtensor, simulation_box* /*, int*/);
@@ -36,14 +34,9 @@ void Update_Box_SemiIsotropic( simulation_box*, rvec /*, int*/ );
 
 /* Computes all the transformations,
    metric and other quantities from box rtensor */
-void Make_Consistent(simulation_box*/*, int*/ );
-
-/* Applies transformation to and from
-   Cartesian to Triclinic coordinates based on flag */
-/* Use -1 flag for Cartesian -> Triclinic and +1 for otherway */
-void Transform( rvec, simulation_box*, char, rvec );
-void Transform_to_UnitBox( rvec, simulation_box*, char, rvec );
+void Make_Consistent( simulation_box* );
 
+int Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* );
 void Get_NonPeriodic_Far_Neighbors( rvec, rvec, simulation_box*,
         control_params*, far_neighbor_data*, int* );
 void Get_Periodic_Far_Neighbors_Big_Box( rvec, rvec, simulation_box*,
@@ -59,10 +52,6 @@ void Inc_Nbr_Box_Press( simulation_box*, int, int, int, rvec );*/
 
 /* These functions assume that the coordinates are in triclinic system */
 /* this function returns cartesian norm but triclinic distance vector */
-real Metric_Product( rvec, rvec, simulation_box* );
-
-void Print_Box_Information( simulation_box*, FILE* );
-
 static inline HOST_DEVICE real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box, rvec r)
 {
 
@@ -113,5 +102,9 @@ static inline HOST_DEVICE void Inc_on_T3( rvec x, rvec dx, simulation_box *box )
     }
 }
 
+real Metric_Product( rvec, rvec, simulation_box* );
+
+void Print_Box( simulation_box*, FILE* );
+
 
 #endif
diff --git a/PuReMD-GPU/src/control.c b/PuReMD-GPU/src/control.c
new file mode 100644
index 0000000000000000000000000000000000000000..41f744969f1615ba621d85db852d998e92719b86
--- /dev/null
+++ b/PuReMD-GPU/src/control.c
@@ -0,0 +1,591 @@
+/*----------------------------------------------------------------------
+  SerialReax - Reax Force Field Simulator
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include <ctype.h>
+
+#include "control.h"
+#include "traj.h"
+#include "tool_box.h"
+
+
+/* Reads the simulation control file and fills in control and out_control.
+ *
+ * All parameters are first set to their defaults. Each line of fp is then
+ * split by Tokenize and dispatched on its first token (the parameter name);
+ * lines without any token are skipped. Finally the target temperature, the
+ * cutoffs taken from the force field globals and the grid cell size/spread
+ * are derived.
+ *
+ * fp          - control file stream to read from
+ * system      - system->reaxprm.gp.l[] is read for the cutoffs (so it must
+ *               be filled in before this call); system->g is written
+ * control     - receives the simulation parameters
+ * out_control - receives the restart, logging and trajectory settings
+ *
+ * Returns SUCCESS. Calls exit( UNKNOWN_OPTION ) on an unrecognized
+ * parameter name and exit( INVALID_INPUT ) on a stream read error. */
+char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
+        output_controls *out_control )
+{
+    char *s, **tmp;
+    int c, i;
+    real val;
+    int ival;
+
+    /* assign default values */
+    strcpy( control->sim_name, "default.sim" );
+
+    control->restart = 0;
+    out_control->restart_format = WRITE_BINARY;
+    out_control->restart_freq = 0;
+    strcpy( control->restart_from, "default.res" );
+    control->random_vel = 0;
+
+    control->reposition_atoms = 0;
+
+    control->ensemble = NVE;
+    control->nsteps = 0;
+    control->dt = 0.25;
+
+    control->geo_format = PDB;
+    control->restrict_bonds = 0;
+
+    control->periodic_boundaries = 1;
+    control->periodic_images[0] = 0;
+    control->periodic_images[1] = 0;
+    control->periodic_images[2] = 0;
+
+    control->reneighbor = 1;
+    control->vlist_cut = 0;
+    control->nbr_cut = 4.;
+    control->r_cut = 10.;
+    control->r_sp_cut = 10.;
+    control->max_far_nbrs = 1000;
+    control->bo_cut = 0.01;
+    control->thb_cut = 0.001;
+    control->hb_cut = 7.50;
+
+    control->tabulate = 0;
+
+    control->qeq_solver_type = GMRES_S;
+    control->qeq_solver_q_err = 0.000001;
+    control->qeq_domain_sparsify_enabled = FALSE;
+    control->qeq_domain_sparsity = 1.0;
+    control->pre_comp_type = ICHOLT_PC;
+    control->pre_comp_sweeps = 3;
+    control->pre_comp_refactor = 100;
+    control->pre_comp_droptol = 0.01;
+    control->pre_app_type = TRI_SOLVE_PA;
+    control->pre_app_jacobi_iters = 50;
+
+    control->T_init = 0.;
+    control->T_final = 300.;
+    control->Tau_T = 1.0;
+    control->T_mode = 0.;
+    control->T_rate = 1.;
+    control->T_freq = 1.;
+
+    control->P[0] = 0.000101325;
+    control->P[1] = 0.000101325;
+    control->P[2] = 0.000101325;
+    control->Tau_P[0]  = 500.0;
+    control->Tau_P[1]  = 500.0;
+    control->Tau_P[2]  = 500.0;
+    control->Tau_PT = 500.0;
+    control->compressibility = 1.0;
+    control->press_mode = 0;
+
+    control->remove_CoM_vel = 25;
+
+    out_control->debug_level = 0;
+    out_control->energy_update_freq = 10;
+
+    out_control->write_steps = 100;
+    out_control->traj_compress = 0;
+    out_control->write = fprintf;
+    out_control->traj_format = 0;
+    out_control->write_header =
+        (int (*)( reax_system*, control_params*,
+                  static_storage*, void* )) Write_Custom_Header;
+    out_control->append_traj_frame =
+        (int (*)( reax_system*, control_params*, simulation_data*,
+                  static_storage*, list **, void* )) Append_Custom_Frame;
+
+    strcpy( out_control->traj_title, "default_title" );
+    out_control->atom_format = 0;
+    out_control->bond_info = 0;
+    out_control->angle_info = 0;
+
+    control->molec_anal = NO_ANALYSIS;
+    control->freq_molec_anal = 0;
+    control->bg_cut = 0.3;
+    control->num_ignored = 0;
+    memset( control->ignore, 0, sizeof(int)*MAX_ATOM_TYPES );
+
+    control->dipole_anal = 0;
+    control->freq_dipole_anal = 0;
+
+    control->diffusion_coef = 0;
+    control->freq_diffusion_coef = 0;
+    control->restrict_type = 0;
+
+    /* memory allocations */
+    s = (char*) malloc(sizeof(char) * MAX_LINE);
+    tmp = (char**) malloc(sizeof(char*)*MAX_TOKENS);
+    for (i = 0; i < MAX_TOKENS; i++)
+        tmp[i] = (char*) malloc(sizeof(char) * MAX_LINE);
+
+    /* read control parameters file */
+    while (fgets(s, MAX_LINE, fp))
+    {
+        c = Tokenize(s, &tmp);
+
+        /* Tokenize appears to overwrite only the tokens it finds, so a line
+         * without tokens would leave tmp[] holding the previous line (or
+         * uninitialized memory on the first line): skip such lines */
+        if ( c < 1 )
+        {
+            continue;
+        }
+
+        if ( strcmp(tmp[0], "simulation_name") == 0 )
+        {
+            strcpy( control->sim_name, tmp[1] );
+        }
+        //else if( strcmp(tmp[0], "restart") == 0 ) {
+        //  ival = atoi(tmp[1]);
+        //  control->restart = ival;
+        //}
+        else if ( strcmp(tmp[0], "restart_format") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->restart_format = ival;
+        }
+        else if ( strcmp(tmp[0], "restart_freq") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->restart_freq = ival;
+        }
+        else if ( strcmp(tmp[0], "random_vel") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->random_vel = ival;
+        }
+        else if ( strcmp(tmp[0], "reposition_atoms") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->reposition_atoms = ival;
+        }
+        else if ( strcmp(tmp[0], "ensemble_type") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->ensemble = ival;
+        }
+        else if ( strcmp(tmp[0], "nsteps") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->nsteps = ival;
+        }
+        else if ( strcmp(tmp[0], "dt") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->dt = val * 1.e-3;  // convert dt from fs to ps!
+        }
+        else if ( strcmp(tmp[0], "periodic_boundaries") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->periodic_boundaries = ival;
+        }
+        else if ( strcmp(tmp[0], "periodic_images") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->periodic_images[0] = ival;
+            ival = atoi(tmp[2]);
+            control->periodic_images[1] = ival;
+            ival = atoi(tmp[3]);
+            control->periodic_images[2] = ival;
+        }
+        else if ( strcmp(tmp[0], "geo_format") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->geo_format = ival;
+        }
+        else if ( strcmp(tmp[0], "restrict_bonds") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->restrict_bonds = ival;
+        }
+        else if ( strcmp(tmp[0], "tabulate_long_range") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->tabulate = ival;
+        }
+        else if ( strcmp(tmp[0], "reneighbor") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->reneighbor = ival;
+        }
+        else if ( strcmp(tmp[0], "vlist_buffer") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->vlist_cut = val;
+        }
+        else if ( strcmp(tmp[0], "nbrhood_cutoff") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->nbr_cut = val;
+        }
+        else if ( strcmp(tmp[0], "thb_cutoff") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->thb_cut = val;
+        }
+        else if ( strcmp(tmp[0], "hbond_cutoff") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->hb_cut = val;
+        }
+        else if ( strcmp(tmp[0], "qeq_solver_type") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->qeq_solver_type = ival;
+        }
+        else if ( strcmp(tmp[0], "qeq_solver_q_err") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->qeq_solver_q_err = val;
+        }
+        else if ( strcmp(tmp[0], "qeq_domain_sparsity") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->qeq_domain_sparsity = val;
+            control->qeq_domain_sparsify_enabled = TRUE;
+        }
+        else if ( strcmp(tmp[0], "pre_comp_type") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->pre_comp_type = ival;
+        }
+        else if ( strcmp(tmp[0], "pre_comp_refactor") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->pre_comp_refactor = ival;
+        }
+        else if ( strcmp(tmp[0], "pre_comp_droptol") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->pre_comp_droptol = val;
+        }
+        else if ( strcmp(tmp[0], "pre_comp_sweeps") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->pre_comp_sweeps = ival;
+        }
+        else if ( strcmp(tmp[0], "pre_app_type") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->pre_app_type = ival;
+        }
+        else if ( strcmp(tmp[0], "pre_app_jacobi_iters") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->pre_app_jacobi_iters = ival;
+        }
+        else if ( strcmp(tmp[0], "temp_init") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->T_init = val;
+
+            if ( control->T_init < 0.001 )
+                control->T_init = 0.001;
+        }
+        else if ( strcmp(tmp[0], "temp_final") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->T_final = val;
+
+            if ( control->T_final < 0.1 )
+                control->T_final = 0.1;
+        }
+        else if ( strcmp(tmp[0], "t_mass") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->Tau_T = val * 1.e-3;    // convert t_mass from fs to ps
+        }
+        else if ( strcmp(tmp[0], "t_mode") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->T_mode = ival;
+        }
+        else if ( strcmp(tmp[0], "t_rate") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->T_rate = val;
+        }
+        else if ( strcmp(tmp[0], "t_freq") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->T_freq = val;
+        }
+        else if ( strcmp(tmp[0], "pressure") == 0 )
+        {
+            if ( control->ensemble == iNPT )
+            {
+                val = atof(tmp[1]);
+                control->P[0] = control->P[1] = control->P[2] = val;
+            }
+            else if ( control->ensemble == sNPT )
+            {
+                val = atof(tmp[1]);
+                control->P[0] = val;
+
+                val = atof(tmp[2]);
+                control->P[1] = val;
+
+                val = atof(tmp[3]);
+                control->P[2] = val;
+            }
+        }
+        else if ( strcmp(tmp[0], "p_mass") == 0 )
+        {
+            if ( control->ensemble == iNPT )
+            {
+                val = atof(tmp[1]);
+                control->Tau_P[0] = val * 1.e-3;   // convert p_mass from fs to ps
+            }
+            else if ( control->ensemble == sNPT )
+            {
+                val = atof(tmp[1]);
+                control->Tau_P[0] = val * 1.e-3;   // convert p_mass from fs to ps
+
+                val = atof(tmp[2]);
+                control->Tau_P[1] = val * 1.e-3;   // convert p_mass from fs to ps
+
+                val = atof(tmp[3]);
+                control->Tau_P[2] = val * 1.e-3;   // convert p_mass from fs to ps
+            }
+        }
+        else if ( strcmp(tmp[0], "pt_mass") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->Tau_PT = val * 1.e-3;  // convert pt_mass from fs to ps
+        }
+        else if ( strcmp(tmp[0], "compress") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->compressibility = val;
+        }
+        else if ( strcmp(tmp[0], "press_mode") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->press_mode = ival;
+        }
+        else if ( strcmp(tmp[0], "remove_CoM_vel") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->remove_CoM_vel = ival;
+        }
+        else if ( strcmp(tmp[0], "debug_level") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->debug_level = ival;
+        }
+        else if ( strcmp(tmp[0], "energy_update_freq") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->energy_update_freq = ival;
+        }
+        else if ( strcmp(tmp[0], "write_freq") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->write_steps = ival;
+        }
+        else if ( strcmp(tmp[0], "traj_compress") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->traj_compress = ival;
+
+            if ( out_control->traj_compress )
+                out_control->write = (int (*)(FILE *, const char *, ...)) gzprintf;
+            else out_control->write = fprintf;
+        }
+        else if ( strcmp(tmp[0], "traj_format") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->traj_format = ival;
+
+            if ( out_control->traj_format == 0 )
+            {
+                out_control->write_header =
+                    (int (*)( reax_system*, control_params*,
+                              static_storage*, void* )) Write_Custom_Header;
+                out_control->append_traj_frame =
+                    (int (*)(reax_system*, control_params*, simulation_data*,
+                             static_storage*, list **, void*)) Append_Custom_Frame;
+            }
+            else if ( out_control->traj_format == 1 )
+            {
+                out_control->write_header =
+                    (int (*)( reax_system*, control_params*,
+                              static_storage*, void* )) Write_xyz_Header;
+                out_control->append_traj_frame =
+                    (int (*)( reax_system*,  control_params*, simulation_data*,
+                              static_storage*, list **, void* )) Append_xyz_Frame;
+            }
+        }
+        else if ( strcmp(tmp[0], "traj_title") == 0 )
+        {
+            strcpy( out_control->traj_title, tmp[1] );
+        }
+        else if ( strcmp(tmp[0], "atom_info") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->atom_format += ival * 4;
+        }
+        else if ( strcmp(tmp[0], "atom_velocities") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->atom_format += ival * 2;
+        }
+        else if ( strcmp(tmp[0], "atom_forces") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->atom_format += ival * 1;
+        }
+        else if ( strcmp(tmp[0], "bond_info") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->bond_info = ival;
+        }
+        else if ( strcmp(tmp[0], "angle_info") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            out_control->angle_info = ival;
+        }
+        else if ( strcmp(tmp[0], "test_forces") == 0 )
+        {
+            ival = atoi(tmp[1]);  // parsed, but the value is not stored
+        }
+        else if ( strcmp(tmp[0], "molec_anal") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->molec_anal = ival;
+        }
+        else if ( strcmp(tmp[0], "freq_molec_anal") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->freq_molec_anal = ival;
+        }
+        else if ( strcmp(tmp[0], "bond_graph_cutoff") == 0 )
+        {
+            val = atof(tmp[1]);
+            control->bg_cut = val;
+        }
+        else if ( strcmp(tmp[0], "ignore") == 0 )
+        {
+            control->num_ignored = atoi(tmp[1]);
+            /* only consume atom types actually present on this line and
+             * only mark those that index inside control->ignore */
+            for ( i = 0; i < control->num_ignored && i + 2 < c; ++i )
+            {
+                ival = atoi(tmp[i + 2]);
+                if ( ival >= 0 && ival < MAX_ATOM_TYPES )
+                    control->ignore[ival] = 1;
+                else
+                    fprintf( stderr, "WARNING: invalid atom type %d in ignore list\n", ival );
+            }
+        }
+        else if ( strcmp(tmp[0], "dipole_anal") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->dipole_anal = ival;
+        }
+        else if ( strcmp(tmp[0], "freq_dipole_anal") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->freq_dipole_anal = ival;
+        }
+        else if ( strcmp(tmp[0], "diffusion_coef") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->diffusion_coef = ival;
+        }
+        else if ( strcmp(tmp[0], "freq_diffusion_coef") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->freq_diffusion_coef = ival;
+        }
+        else if ( strcmp(tmp[0], "restrict_type") == 0 )
+        {
+            ival = atoi(tmp[1]);
+            control->restrict_type = ival;
+        }
+        else
+        {
+            fprintf( stderr, "WARNING: unknown parameter %s\n", tmp[0] );
+            exit( UNKNOWN_OPTION );
+        }
+    }
+
+    if (ferror(fp))
+    {
+        fprintf(stderr, "Error reading control file. Terminating.\n");
+        exit( INVALID_INPUT );
+    }
+
+    /* determine target T */
+    if ( control->T_mode == 0 )
+        control->T = control->T_final;
+    else control->T = control->T_init;
+
+
+    /* near neighbor and far neighbor cutoffs */
+    control->bo_cut = 0.01 * system->reaxprm.gp.l[29];
+    control->r_low  = system->reaxprm.gp.l[11];
+    control->r_cut  = system->reaxprm.gp.l[12];
+    control->r_sp_cut  = control->r_cut * control->qeq_domain_sparsity;
+    control->vlist_cut += control->r_cut;
+
+    system->g.cell_size = control->vlist_cut / 2.;
+    for ( i = 0; i < 3; ++i )
+    {
+        system->g.spread[i] = 2;
+    }
+
+    /* free memory allocations at the top */
+    for ( i = 0; i < MAX_TOKENS; i++ )
+    {
+        free( tmp[i] );
+    }
+    free( tmp );
+    free( s );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr,
+             "en=%d steps=%d dt=%.5f opt=%d T=%.5f P=%.5f %.5f %.5f\n",
+             control->ensemble, control->nsteps, control->dt, control->tabulate,
+             control->T, control->P[0], control->P[1], control->P[2] );
+
+    fprintf(stderr, "control file read\n" );
+#endif
+
+    return SUCCESS;
+}
diff --git a/PuReMD-GPU/src/control.h b/PuReMD-GPU/src/control.h
new file mode 100644
index 0000000000000000000000000000000000000000..66d0dde7b4901d7a7b42512414328a8e6b256d83
--- /dev/null
+++ b/PuReMD-GPU/src/control.h
@@ -0,0 +1,29 @@
+/*----------------------------------------------------------------------
+  SerialReax - Reax Force Field Simulator
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CONTROL_H_
+#define __CONTROL_H_
+
+#include "mytypes.h"
+
+char Read_Control_File( FILE*, reax_system*, control_params*, output_controls* );
+
+#endif
diff --git a/PuReMD-GPU/src/cuda_forces.cu b/PuReMD-GPU/src/cuda_forces.cu
index bf277b391ce0df0c5336ea0a0653b6863ca14fec..754668c9d9be6601aa9e9a649eb920899e39833c 100644
--- a/PuReMD-GPU/src/cuda_forces.cu
+++ b/PuReMD-GPU/src/cuda_forces.cu
@@ -36,10 +36,10 @@
 #include "cuda_three_body_interactions.h"
 #include "cuda_four_body_interactions.h"
 #include "cuda_list.h"
-#include "cuda_QEq.h"
+#include "cuda_qeq.h"
 #include "cuda_reduction.h"
 #include "cuda_system_props.h"
-#include "validation.h"
+#include "cuda_validation.h"
 
 #include "cudaProfiler.h"
 
diff --git a/PuReMD-GPU/src/cuda_init_md.cu b/PuReMD-GPU/src/cuda_init_md.cu
index 1a205506e4c5ff767e02398a3859f838818c1e1a..f0252a2f564a035d7317ea0698712318ee96df5e 100644
--- a/PuReMD-GPU/src/cuda_init_md.cu
+++ b/PuReMD-GPU/src/cuda_init_md.cu
@@ -49,7 +49,7 @@
 #include "cuda_reduction.h"
 #include "cuda_reset_utils.h"
 #include "cuda_system_props.h"
-#include "validation.h"
+#include "cuda_validation.h"
 
 
 void Cuda_Init_System( reax_system *system, control_params *control, 
@@ -116,7 +116,9 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
     Reset_Simulation_Data( data );
 
     if( !control->restart )  
+    {
         data->step = data->prev_steps = 0;
+    }
 
     switch( control->ensemble ) {
         case NVE:
@@ -124,16 +126,18 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
             *Evolve = Cuda_Velocity_Verlet_NVE;
             break;
 
-
         case NVT:
             data->N_f = 3 * system->N + 1;
             //control->Tau_T = 100 * data->N_f * K_B * control->T_final;
-            if( !control->restart || (control->restart && control->random_vel) ) {
+
+            if( !control->restart || (control->restart && control->random_vel) )
+            {
                 data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
                         data->N_f * K_B * control->T );
                 data->therm.v_xi = data->therm.G_xi * control->dt;
                 data->therm.v_xi_old = 0;
                 data->therm.xi = 0;
+
 #if defined(DEBUG_FOCUS)
                 fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
                         data->therm.G_xi, control->Tau_T, data->E_Kin, 
@@ -144,12 +148,13 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
             *Evolve = Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein;
             break;
 
-
         case NPT: // Anisotropic NPT
             fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
             exit( UNKNOWN_OPTION );
             data->N_f = 3 * system->N + 9;
-            if( !control->restart ) {
+
+            if( !control->restart )
+            {
                 data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
                         data->N_f * K_B * control->T );
                 data->therm.v_xi = data->therm.G_xi * control->dt;
@@ -160,7 +165,6 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
             *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
             break;
 
-
         case sNPT: // Semi-Isotropic NPT
             fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
             exit( UNKNOWN_OPTION );
@@ -168,7 +172,6 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
             *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
             break;
 
-
         case iNPT: // Isotropic NPT
             fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
             exit( UNKNOWN_OPTION );
diff --git a/PuReMD-GPU/src/cuda_integrate.cu b/PuReMD-GPU/src/cuda_integrate.cu
index cba0b79c39b4f9b66e5b506d11dcffb81adc488d..ab4d203139e4a8235d9035ab566fbdca7ded5c82 100644
--- a/PuReMD-GPU/src/cuda_integrate.cu
+++ b/PuReMD-GPU/src/cuda_integrate.cu
@@ -36,10 +36,10 @@
 #include "cuda_forces.h"
 #include "cuda_grid.h"
 #include "cuda_neighbors.h"
-#include "cuda_QEq.h"
+#include "cuda_qeq.h"
 #include "cuda_reset_utils.h"
 #include "cuda_system_props.h"
-#include "validation.h"
+#include "cuda_validation.h"
 
 
 GLOBAL void Cuda_Velocity_Verlet_NVE_atoms1 (reax_atom *atoms, 
diff --git a/PuReMD-GPU/src/cuda_neighbors.cu b/PuReMD-GPU/src/cuda_neighbors.cu
index 876b6b9913e4d825e0cc8be5a2fc1d092c56d9f8..5cfe03dea5f5c314a7dfb60ef3d4df2e6c8f61ae 100644
--- a/PuReMD-GPU/src/cuda_neighbors.cu
+++ b/PuReMD-GPU/src/cuda_neighbors.cu
@@ -265,11 +265,10 @@ GLOBAL void k_Generate_Neighbor_Lists ( reax_atom *sys_atoms,
             nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
             max = top [index_grid_3d(x, y, z, &g)];
 
-            for (m = 0; m < max; m++) {
+            for (m = 0; m < max; m++)
+            {
                 atom2 = nbr_atoms[m];
 
-                //nbr_data = & ( far_nbrs.select.far_nbr_list[atom1 * g.max_cuda_nbrs + num_far] );
-
                 //CHANGE ORIGINAL
                 /*
                    if (atom1 > atom2) {
diff --git a/PuReMD-GPU/src/cuda_QEq.cu b/PuReMD-GPU/src/cuda_qeq.cu
similarity index 99%
rename from PuReMD-GPU/src/cuda_QEq.cu
rename to PuReMD-GPU/src/cuda_qeq.cu
index 033945338aa76aa4c02909f90d2ca3eb1dacad58..21c2e334be0d6ec46d0a42f4c4f8a44dfc0ebffc 100644
--- a/PuReMD-GPU/src/cuda_QEq.cu
+++ b/PuReMD-GPU/src/cuda_qeq.cu
@@ -18,14 +18,15 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "cuda_QEq.h"
+#include "cuda_qeq.h"
 
-#include "QEq.h"
+#include "qeq.h"
 #include "allocate.h"
 #include "lin_alg.h"
 #include "list.h"
 #include "print_utils.h"
 #include "index_utils.h"
+#include "sort.h"
 #include "system_props.h"
 
 #include "cuda_copy.h"
@@ -33,9 +34,7 @@
 #include "cuda_utils.h"
 #include "cuda_lin_alg.h"
 #include "cuda_reduction.h"
-
-#include "sort.h"
-#include "validation.h"
+#include "cuda_validation.h"
 
 
 GLOBAL void Cuda_Sort_Matrix_Rows( sparse_matrix A )
diff --git a/PuReMD-GPU/src/cuda_QEq.h b/PuReMD-GPU/src/cuda_qeq.h
similarity index 100%
rename from PuReMD-GPU/src/cuda_QEq.h
rename to PuReMD-GPU/src/cuda_qeq.h
diff --git a/PuReMD-GPU/src/cuda_utils.cu b/PuReMD-GPU/src/cuda_utils.cu
index 1efcf28aa432f563749e49c68c67cc6b132e711e..6867857a4d58771f571fbb6810efbf926677f80f 100644
--- a/PuReMD-GPU/src/cuda_utils.cu
+++ b/PuReMD-GPU/src/cuda_utils.cu
@@ -29,7 +29,7 @@ cusparseMatDescr_t matdescriptor;
 
 void cuda_malloc( void **ptr, int size, int memset, int err_code )
 {
-    cudaError_t retVal = cudaSuccess;
+    cudaError_t retVal;
 
     //fprintf (stderr, "&ptr --. %ld \n", &ptr);
     //fprintf (stderr, "ptr --> %ld \n", ptr );
@@ -45,7 +45,8 @@ void cuda_malloc( void **ptr, int size, int memset, int err_code )
     //fprintf (stderr, "&ptr --. %ld \n", &ptr);
     //fprintf (stderr, "ptr --> %ld \n", ptr );
 
-    if ( memset ) {
+    if ( memset )
+    {
         retVal = cudaMemset( *ptr, 0, size );
         if ( retVal != cudaSuccess )
         {
@@ -59,8 +60,12 @@ void cuda_malloc( void **ptr, int size, int memset, int err_code )
 
 void cuda_free( void *ptr, int err_code )
 {
-    cudaError_t retVal = cudaSuccess;
-    if (!ptr) return;
+    cudaError_t retVal;
+
+    if (!ptr)
+    {
+        return;
+    }
 
     retVal = cudaFree( ptr );
 
@@ -75,9 +80,10 @@ void cuda_free( void *ptr, int err_code )
 
 void cuda_memset( void *ptr, int data, size_t count, int err_code )
 {
-    cudaError_t retVal = cudaSuccess;
+    cudaError_t retVal;
 
     retVal = cudaMemset( ptr, data, count );
+
     if (retVal != cudaSuccess) {
         fprintf( stderr, "ptr passed is %ld, value: %ld \n", ptr, &ptr );
         fprintf( stderr, " size to memset: %d \n", count );
@@ -91,7 +97,7 @@ void cuda_memset( void *ptr, int data, size_t count, int err_code )
 
 void copy_host_device( void *host, void *dev, int size, enum cudaMemcpyKind dir, int resid )
 {
-    cudaError_t retVal = cudaErrorNotReady;
+    cudaError_t retVal;
 
     if ( dir == cudaMemcpyHostToDevice )
     {
@@ -112,9 +118,10 @@ void copy_host_device( void *host, void *dev, int size, enum cudaMemcpyKind dir,
 
 void copy_device( void *dest, void *src, int size, int resid )
 {
-    cudaError_t retVal = cudaErrorNotReady;
+    cudaError_t retVal;
 
     retVal = cudaMemcpy( dest, src, size, cudaMemcpyDeviceToDevice );
+
     if ( retVal != cudaSuccess )
     {
         fprintf( stderr, "could not copy resource %d from host to device: reason %d \n",
@@ -134,6 +141,7 @@ void compute_blocks( int *blocks, int *block_size, int count )
 void compute_nearest_pow_2( int blocks, int *result )
 {
     int power = 1;
+
     while (power < blocks)
     {
         power *= 2;
@@ -146,7 +154,9 @@ void compute_nearest_pow_2( int blocks, int *result )
 void print_device_mem_usage( )
 {
     size_t total, free;
+
     cudaMemGetInfo( &free, &total );
+
     if ( cudaGetLastError() != cudaSuccess )
     {
         fprintf( stderr, "Error on the memory call \n" );
diff --git a/PuReMD-GPU/src/validation.cu b/PuReMD-GPU/src/cuda_validation.cu
similarity index 99%
rename from PuReMD-GPU/src/validation.cu
rename to PuReMD-GPU/src/cuda_validation.cu
index 21cd2145e689621ee0b3827889b106ed7c05af7f..b5348eba0871017a7225fc311e98d95531136792 100644
--- a/PuReMD-GPU/src/validation.cu
+++ b/PuReMD-GPU/src/cuda_validation.cu
@@ -18,13 +18,13 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "validation.h"
+#include "cuda_validation.h"
 
-#include "cuda_utils.h"
+#include "index_utils.h"
 #include "list.h"
-
 #include "sort.h"
-#include "index_utils.h"
+
+#include "cuda_utils.h"
 
 
 int check_zero (real p1, real p2)
diff --git a/PuReMD-GPU/src/validation.h b/PuReMD-GPU/src/cuda_validation.h
similarity index 100%
rename from PuReMD-GPU/src/validation.h
rename to PuReMD-GPU/src/cuda_validation.h
diff --git a/PuReMD-GPU/src/param.c b/PuReMD-GPU/src/ffield.c
similarity index 58%
rename from PuReMD-GPU/src/param.c
rename to PuReMD-GPU/src/ffield.c
index 42e9ef612ec6c81d593f74acd796a7f564f96896..a5377e6f2b75dc6c6bec586d36517127bca40d50 100644
--- a/PuReMD-GPU/src/param.c
+++ b/PuReMD-GPU/src/ffield.c
@@ -1,9 +1,10 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
@@ -18,85 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "param.h"
-#include "traj.h"
-#include "ctype.h"
-
-
-int Get_Atom_Type( reax_interaction *reaxprm, char *s )
-{
-    int i;
-
-    for ( i = 0; i < reaxprm->num_atom_types; ++i )
-        if ( !strcmp( reaxprm->sbp[i].name, s ) )
-            return i;
-
-    fprintf( stderr, "Unknown atom type %s. Terminating...\n", s );
-    exit( UNKNOWN_ATOM_TYPE_ERR );
-}
-
-
-int Tokenize(char* s, char*** tok)
-{
-    char test[MAX_LINE];
-    char *sep = "\t \n!=";
-    char *word;
-    int count = 0;
-
-    strncpy( test, s, MAX_LINE );
-
-    // fprintf( stderr, "|%s|\n", test );
-
-    for ( word = strtok(test, sep); word; word = strtok(NULL, sep) )
-    {
-        strncpy( (*tok)[count], word, MAX_LINE );
-        count++;
-    }
-
-    return count;
-}
-
-
-/* Initialize Taper params */
-void Init_Taper( control_params *control )
-{
-    real d1, d7;
-    real swa, swa2, swa3;
-    real swb, swb2, swb3;
-
-    swa = control->r_low;
-    swb = control->r_cut;
-
-    if ( fabs( swa ) > 0.01 )
-        fprintf( stderr, "Warning: non-zero value for lower Taper-radius cutoff\n" );
-
-    if ( swb < 0 )
-    {
-        fprintf( stderr, "Negative value for upper Taper-radius cutoff\n" );
-        exit( INVALID_INPUT );
-    }
-    else if ( swb < 5 )
-        fprintf( stderr, "Warning: low value for upper Taper-radius cutoff:%f\n",
-                 swb );
-
-    d1 = swb - swa;
-    d7 = POW( d1, 7.0 );
-    swa2 = SQR( swa );
-    swa3 = CUBE( swa );
-    swb2 = SQR( swb );
-    swb3 = CUBE( swb );
-
-    control->Tap7 =  20.0 / d7;
-    control->Tap6 = -70.0 * (swa + swb) / d7;
-    control->Tap5 =  84.0 * (swa2 + 3.0 * swa * swb + swb2) / d7;
-    control->Tap4 = -35.0 * (swa3 + 9.0 * swa2 * swb + 9.0 * swa * swb2 + swb3 ) / d7;
-    control->Tap3 = 140.0 * (swa3 * swb + 3.0 * swa2 * swb2 + swa * swb3 ) / d7;
-    control->Tap2 = -210.0 * (swa3 * swb2 + swa2 * swb3) / d7;
-    control->Tap1 = 140.0 * swa3 * swb3 / d7;
-    control->Tap0 = (-35.0 * swa3 * swb2 * swb2 + 21.0 * swa2 * swb3 * swb2 +
-                     7.0 * swa * swb3 * swb3 + swb3 * swb3 * swb ) / d7;
-}
+#include <ctype.h>
 
+#include "ffield.h"
+#include "tool_box.h"
 
 
 char Read_Force_Field( FILE* fp, reax_interaction* reax )
@@ -106,20 +32,20 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
     char *tor_flag;
     int c, i, j, k, l, m, n, o, p, cnt;
     real val;
-
     int __N;
     int index1, index2;
 
     s = (char*) malloc(sizeof(char) * MAX_LINE);
     tmp = (char**) malloc(sizeof(char*)*MAX_TOKENS);
     for (i = 0; i < MAX_TOKENS; i++)
+    {
         tmp[i] = (char*) malloc(sizeof(char) * MAX_TOKEN_LEN);
+    }
 
 
     /* reading first header comment */
     fgets( s, MAX_LINE, fp );
 
-
     /* line 2 is number of global parameters */
     fgets( s, MAX_LINE, fp );
     c = Tokenize( s, &tmp );
@@ -129,7 +55,7 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
     if (n < 1)
     {
         fprintf( stderr, "WARNING: number of globals in ffield file is 0!\n" );
-        return 1;
+        exit( INVALID_INPUT );
     }
 
     reax->gp.n_global = n;
@@ -146,20 +72,17 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         reax->gp.l[i] = val;
     }
 
-
     /* next line is number of atom types and some comments */
     fgets( s, MAX_LINE, fp );
     c = Tokenize( s, &tmp );
     reax->num_atom_types = atoi(tmp[0]);
     __N = reax->num_atom_types;
 
-
     /* 3 lines of comments */
     fgets(s, MAX_LINE, fp);
     fgets(s, MAX_LINE, fp);
     fgets(s, MAX_LINE, fp);
 
-
     /* Allocating structures in reax_interaction */
     reax->sbp = (single_body_parameters*)
                 calloc( reax->num_atom_types, sizeof(single_body_parameters) );
@@ -194,7 +117,9 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         c = Tokenize( s, &tmp );
 
         for ( j = 0; j < strlen( tmp[0] ); ++j )
+        {
             reax->sbp[i].name[j] = toupper( tmp[0][j] );
+        }
 
         val = atof(tmp[1]);
         reax->sbp[i].r_s        = val;
@@ -281,6 +206,7 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
             if ( reax->sbp[i].gamma_w > 0.5 ) // Shielding vdWaals
             {
                 if ( reax->gp.vdw_type != 0 && reax->gp.vdw_type != 3 )
+                {
                     fprintf( stderr, "Warning: inconsistent vdWaals-parameters\n" \
                              "Force field parameters for element %s\n"        \
                              "indicate inner wall+shielding, but earlier\n"   \
@@ -288,9 +214,11 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
                              "This may cause division-by-zero errors.\n"      \
                              "Keeping vdWaals-setting for earlier atoms.\n",
                              reax->sbp[i].name );
+                }
                 else
                 {
                     reax->gp.vdw_type = 3;
+
 #if defined(DEBUG)
                     fprintf( stderr, "vdWaals type for element %s: Shielding+inner-wall",
                              reax->sbp[i].name );
@@ -300,6 +228,7 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
             else    // No shielding vdWaals parameters present
             {
                 if ( reax->gp.vdw_type != 0 && reax->gp.vdw_type != 2 )
+                {
                     fprintf( stderr, "Warning: inconsistent vdWaals-parameters\n" \
                              "Force field parameters for element %s\n"        \
                              "indicate inner wall without shielding, but earlier\n" \
@@ -307,9 +236,11 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
                              "This may cause division-by-zero errors.\n"      \
                              "Keeping vdWaals-setting for earlier atoms.\n",
                              reax->sbp[i].name );
+                }
                 else
                 {
                     reax->gp.vdw_type = 2;
+
 #if defined(DEBUG)
                     fprintf( stderr, "vdWaals type for element%s: No Shielding,inner-wall",
                              reax->sbp[i].name );
@@ -348,7 +279,6 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         }
     }
 
-
     /* next line is number of two body combination and some comments */
     fgets(s, MAX_LINE, fp);
     c = Tokenize(s, &tmp);
@@ -430,6 +360,7 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
     /* calculating combination rules and filling up remaining fields. */
 
     for (i = 0; i < reax->num_atom_types; i++)
+    {
         for (j = i; j < reax->num_atom_types; j++)
         {
             index1 = i * __N + j;
@@ -450,7 +381,6 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
             reax->tbp[index2].r_pp = 0.5 *
                                      (reax->sbp[j].r_pi_pi + reax->sbp[i].r_pi_pi);
 
-
             reax->tbp[index1].p_boc3 =
                 sqrt(reax->sbp[i].b_o_132 *
                      reax->sbp[j].b_o_132);
@@ -472,7 +402,6 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
                 sqrt(reax->sbp[j].b_o_133 *
                      reax->sbp[i].b_o_133);
 
-
             reax->tbp[index1].D =
                 sqrt(reax->sbp[i].epsilon *
                      reax->sbp[j].epsilon);
@@ -505,9 +434,8 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
             reax->tbp[index2].gamma =
                 POW(reax->sbp[j].gamma *
                     reax->sbp[i].gamma, -1.5);
-
         }
-
+    }
 
     /* next line is number of 2-body offdiagonal combinations and some comments */
     /* these are two body offdiagonal terms that are different from the
@@ -572,7 +500,6 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         }
     }
 
-
     /* 3-body parameters -
        supports multi-well potentials (upto MAX_3BODY_PARAM in mytypes.h) */
     /* clear entries first */
@@ -636,7 +563,6 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         }
     }
 
-
     /* 4-body parameters are entered in compact form. i.e. 0-X-Y-0
        correspond to any type of pair of atoms in 1 and 4
        position. However, explicit X-Y-Z-W takes precedence over the
@@ -647,13 +573,19 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
 
     /* clear all entries first */
     for ( i = 0; i < reax->num_atom_types; ++i )
+    {
         for ( j = 0; j < reax->num_atom_types; ++j )
+        {
             for ( k = 0; k < reax->num_atom_types; ++k )
+            {
                 for ( m = 0; m < reax->num_atom_types; ++m )
                 {
                     reax->fbp[i * __N * __N * __N + j * __N * __N + k * __N + m].cnt = 0;
                     tor_flag[i * __N * __N * __N + j * __N * __N + k * __N + m] = 0;
                 }
+            }
+        }
+    }
 
     /* next line is number of 4-body params and some comments */
     fgets( s, MAX_LINE, fp );
@@ -714,7 +646,9 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         else /* This means the entry is of the form 0-X-Y-0 */
         {
             if ( k < reax->num_atom_types && m < reax->num_atom_types )
+            {
                 for ( p = 0; p < reax->num_atom_types; p++ )
+                {
                     for ( o = 0; o < reax->num_atom_types; o++ )
                     {
                         index1 = p * __N * __N * __N + k * __N * __N + m * __N + o;
@@ -743,11 +677,12 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
                             reax->fbp[index2].prm[0].p_cot1 = atof(tmp[8]);
                         }
                     }
+                }
+            }
         }
     }
 
 
-
     /* next line is number of hydrogen bond params and some comments */
     fgets( s, MAX_LINE, fp );
     c = Tokenize( s, &tmp );
@@ -781,14 +716,14 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         }
     }
 
-
     /* deallocate helper storage */
     for ( i = 0; i < MAX_TOKENS; i++ )
+    {
         free( tmp[i] );
+    }
     free( tmp );
     free( s );
 
-
     /* deallocate tor_flag */
     free( tor_flag );
 
@@ -796,497 +731,5 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
     fprintf( stderr, "force field read\n" );
 #endif
 
-    return 0;
-}
-
-
-char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
-                        output_controls *out_control )
-{
-    char *s, **tmp;
-    int c, i;
-    real val;
-    int ival;
-
-    /* assign default values */
-    strcpy( control->sim_name, "default.sim" );
-
-    control->restart = 0;
-    out_control->restart_format = 1;
-    out_control->restart_freq = 0;
-    strcpy( control->restart_from, "default.res" );
-    out_control->restart_freq = 0;
-    control->random_vel = 0;
-
-    control->reposition_atoms = 0;
-
-    control->ensemble = 0;
-    control->nsteps = 0;
-    control->dt = 0.25;
-
-    control->geo_format = 1;
-    control->restrict_bonds = 0;
-
-    control->periodic_boundaries = 1;
-    control->periodic_images[0] = 0;
-    control->periodic_images[1] = 0;
-    control->periodic_images[2] = 0;
-
-    control->reneighbor = 1;
-    control->vlist_cut = 0;
-    control->nbr_cut = 4.;
-    control->r_cut = 10;
-    control->max_far_nbrs = 1000;
-    control->bo_cut = 0.01;
-    control->thb_cut = 0.001;
-    control->hb_cut = 7.50;
-
-    control->q_err = 0.000001;
-    control->tabulate = 0;
-    //TODO
-    control->refactor = 100;
-    //TODO -- change this to 5.
-
-    control->droptol = 0.01;
-
-    control->T_init = 0.;
-    control->T_final = 300.;
-    control->Tau_T = 1.0;
-    control->T_mode = 0.;
-    control->T_rate = 1.;
-    control->T_freq = 1.;
-
-    control->P[0] = 0.000101325;
-    control->P[1] = 0.000101325;
-    control->P[2] = 0.000101325;
-    control->Tau_P[0]  = 500.0;
-    control->Tau_P[1]  = 500.0;
-    control->Tau_P[2]  = 500.0;
-    control->Tau_PT = 500.0;
-    control->compressibility = 1.0;
-    control->press_mode = 0;
-
-    control->remove_CoM_vel = 25;
-
-    out_control->debug_level = 0;
-    out_control->energy_update_freq = 10;
-
-    out_control->write_steps = 100;
-    out_control->traj_compress = 0;
-    out_control->write = fprintf;
-    out_control->traj_format = 0;
-    out_control->write_header =
-        (int (*)( reax_system*, control_params*,
-                  static_storage*, void* )) Write_Custom_Header;
-    out_control->append_traj_frame =
-        (int (*)( reax_system*, control_params*, simulation_data*,
-                  static_storage*, list **, void* )) Append_Custom_Frame;
-
-    strcpy( out_control->traj_title, "default_title" );
-    out_control->atom_format = 0;
-    out_control->bond_info = 0;
-    out_control->angle_info = 0;
-
-    control->molec_anal = 0;
-    control->freq_molec_anal = 0;
-    control->bg_cut = 0.3;
-    control->num_ignored = 0;
-    memset( control->ignore, 0, sizeof(int)*MAX_ATOM_TYPES );
-
-    control->dipole_anal = 0;
-    control->freq_dipole_anal = 0;
-
-    control->diffusion_coef = 0;
-    control->freq_diffusion_coef = 0;
-    control->restrict_type = 0;
-
-    /* memory allocations */
-    s = (char*) malloc(sizeof(char) * MAX_LINE);
-    tmp = (char**) malloc(sizeof(char*)*MAX_TOKENS);
-    for (i = 0; i < MAX_TOKENS; i++)
-        tmp[i] = (char*) malloc(sizeof(char) * MAX_LINE);
-
-    /* read control parameters file */
-    while (!feof(fp))
-    {
-        fgets(s, MAX_LINE, fp);
-        c = Tokenize(s, &tmp);
-
-        if ( strcmp(tmp[0], "simulation_name") == 0 )
-        {
-            strcpy( control->sim_name, tmp[1] );
-        }
-        //else if( strcmp(tmp[0], "restart") == 0 ) {
-        //  ival = atoi(tmp[1]);
-        //  control->restart = ival;
-        //}
-        else if ( strcmp(tmp[0], "restart_format") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->restart_format = ival;
-        }
-        else if ( strcmp(tmp[0], "restart_freq") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->restart_freq = ival;
-        }
-        else if ( strcmp(tmp[0], "random_vel") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->random_vel = ival;
-        }
-        else if ( strcmp(tmp[0], "reposition_atoms") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->reposition_atoms = ival;
-        }
-        else if ( strcmp(tmp[0], "ensemble_type") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->ensemble = ival;
-        }
-        else if ( strcmp(tmp[0], "nsteps") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->nsteps = ival;
-        }
-        else if ( strcmp(tmp[0], "dt") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->dt = val * 1.e-3;  // convert dt from fs to ps!
-        }
-        else if ( strcmp(tmp[0], "periodic_boundaries") == 0 )
-        {
-            ival = atoi( tmp[1] );
-            control->periodic_boundaries = ival;
-        }
-        else if ( strcmp(tmp[0], "periodic_images") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->periodic_images[0] = ival;
-            ival = atoi(tmp[2]);
-            control->periodic_images[1] = ival;
-            ival = atoi(tmp[3]);
-            control->periodic_images[2] = ival;
-        }
-        else if ( strcmp(tmp[0], "geo_format") == 0 )
-        {
-            ival = atoi( tmp[1] );
-            control->geo_format = ival;
-        }
-        else if ( strcmp(tmp[0], "restrict_bonds") == 0 )
-        {
-            ival = atoi( tmp[1] );
-            control->restrict_bonds = ival;
-        }
-        else if ( strcmp(tmp[0], "tabulate_long_range") == 0 )
-        {
-            ival = atoi( tmp[1] );
-            control->tabulate = ival;
-        }
-        else if ( strcmp(tmp[0], "reneighbor") == 0 )
-        {
-            ival = atoi( tmp[1] );
-            control->reneighbor = ival;
-        }
-        else if ( strcmp(tmp[0], "vlist_buffer") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->vlist_cut = val;
-        }
-        else if ( strcmp(tmp[0], "nbrhood_cutoff") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->nbr_cut = val;
-        }
-        else if ( strcmp(tmp[0], "thb_cutoff") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->thb_cut = val;
-        }
-        else if ( strcmp(tmp[0], "hbond_cutoff") == 0 )
-        {
-            val = atof( tmp[1] );
-            control->hb_cut = val;
-        }
-        else if ( strcmp(tmp[0], "q_err") == 0 )
-        {
-            val = atof( tmp[1] );
-            control->q_err = val;
-        }
-        else if ( strcmp(tmp[0], "ilu_refactor") == 0 )
-        {
-            ival = atoi( tmp[1] );
-            control->refactor = ival;
-        }
-        else if ( strcmp(tmp[0], "ilu_droptol") == 0 )
-        {
-            val = atof( tmp[1] );
-            control->droptol = val;
-        }
-        else if ( strcmp(tmp[0], "temp_init") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->T_init = val;
-
-            if ( control->T_init < 0.001 )
-                control->T_init = 0.001;
-        }
-        else if ( strcmp(tmp[0], "temp_final") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->T_final = val;
-
-            if ( control->T_final < 0.1 )
-                control->T_final = 0.1;
-        }
-        else if ( strcmp(tmp[0], "t_mass") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->Tau_T = val * 1.e-3;    // convert t_mass from fs to ps
-        }
-        else if ( strcmp(tmp[0], "t_mode") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->T_mode = ival;
-        }
-        else if ( strcmp(tmp[0], "t_rate") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->T_rate = val;
-        }
-        else if ( strcmp(tmp[0], "t_freq") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->T_freq = val;
-        }
-        else if ( strcmp(tmp[0], "pressure") == 0 )
-        {
-            if ( control->ensemble == iNPT )
-            {
-                val = atof(tmp[1]);
-                control->P[0] = control->P[1] = control->P[2] = val;
-            }
-            else if ( control->ensemble == sNPT )
-            {
-                val = atof(tmp[1]);
-                control->P[0] = val;
-
-                val = atof(tmp[2]);
-                control->P[1] = val;
-
-                val = atof(tmp[3]);
-                control->P[2] = val;
-            }
-        }
-        else if ( strcmp(tmp[0], "p_mass") == 0 )
-        {
-            if ( control->ensemble == iNPT )
-            {
-                val = atof(tmp[1]);
-                control->Tau_P[0] = val * 1.e-3;   // convert p_mass from fs to ps
-            }
-            else if ( control->ensemble == sNPT )
-            {
-                val = atof(tmp[1]);
-                control->Tau_P[0] = val * 1.e-3;   // convert p_mass from fs to ps
-
-                val = atof(tmp[2]);
-                control->Tau_P[1] = val * 1.e-3;   // convert p_mass from fs to ps
-
-                val = atof(tmp[3]);
-                control->Tau_P[2] = val * 1.e-3;   // convert p_mass from fs to ps
-            }
-        }
-        else if ( strcmp(tmp[0], "pt_mass") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->Tau_PT = val * 1.e-3;  // convert pt_mass from fs to ps
-        }
-        else if ( strcmp(tmp[0], "compress") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->compressibility = val;
-        }
-        else if ( strcmp(tmp[0], "press_mode") == 0 )
-        {
-            val = atoi(tmp[1]);
-            control->press_mode = val;
-        }
-        else if ( strcmp(tmp[0], "remove_CoM_vel") == 0 )
-        {
-            val = atoi(tmp[1]);
-            control->remove_CoM_vel = val;
-        }
-        else if ( strcmp(tmp[0], "debug_level") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->debug_level = ival;
-        }
-        else if ( strcmp(tmp[0], "energy_update_freq") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->energy_update_freq = ival;
-        }
-        else if ( strcmp(tmp[0], "write_freq") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->write_steps = ival;
-        }
-        else if ( strcmp(tmp[0], "traj_compress") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->traj_compress = ival;
-
-            if ( out_control->traj_compress )
-                out_control->write = (int (*)(FILE *, const char *, ...)) gzprintf;
-            else out_control->write = fprintf;
-        }
-        else if ( strcmp(tmp[0], "traj_format") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->traj_format = ival;
-
-            if ( out_control->traj_format == 0 )
-            {
-                out_control->write_header =
-                    (int (*)( reax_system*, control_params*,
-                              static_storage*, void* )) Write_Custom_Header;
-                out_control->append_traj_frame =
-                    (int (*)(reax_system*, control_params*, simulation_data*,
-                             static_storage*, list **, void*)) Append_Custom_Frame;
-            }
-            else if ( out_control->traj_format == 1 )
-            {
-                out_control->write_header =
-                    (int (*)( reax_system*, control_params*,
-                              static_storage*, void* )) Write_xyz_Header;
-                out_control->append_traj_frame =
-                    (int (*)( reax_system*,  control_params*, simulation_data*,
-                              static_storage*, list **, void* )) Append_xyz_Frame;
-            }
-        }
-        else if ( strcmp(tmp[0], "traj_title") == 0 )
-        {
-            strcpy( out_control->traj_title, tmp[1] );
-        }
-        else if ( strcmp(tmp[0], "atom_info") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->atom_format += ival * 4;
-        }
-        else if ( strcmp(tmp[0], "atom_velocities") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->atom_format += ival * 2;
-        }
-        else if ( strcmp(tmp[0], "atom_forces") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->atom_format += ival * 1;
-        }
-        else if ( strcmp(tmp[0], "bond_info") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->bond_info = ival;
-        }
-        else if ( strcmp(tmp[0], "angle_info") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            out_control->angle_info = ival;
-        }
-        else if ( strcmp(tmp[0], "test_forces") == 0 )
-        {
-            ival = atoi(tmp[1]);
-        }
-        else if ( strcmp(tmp[0], "molec_anal") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->molec_anal = ival;
-        }
-        else if ( strcmp(tmp[0], "freq_molec_anal") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->freq_molec_anal = ival;
-        }
-        else if ( strcmp(tmp[0], "bond_graph_cutoff") == 0 )
-        {
-            val = atof(tmp[1]);
-            control->bg_cut = val;
-        }
-        else if ( strcmp(tmp[0], "ignore") == 0 )
-        {
-            control->num_ignored = atoi(tmp[1]);
-            for ( i = 0; i < control->num_ignored; ++i )
-                control->ignore[atoi(tmp[i + 2])] = 1;
-        }
-        else if ( strcmp(tmp[0], "dipole_anal") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->dipole_anal = ival;
-        }
-        else if ( strcmp(tmp[0], "freq_dipole_anal") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->freq_dipole_anal = ival;
-        }
-        else if ( strcmp(tmp[0], "diffusion_coef") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->diffusion_coef = ival;
-        }
-        else if ( strcmp(tmp[0], "freq_diffusion_coef") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->freq_diffusion_coef = ival;
-        }
-        else if ( strcmp(tmp[0], "restrict_type") == 0 )
-        {
-            ival = atoi(tmp[1]);
-            control->restrict_type = ival;
-        }
-        else
-        {
-            fprintf( stderr, "WARNING: unknown parameter %s\n", tmp[0] );
-            exit( 15 );
-        }
-    }
-
-
-    /* determine target T */
-    if ( control->T_mode == 0 )
-        control->T = control->T_final;
-    else control->T = control->T_init;
-
-
-    /* near neighbor and far neighbor cutoffs */
-    control->bo_cut = 0.01 * system->reaxprm.gp.l[29];
-    control->r_low  = system->reaxprm.gp.l[11];
-    control->r_cut  = system->reaxprm.gp.l[12];
-    control->vlist_cut += control->r_cut;
-
-    system->g.cell_size = control->vlist_cut / 2.;
-    for ( i = 0; i < 3; ++i )
-        system->g.spread[i] = 2;
-
-
-    /* Initialize Taper function */
-    Init_Taper( control );
-
-
-    /* free memory allocations at the top */
-    for ( i = 0; i < MAX_TOKENS; i++ )
-        free( tmp[i] );
-    free( tmp );
-    free( s );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr,
-             "en=%d steps=%d dt=%.5f opt=%d T=%.5f P=%.5f %.5f %.5f\n",
-             control->ensemble, control->nsteps, control->dt, control->tabulate,
-             control->T, control->P[0], control->P[1], control->P[2] );
-
-    fprintf(stderr, "control file read\n" );
-#endif
-    return 0;
+    return SUCCESS;
 }
diff --git a/PuReMD-GPU/src/ffield.h b/PuReMD-GPU/src/ffield.h
new file mode 100644
index 0000000000000000000000000000000000000000..4aaf32a644861b069e8cf87e2eec68aadf4d3c84
--- /dev/null
+++ b/PuReMD-GPU/src/ffield.h
@@ -0,0 +1,28 @@
+/*----------------------------------------------------------------------
+  SerialReax - Reax Force Field Simulator
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __FFIELD_H_
+#define __FFIELD_H_
+
+#include "mytypes.h"
+char Read_Force_Field( FILE*, reax_interaction* );
+
+#endif
diff --git a/PuReMD-GPU/src/forces.c b/PuReMD-GPU/src/forces.c
index c95d4896e32f60e954d79b0b623520afb042e9ea..478eaaf6cdce2c1c082b606fd661cf1b4c8afd4d 100644
--- a/PuReMD-GPU/src/forces.c
+++ b/PuReMD-GPU/src/forces.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -26,30 +27,31 @@
 #include "two_body_interactions.h"
 #include "three_body_interactions.h"
 #include "four_body_interactions.h"
+#include "index_utils.h"
 #include "list.h"
 #include "print_utils.h"
+#include "qeq.h"
 #include "system_props.h"
-#include "QEq.h"
+#include "tool_box.h"
 #include "vector.h"
-#include "index_utils.h"
 
 
-void Dummy_Interaction( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+void Dummy_Interaction( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
 }
 
 
 void Init_Bonded_Force_Functions( control_params *control )
-{ 
+{
     Interaction_Functions[0] = Calculate_Bond_Orders;
     Interaction_Functions[1] = Bond_Energy;  //*/Dummy_Interaction;
     Interaction_Functions[2] = LonePair_OverUnder_Coordination_Energy;
     //*/Dummy_Interaction;
     Interaction_Functions[3] = Three_Body_Interactions; //*/Dummy_Interaction;
     Interaction_Functions[4] = Four_Body_Interactions;  //*/Dummy_Interaction;
-    if( control->hb_cut > 0 )
+    if ( control->hb_cut > 0 )
         Interaction_Functions[5] = Hydrogen_Bonds; //*/Dummy_Interaction;
     else Interaction_Functions[5] = Dummy_Interaction;
     Interaction_Functions[6] = Dummy_Interaction; //empty
@@ -59,127 +61,123 @@ void Init_Bonded_Force_Functions( control_params *control )
 }
 
 
-void Compute_Bonded_Forces( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists, output_controls *out_control )
+void Compute_Bonded_Forces( reax_system *system, control_params *control,
+                            simulation_data *data, static_storage *workspace,
+                            list **lists, output_controls *out_control )
 {
 
     int i;
-    real t_start, t_elapsed;
+    // real t_start, t_end, t_elapsed;
 
 #ifdef TEST_ENERGY
     /* Mark beginning of a new timestep in each energy file */
-    fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "bo", "ebond", "total" );
-    fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n", 
-            data->step, "atom", "nlp", "elp", "total" );
-    fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n", 
-            data->step, "atom", "eov", "total" );
-    fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n", 
-            data->step, "atom", "eun", "total" );
-    fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", 
-            "angle", "bo(12)", "bo(23)", "eval", "epen", "total" );
-    fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", 
-            "angle", "bo(12)", "bo(23)", "epen", "total" );
-    fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", 
-            "angle", "bo(12)", "bo(23)", "ecoa", "total" );
-    fprintf( out_control->ehb,  "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", 
-            "r(23)", "angle", "bo(12)", "ehb", "total" );
-    fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", "atom4", 
-            "phi", "bo(23)", "etor", "total" );
+    fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n",
+             data->step, "atom1", "atom2", "bo", "ebond", "total" );
+    fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n",
+             data->step, "atom", "nlp", "elp", "total" );
+    fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n",
+             data->step, "atom", "eov", "total" );
+    fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n",
+             data->step, "atom", "eun", "total" );
+    fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n",
+             data->step, "atom1", "atom2", "atom3",
+             "angle", "bo(12)", "bo(23)", "eval", "epen", "total" );
+    fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n",
+             data->step, "atom1", "atom2", "atom3",
+             "angle", "bo(12)", "bo(23)", "epen", "total" );
+    fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n",
+             data->step, "atom1", "atom2", "atom3",
+             "angle", "bo(12)", "bo(23)", "ecoa", "total" );
+    fprintf( out_control->ehb,  "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n",
+             data->step, "atom1", "atom2", "atom3",
+             "r(23)", "angle", "bo(12)", "ehb", "total" );
+    fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n",
+             data->step, "atom1", "atom2", "atom3", "atom4",
+             "phi", "bo(23)", "etor", "total" );
     fprintf( out_control->econ, "step:%d\n%6s%6s%6s%6s%12s%12s%12s%12s%12s%12s\n",
-            data->step, "atom1", "atom2", "atom3", "atom4", 
-            "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" );
-#endif 
-
-    /* Implement all the function calls as function pointers */
-    for( i = 0; i < NO_OF_INTERACTIONS; i++ ) {
-        //for( i = 0; i < 5; i++ ) {
-        t_start = Get_Time ();
-        (Interaction_Functions[i])(system, control, data, workspace, 
-                lists, out_control);
-        t_elapsed = Get_Timing_Info ( t_start );
-
-#ifdef __DEBUG_CUDA__
-        fprintf( stderr, "function %d tme %lf - \n", i, t_elapsed );
+             data->step, "atom1", "atom2", "atom3", "atom4",
+             "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" );
 #endif
 
+    /* Implement all the function calls as function pointers */
+    for ( i = 0; i < NO_OF_INTERACTIONS; i++ )
+    {
+        (Interaction_Functions[i])(system, control, data, workspace,
+                                   lists, out_control);
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "f%d-", i );
 #endif
 #ifdef TEST_FORCES
-        (Print_Interactions[i])(system, control, data, workspace, 
-                lists, out_control);
+        (Print_Interactions[i])(system, control, data, workspace,
+                                lists, out_control);
 #endif
     }
 }
 
 
-void Compute_NonBonded_Forces( reax_system *system, control_params *control, 
-        simulation_data *data,static_storage *workspace,
-        list** lists, output_controls *out_control )
+void Compute_NonBonded_Forces( reax_system *system, control_params *control,
+                               simulation_data *data, static_storage *workspace,
+                               list** lists, output_controls *out_control )
 {
     real t_start, t_elapsed;
 #ifdef TEST_ENERGY
     fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n",
-            data->step, "atom1", "atom2", "r12", "evdw", "total" );
+             data->step, "atom1", "atom2", "r12", "evdw", "total" );
     fprintf( out_control->ecou, "step: %d\n%6s%6s%12s%12s%12s%12s%12s\n",
-            data->step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" );
+             data->step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" );
 #endif
 
     t_start = Get_Time( );
     QEq( system, control, data, workspace, lists[FAR_NBRS], out_control );
     t_elapsed = Get_Timing_Info( t_start );
     data->timing.QEq += t_elapsed;
-
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "qeq - " );
 #endif
 
     if ( control->tabulate == 0)
+    {
         vdW_Coulomb_Energy( system, control, data, workspace, lists, out_control );
+    }
     else
-        Tabulated_vdW_Coulomb_Energy( system, control, data, workspace, 
-                lists, out_control );
-
+    {
+        Tabulated_vdW_Coulomb_Energy( system, control, data, workspace,
+                                      lists, out_control );
+    }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "nonb forces - " );
 #endif
 
 #ifdef TEST_FORCES
-    Print_vdW_Coulomb_Forces( system, control, data, workspace, 
-            lists, out_control );
+    Print_vdW_Coulomb_Forces( system, control, data, workspace,
+                              lists, out_control );
 #endif
 }
 
 
-/* This version of Compute_Total_Force computes forces from coefficients 
+/* This version of Compute_Total_Force computes forces from coefficients
    accumulated by all interaction functions. Saves enormous time & space! */
-void Compute_Total_Force( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists )
+void Compute_Total_Force( reax_system *system, control_params *control,
+                          simulation_data *data, static_storage *workspace,
+                          list **lists )
 {
     int i, pj;
     list *bonds = (*lists) + BONDS;
 
-    for( i = 0; i < system->N; ++i )
-        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
-            if( i < bonds->select.bond_list[pj].nbr ) {
-                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT)
+    for ( i = 0; i < system->N; ++i )
+        for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+            if ( i < bonds->select.bond_list[pj].nbr )
+            {
+                if ( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT)
                     Add_dBond_to_Forces( i, pj, system, data, workspace, lists );
-                else 
+                else
                     Add_dBond_to_Forces_NPT( i, pj, system, data, workspace, lists );
             }
 }
 
 
 void Validate_Lists( static_storage *workspace, list **lists, int step, int n,
-        int Hmax, int Htop, int num_bonds, int num_hbonds )
+                     int Hmax, int Htop, int num_bonds, int num_hbonds )
 {
     int i, flag;
     list *bonds, *hbonds;
@@ -188,92 +186,104 @@ void Validate_Lists( static_storage *workspace, list **lists, int step, int n,
     hbonds = *lists + HBONDS;
 
     /* far neighbors */
-    if( Htop > Hmax * DANGER_ZONE ) {
+    if ( Htop > Hmax * DANGER_ZONE )
+    {
         workspace->realloc.Htop = Htop;
-        if( Htop > Hmax ) {
-            fprintf( stderr, 
-                    "step%d - ran out of space on H matrix: Htop=%d, max = %d",
-                    step, Htop, Hmax );
-            exit(INSUFFICIENT_SPACE);
+        if ( Htop > Hmax )
+        {
+            fprintf( stderr, /* newline-terminated like the other diagnostics here */
+                     "step%d - ran out of space on H matrix: Htop=%d, max = %d\n",
+                     step, Htop, Hmax );
+            exit( INSUFFICIENT_MEMORY );
         }
     }
 
     /* bond list */
     flag = -1;
     workspace->realloc.num_bonds = num_bonds;
-    for( i = 0; i < n-1; ++i )
-        if( End_Index(i, bonds) >= Start_Index(i+1, bonds)-2 ) {
+    for ( i = 0; i < n - 1; ++i )
+        if ( End_Index(i, bonds) >= Start_Index(i + 1, bonds) - 2 )
+        {
             workspace->realloc.bonds = 1;
-            if( End_Index(i, bonds) > Start_Index(i+1, bonds) )
+            if ( End_Index(i, bonds) > Start_Index(i + 1, bonds) )
                 flag = i;
         }
 
-    if( flag > -1 ) {
+    if ( flag > -1 )
+    {
         fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
-                step, flag, End_Index(flag,bonds), Start_Index(flag+1,bonds) );
-        exit(INSUFFICIENT_SPACE);
-    }    
+                 step, flag, End_Index(flag, bonds), Start_Index(flag + 1, bonds) );
+        exit( INSUFFICIENT_MEMORY );
+    }
 
-    if( End_Index(i, bonds) >= bonds->num_intrs-2 ) {
+    if ( End_Index(i, bonds) >= bonds->num_intrs - 2 )
+    {
         workspace->realloc.bonds = 1;
 
-        if( End_Index(i, bonds) > bonds->num_intrs ) {
+        if ( End_Index(i, bonds) > bonds->num_intrs )
+        {
             fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n",
-                    step, flag, End_Index(i,bonds), bonds->num_intrs );
-            exit(INSUFFICIENT_SPACE);
+                     step, i, End_Index(i, bonds), bonds->num_intrs ); /* flag is -1 here; report the checked index i */
+            exit( INSUFFICIENT_MEMORY );
         }
     }
 
 
     /* hbonds list */
-    if( workspace->num_H > 0 ) {
+    if ( workspace->num_H > 0 )
+    {
         flag = -1;
         workspace->realloc.num_hbonds = num_hbonds;
-        for( i = 0; i < workspace->num_H-1; ++i )
-            if( Num_Entries(i, hbonds) >= 
-                    (Start_Index(i+1, hbonds) - Start_Index(i, hbonds)) * DANGER_ZONE ) {
+        for ( i = 0; i < workspace->num_H - 1; ++i )
+            if ( Num_Entries(i, hbonds) >=
+                    (Start_Index(i + 1, hbonds) - Start_Index(i, hbonds)) * DANGER_ZONE )
+            {
                 workspace->realloc.hbonds = 1;
-                if( End_Index(i, hbonds) > Start_Index(i+1, hbonds) )
+                if ( End_Index(i, hbonds) > Start_Index(i + 1, hbonds) )
                     flag = i;
             }
 
-        if( flag > -1 ) {
+        if ( flag > -1 )
+        {
             fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
-                    step, flag, End_Index(flag,hbonds), Start_Index(flag+1,hbonds) );
-            exit(INSUFFICIENT_SPACE);
+                     step, flag, End_Index(flag, hbonds), Start_Index(flag + 1, hbonds) );
+            exit( INSUFFICIENT_MEMORY );
         }
 
-        if( Num_Entries(i,hbonds) >= 
-                (hbonds->num_intrs - Start_Index(i,hbonds)) * DANGER_ZONE ) {
+        if ( Num_Entries(i, hbonds) >=
+                (hbonds->num_intrs - Start_Index(i, hbonds)) * DANGER_ZONE )
+        {
             workspace->realloc.hbonds = 1;
 
-            if( End_Index(i, hbonds) > hbonds->num_intrs ) {
+            if ( End_Index(i, hbonds) > hbonds->num_intrs )
+            {
                 fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n",
-                        step, flag, End_Index(i,hbonds), hbonds->num_intrs );
-                exit(INSUFFICIENT_SPACE);
+                         step, i, End_Index(i, hbonds), hbonds->num_intrs ); /* flag is -1 here; report the checked index i */
+                exit( INSUFFICIENT_MEMORY );
             }
         }
     }
 }
 
 
-void Init_Forces( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists, output_controls *out_control ) {
+void Init_Forces( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
+{
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
-    int Htop, btop_i, btop_j, num_bonds, num_hbonds;
+    int Htop, H_sp_top, btop_i, btop_j, num_bonds, num_hbonds;
     int ihb, jhb, ihb_top, jhb_top;
-    int flag;
+    int flag, flag_sp;
     real r_ij, r2, self_coef;
     real dr3gamij_1, dr3gamij_3, Tap;
     //real val, dif, base;
     real C12, C34, C56;
     real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
     real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2;   
-    sparse_matrix *H;
+    real p_boc1, p_boc2;
+    sparse_matrix *H, *H_sp;
     list *far_nbrs, *bonds, *hbonds;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
@@ -287,44 +297,69 @@ void Init_Forces( reax_system *system, control_params *control,
     bonds = *lists + BONDS;
     hbonds = *lists + HBONDS;
 
-    H = &workspace->H;
+    H = workspace->H;
+    H_sp = workspace->H_sp;
     Htop = 0;
+    H_sp_top = 0;
     num_bonds = 0;
     num_hbonds = 0;
     btop_i = btop_j = 0;
     p_boc1 = system->reaxprm.gp.l[0];
     p_boc2 = system->reaxprm.gp.l[1];
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         atom_i = &(system->atoms[i]);
         type_i  = atom_i->type;
-        start_i = Start_Index(i, far_nbrs);
-        end_i   = End_Index(i, far_nbrs);
+        start_i = Start_Index( i, far_nbrs );
+        end_i   = End_Index( i, far_nbrs );
         H->start[i] = Htop;
+        H_sp->start[i] = H_sp_top;
         btop_i = End_Index( i, bonds );
         sbp_i = &(system->reaxprm.sbp[type_i]);
         ihb = ihb_top = -1;
-        if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 )
+        if ( control->hb_cut > 0 && (ihb = sbp_i->p_hbond) == 1 )
+        {
             ihb_top = End_Index( workspace->hbond_index[i], hbonds );
+        }
 
-        for( pj = start_i; pj < end_i; ++pj ) {
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
             nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
             j = nbr_pj->nbr;
             atom_j = &(system->atoms[j]);
 
             flag = 0;
-            if((data->step-data->prev_steps) % control->reneighbor == 0) { 
-                if( nbr_pj->d <= control->r_cut)
+            flag_sp = 0;
+            if ((data->step - data->prev_steps) % control->reneighbor == 0)
+            {
+                if ( nbr_pj->d <= control->r_cut )
+                {
                     flag = 1;
-                else flag = 0;
+                    if ( nbr_pj->d <= control->r_sp_cut )
+                    {
+                        flag_sp = 1;
+                    }
+                }
+                else
+                {
+                    flag = 0;
+                    flag_sp = 0;
+                }
             }
-            else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box),
-                            nbr_pj->dvec))<=SQR(control->r_cut)){
-                nbr_pj->d = sqrt(nbr_pj->d);
+            else if ((nbr_pj->d = Sq_Distance_on_T3(atom_i->x, atom_j->x, &(system->box),
+                                                    nbr_pj->dvec)) <= SQR(control->r_cut))
+            {
+                if ( nbr_pj->d <= SQR(control->r_sp_cut))
+                {
+                    flag_sp = 1;
+                }
+                nbr_pj->d = SQRT( nbr_pj->d );
                 flag = 1;
             }
 
-            if( flag ){    
+            if ( flag )
+            {
                 type_j = system->atoms[j].type;
                 r_ij = nbr_pj->d;
                 sbp_j = &(system->reaxprm.sbp[type_j]);
@@ -338,63 +373,79 @@ void Init_Forces( reax_system *system, control_params *control,
                 Tap = Tap * r_ij + control->Tap3;
                 Tap = Tap * r_ij + control->Tap2;
                 Tap = Tap * r_ij + control->Tap1;
-                Tap = Tap * r_ij + control->Tap0;          
+                Tap = Tap * r_ij + control->Tap0;
 
                 dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
                 dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
 
-                H->entries[Htop].j = j;
-                H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3;
+                H->j[Htop] = j;
+                H->val[Htop] = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3;
                 ++Htop;
 
-                /* hydrogen bond lists */ 
-                if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
-                        nbr_pj->d <= control->hb_cut ) {
+                /* H_sp matrix entry */
+                if ( flag_sp )
+                {
+                    H_sp->j[H_sp_top] = j;
+                    H_sp->val[H_sp_top] = H->val[Htop - 1];
+                    ++H_sp_top;
+                }
+
+                /* hydrogen bond lists */
+                if ( control->hb_cut > 0 && (ihb == 1 || ihb == 2) &&
+                        nbr_pj->d <= control->hb_cut )
+                {
                     // fprintf( stderr, "%d %d\n", atom1, atom2 );
                     jhb = sbp_j->p_hbond;
-                    if( ihb == 1 && jhb == 2 ) {
+                    if ( ihb == 1 && jhb == 2 )
+                    {
                         hbonds->select.hbond_list[ihb_top].nbr = j;
                         hbonds->select.hbond_list[ihb_top].scl = 1;
                         hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
                         ++ihb_top;
                         ++num_hbonds;
                     }
-                    else if( ihb == 2 && jhb == 1 ) {
+                    else if ( ihb == 2 && jhb == 1 )
+                    {
                         jhb_top = End_Index( workspace->hbond_index[j], hbonds );
                         hbonds->select.hbond_list[jhb_top].nbr = i;
                         hbonds->select.hbond_list[jhb_top].scl = -1;
                         hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
-                        Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds );
+                        Set_End_Index( workspace->hbond_index[j], jhb_top + 1, hbonds );
                         ++num_hbonds;
                     }
                 }
 
                 /* uncorrected bond orders */
-                if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) {
+                if ( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut )
+                {
                     r2 = SQR(r_ij);
 
-                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
+                    {
                         C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
                         BO_s = (1.0 + control->bo_cut) * EXP( C12 );
                     }
                     else BO_s = C12 = 0.0;
 
-                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
+                    {
                         C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
                         BO_pi = EXP( C34 );
                     }
                     else BO_pi = C34 = 0.0;
 
-                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
-                        BO_pi2= EXP( C56 );
+                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
+                    {
+                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
+                        BO_pi2 = EXP( C56 );
                     }
                     else BO_pi2 = C56 = 0.0;
 
                     /* Initially BO values are the uncorrected ones, page 1 */
                     BO = BO_s + BO_pi + BO_pi2;
 
-                    if( BO >= control->bo_cut ) {
+                    if ( BO >= control->bo_cut )
+                    {
                         num_bonds += 2;
                         /****** bonds i-j and j-i ******/
                         ibond = &( bonds->select.bond_list[btop_i] );
@@ -414,7 +465,7 @@ void Init_Forces( reax_system *system, control_params *control,
                         ibond->sym_index = btop_j;
                         jbond->sym_index = btop_i;
                         ++btop_i;
-                        Set_End_Index( j, btop_j+1, bonds );
+                        Set_End_Index( j, btop_j + 1, bonds );
 
                         bo_ij = &( ibond->bo_data );
                         bo_ji = &( jbond->bo_data );
@@ -428,22 +479,22 @@ void Init_Forces( reax_system *system, control_params *control,
                         Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
                         Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
 
-                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that
                            dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-                        rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
-                        rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_s, -bo_ij->BO_s * Cln_BOp_s, ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi, -bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec);
                         rvec_Scale(bo_ij->dln_BOp_pi2,
-                                -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
+                                   -bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec);
                         rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
                         rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
                         rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
 
-                        /* Only dBOp wrt. dr_i is stored here, note that 
+                        /* Only dBOp wrt. dr_i is stored here, note that
                            dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-                        rvec_Scale( bo_ij->dBOp, 
-                                -(bo_ij->BO_s * Cln_BOp_s + 
-                                    bo_ij->BO_pi * Cln_BOp_pi + 
-                                    bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+                        rvec_Scale( bo_ij->dBOp,
+                                    -(bo_ij->BO_s * Cln_BOp_s +
+                                      bo_ij->BO_pi * Cln_BOp_pi +
+                                      bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
                         rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
 
                         rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
@@ -461,79 +512,98 @@ void Init_Forces( reax_system *system, control_params *control,
                         /*fprintf( stderr, "%d %d %g %g %g\n",
                           i+1, j+1, bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2 );*/
 
-                        /*fprintf( stderr, "Cln_BOp_s: %f, pbo2: %f, C12:%f\n", 
+                        /*fprintf( stderr, "Cln_BOp_s: %f, pbo2: %f, C12:%f\n",
                           Cln_BOp_s, twbp->p_bo2, C12 );
-                          fprintf( stderr, "Cln_BOp_pi: %f, pbo4: %f, C34:%f\n", 
+                          fprintf( stderr, "Cln_BOp_pi: %f, pbo4: %f, C34:%f\n",
                           Cln_BOp_pi, twbp->p_bo4, C34 );
                           fprintf( stderr, "Cln_BOp_pi2: %f, pbo6: %f, C56:%f\n",
                           Cln_BOp_pi2, twbp->p_bo6, C56 );*/
                         /*fprintf(stderr, "pbo1: %f, pbo2:%f\n", twbp->p_bo1, twbp->p_bo2);
                           fprintf(stderr, "pbo3: %f, pbo4:%f\n", twbp->p_bo3, twbp->p_bo4);
                           fprintf(stderr, "pbo5: %f, pbo6:%f\n", twbp->p_bo5, twbp->p_bo6);
-                          fprintf( stderr, "r_s: %f, r_p: %f, r_pp: %f\n", 
+                          fprintf( stderr, "r_s: %f, r_p: %f, r_pp: %f\n",
                           twbp->r_s, twbp->r_p, twbp->r_pp );
                           fprintf( stderr, "C12: %g, C34:%g, C56:%g\n", C12, C34, C56 );*/
 
                         /*fprintf( stderr, "\tfactors: %g %g %g\n",
-                          -(bo_ij->BO_s * Cln_BOp_s + bo_ij->BO_pi * Cln_BOp_pi + 
+                          -(bo_ij->BO_s * Cln_BOp_s + bo_ij->BO_pi * Cln_BOp_pi +
                           bo_ij->BO_pi2 * Cln_BOp_pp),
                           -bo_ij->BO_pi * Cln_BOp_pi, -bo_ij->BO_pi2 * Cln_BOp_pi2 );*/
-                        /*fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", 
+                        /*fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n",
                           bo_ij->dBOp[0], bo_ij->dBOp[1], bo_ij->dBOp[2] );
-                          fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", 
-                          bo_ij->dln_BOp_pi[0], bo_ij->dln_BOp_pi[1], 
+                          fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n",
+                          bo_ij->dln_BOp_pi[0], bo_ij->dln_BOp_pi[1],
                           bo_ij->dln_BOp_pi[2] );
                           fprintf( stderr, "dBOpi2:\t[%g, %g, %g]\n\n",
-                          bo_ij->dln_BOp_pi2[0], bo_ij->dln_BOp_pi2[1], 
+                          bo_ij->dln_BOp_pi2[0], bo_ij->dln_BOp_pi2[1],
                           bo_ij->dln_BOp_pi2[2] );*/
 
-                        Set_End_Index( j, btop_j+1, bonds );
+                        Set_End_Index( j, btop_j + 1, bonds );
                     }
                 }
             }
         }
 
-        H->entries[Htop].j = i;
-        H->entries[Htop].val = system->reaxprm.sbp[type_i].eta;
+        /* diagonal entry */
+        H->j[Htop] = i;
+        H->val[Htop] = system->reaxprm.sbp[type_i].eta;
         ++Htop;
 
+        /* diagonal entry of H_sp: same column (i) and value as the H diagonal just stored */
+        H_sp->j[H_sp_top] = i;
+        H_sp->val[H_sp_top] = H->val[Htop - 1];
+        ++H_sp_top;
+
         Set_End_Index( i, btop_i, bonds );
-        if( ihb == 1 )
+        if ( ihb == 1 )
+        {
             Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
-        //fprintf( stderr, "%d bonds start: %d, end: %d\n", 
-        //     i, Start_Index( i, bonds ), End_Index( i, bonds ) );
+        }
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "%d bonds start: %d, end: %d\n",
+             i, Start_Index( i, bonds ), End_Index( i, bonds ) );
+#endif
     }
 
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "Htop = %d\n", Htop ); /* stderr, like the other DEBUG_FOCUS output */
+    fprintf( stderr, "H_sp_top = %d\n", H_sp_top );
+#endif
+
     // mark the end of j list
-    H->start[i] = Htop; 
+    H->start[i] = Htop;
+    H_sp->start[i] = H_sp_top;
     /* validate lists - decide if reallocation is required! */
-    Validate_Lists( workspace, lists, 
-            data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
+    Validate_Lists( workspace, lists, data->step, system->N, H->m,
+            Htop, num_bonds, num_hbonds );
 
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", 
-            data->step, Htop, num_bonds, num_hbonds );
+    fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n",
+             data->step, Htop, num_bonds, num_hbonds );
+
 #endif
 }
 
 
-void Init_Forces_Tab( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists, output_controls *out_control ) {
+void Init_Forces_Tab( reax_system *system, control_params *control,
+                      simulation_data *data, static_storage *workspace,
+                      list **lists, output_controls *out_control )
+{
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
-    int Htop, btop_i, btop_j, num_bonds, num_hbonds;
+    int Htop, H_sp_top, btop_i, btop_j, num_bonds, num_hbonds;
     int tmin, tmax, r;
     int ihb, jhb, ihb_top, jhb_top;
-    int flag;
+    int flag, flag_sp;
     real r_ij, r2, self_coef;
     real val, dif, base;
     real C12, C34, C56;
     real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
     real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2;   
-    sparse_matrix *H;
+    real p_boc1, p_boc2;
+    sparse_matrix *H, *H_sp;
     list *far_nbrs, *bonds, *hbonds;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
@@ -547,44 +617,67 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
     bonds = *lists + BONDS;
     hbonds = *lists + HBONDS;
 
-    H = &workspace->H;
+    H = workspace->H;
+    H_sp = workspace->H_sp;
     Htop = 0;
+    H_sp_top = 0;
     num_bonds = 0;
     num_hbonds = 0;
     btop_i = btop_j = 0;
     p_boc1 = system->reaxprm.gp.l[0];
     p_boc2 = system->reaxprm.gp.l[1];
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         atom_i = &(system->atoms[i]);
         type_i  = atom_i->type;
         start_i = Start_Index(i, far_nbrs);
         end_i   = End_Index(i, far_nbrs);
         H->start[i] = Htop;
+        H_sp->start[i] = H_sp_top;
         btop_i = End_Index( i, bonds );
         sbp_i = &(system->reaxprm.sbp[type_i]);
         ihb = ihb_top = -1;
-        if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 )
+        if ( control->hb_cut > 0 && (ihb = sbp_i->p_hbond) == 1 )
             ihb_top = End_Index( workspace->hbond_index[i], hbonds );
 
-        for( pj = start_i; pj < end_i; ++pj ) {
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
             nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
             j = nbr_pj->nbr;
             atom_j = &(system->atoms[j]);
 
             flag = 0;
-            if((data->step-data->prev_steps) % control->reneighbor == 0) { 
-                if(nbr_pj->d <= control->r_cut)
+            flag_sp = 0;
+            if ((data->step - data->prev_steps) % control->reneighbor == 0)
+            {
+                if (nbr_pj->d <= control->r_cut)
+                {
                     flag = 1;
-                else flag = 0;
+                    if ( nbr_pj->d <= control->r_sp_cut )
+                    {
+                        flag_sp = 1;
+                    }
+                }
+                else
+                {
+                    flag = 0;
+                    flag_sp = 0;
+                }
             }
-            else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box),
-                            nbr_pj->dvec))<=SQR(control->r_cut)){
+            else if ((nbr_pj->d = Sq_Distance_on_T3(atom_i->x, atom_j->x, &(system->box),
+                                                    nbr_pj->dvec)) <= SQR(control->r_cut))
+            {
+                if ( nbr_pj->d <= SQR(control->r_sp_cut))
+                {
+                    flag_sp = 1;
+                }
                 nbr_pj->d = sqrt(nbr_pj->d);
                 flag = 1;
             }
 
-            if( flag ){    
+            if ( flag )
+            {
                 type_j = system->atoms[j].type;
                 r_ij = nbr_pj->d;
                 sbp_j = &(system->reaxprm.sbp[type_j]);
@@ -596,65 +689,81 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
 
                 /* cubic spline interpolation */
                 r = (int)(r_ij * t->inv_dx);
-                if( r == 0 )  ++r;
-                base = (real)(r+1) * t->dx;
+                if ( r == 0 )  ++r;
+                base = (real)(r + 1) * t->dx;
                 dif = r_ij - base;
-                val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-                    t->ele[r].a;
+                val = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) * dif +
+                      t->ele[r].a;
                 val *= EV_to_KCALpMOL / C_ele;
 
-                H->entries[Htop].j = j;
-                H->entries[Htop].val = self_coef * val;
+                H->j[Htop] = j;
+                H->val[Htop] = self_coef * val;
                 ++Htop;
 
-                /* hydrogen bond lists */ 
-                if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
-                        nbr_pj->d <= control->hb_cut ) {
+                /* H_sp matrix entry */
+                if ( flag_sp )
+                {
+                    H_sp->j[H_sp_top] = j;
+                    H_sp->val[H_sp_top] = H->val[Htop - 1];
+                    ++H_sp_top;
+                }
+
+                /* hydrogen bond lists */
+                if ( control->hb_cut > 0 && (ihb == 1 || ihb == 2) &&
+                        nbr_pj->d <= control->hb_cut )
+                {
                     // fprintf( stderr, "%d %d\n", atom1, atom2 );
                     jhb = sbp_j->p_hbond;
-                    if( ihb == 1 && jhb == 2 ) {
+                    if ( ihb == 1 && jhb == 2 )
+                    {
                         hbonds->select.hbond_list[ihb_top].nbr = j;
                         hbonds->select.hbond_list[ihb_top].scl = 1;
                         hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
                         ++ihb_top;
                         ++num_hbonds;
                     }
-                    else if( ihb == 2 && jhb == 1 ) {
+                    else if ( ihb == 2 && jhb == 1 )
+                    {
                         jhb_top = End_Index( workspace->hbond_index[j], hbonds );
                         hbonds->select.hbond_list[jhb_top].nbr = i;
                         hbonds->select.hbond_list[jhb_top].scl = -1;
                         hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
-                        Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds );
+                        Set_End_Index( workspace->hbond_index[j], jhb_top + 1, hbonds );
                         ++num_hbonds;
                     }
                 }
 
                 /* uncorrected bond orders */
-                if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) {
+                if ( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut )
+                {
                     r2 = SQR(r_ij);
 
-                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
+                    {
                         C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
                         BO_s = (1.0 + control->bo_cut) * EXP( C12 );
                     }
                     else BO_s = C12 = 0.0;
 
-                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
+                    {
                         C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
                         BO_pi = EXP( C34 );
                     }
                     else BO_pi = C34 = 0.0;
 
-                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
-                        BO_pi2= EXP( C56 );
+                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
+                    {
+                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
+                        BO_pi2 = EXP( C56 );
                     }
                     else BO_pi2 = C56 = 0.0;
 
                     /* Initially BO values are the uncorrected ones, page 1 */
                     BO = BO_s + BO_pi + BO_pi2;
 
-                    if( BO >= control->bo_cut ) {
+                    if ( BO >= control->bo_cut )
+                    {
                         num_bonds += 2;
                         /****** bonds i-j and j-i ******/
                         ibond = &( bonds->select.bond_list[btop_i] );
@@ -666,6 +775,7 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                         ibond->d = r_ij;
                         jbond->d = r_ij;
                         rvec_Copy( ibond->dvec, nbr_pj->dvec );
+                        //fprintf (stderr, " %f - %f - %f \n", nbr_pj->dvec[0], nbr_pj->dvec[1], nbr_pj->dvec[2]);
                         rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
                         ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
                         ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
@@ -674,7 +784,7 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                         ibond->sym_index = btop_j;
                         jbond->sym_index = btop_i;
                         ++btop_i;
-                        Set_End_Index( j, btop_j+1, bonds );
+                        Set_End_Index( j, btop_j + 1, bonds );
 
                         bo_ij = &( ibond->bo_data );
                         bo_ji = &( jbond->bo_data );
@@ -688,22 +798,22 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                         Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
                         Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
 
-                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that
                            dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-                        rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
-                        rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_s, -bo_ij->BO_s * Cln_BOp_s, ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi, -bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec);
                         rvec_Scale(bo_ij->dln_BOp_pi2,
-                                -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
+                                   -bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec);
                         rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
                         rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
                         rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
 
-                        /* Only dBOp wrt. dr_i is stored here, note that 
+                        /* Only dBOp wrt. dr_i is stored here, note that
                            dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-                        rvec_Scale( bo_ij->dBOp, 
-                                -(bo_ij->BO_s * Cln_BOp_s + 
-                                    bo_ij->BO_pi * Cln_BOp_pi + 
-                                    bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+                        rvec_Scale( bo_ij->dBOp,
+                                    -(bo_ij->BO_s * Cln_BOp_s +
+                                      bo_ij->BO_pi * Cln_BOp_pi +
+                                      bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
                         rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
 
                         rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
@@ -718,30 +828,37 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                         bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
                         bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
 
-                        Set_End_Index( j, btop_j+1, bonds );
+                        Set_End_Index( j, btop_j + 1, bonds );
                     }
                 }
             }
         }
 
-        H->entries[Htop].j = i;
-        H->entries[Htop].val = system->reaxprm.sbp[type_i].eta;
+        /* diagonal entry */
+        H->j[Htop] = i;
+        H->val[Htop] = system->reaxprm.sbp[type_i].eta;
         ++Htop;
 
+        /* diagonal entry of H_sp (same value as the H diagonal just stored) */
+        H_sp->j[H_sp_top] = i;
+        H_sp->val[H_sp_top] = H->val[Htop - 1];
+        ++H_sp_top;
+
         Set_End_Index( i, btop_i, bonds );
-        if( ihb == 1 )
+        if ( ihb == 1 )
             Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
     }
 
     // mark the end of j list
-    H->start[i] = Htop; 
+    H->start[i] = Htop;
+    H_sp->start[i] = H_sp_top;
     /* validate lists - decide if reallocation is required! */
-    Validate_Lists( workspace, lists, 
-            data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
+    Validate_Lists( workspace, lists,
+                    data->step, system->N, H->m, Htop, num_bonds, num_hbonds );
 
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", 
-            data->step, Htop, num_bonds, num_hbonds );
+    fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n",
+             data->step, Htop, num_bonds, num_hbonds );
     //Print_Bonds( system, bonds, "sbonds.out" );
     //Print_Bond_List2( system, bonds, "sbonds.out" );
     //Print_Sparse_Matrix2( H, "H.out" );
@@ -749,9 +866,10 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
 }
 
 
-void Estimate_Storage_Sizes( reax_system *system, control_params *control, 
-        list **lists, int *Htop, int *hb_top, 
-        int *bond_top, int *num_3body ) {
+void Estimate_Storage_Sizes( reax_system *system, control_params *control,
+                             list **lists, int *Htop, int *hb_top,
+                             int *bond_top, int *num_3body )
+{
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
@@ -759,7 +877,7 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
     real r_ij, r2;
     real C12, C34, C56;
     real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2; 
+    real p_boc1, p_boc2;
     list *far_nbrs;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
@@ -770,7 +888,8 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
     p_boc1 = system->reaxprm.gp.l[0];
     p_boc2 = system->reaxprm.gp.l[1];
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         atom_i = &(system->atoms[i]);
         type_i  = atom_i->type;
         start_i = Start_Index(i, far_nbrs);
@@ -778,7 +897,8 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
         sbp_i = &(system->reaxprm.sbp[type_i]);
         ihb = sbp_i->p_hbond;
 
-        for( pj = start_i; pj < end_i; ++pj ) {
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
             nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
             j = nbr_pj->nbr;
             atom_j = &(system->atoms[j]);
@@ -786,46 +906,53 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
             sbp_j = &(system->reaxprm.sbp[type_j]);
             twbp = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]);
 
-            if( nbr_pj->d <= control->r_cut ) {
+            if ( nbr_pj->d <= control->r_cut )
+            {
                 ++(*Htop);
 
-                /* hydrogen bond lists */ 
-                if( control->hb_cut > 0.1 && (ihb==1 || ihb==2) && 
-                        nbr_pj->d <= control->hb_cut ) {
+                /* hydrogen bond lists */
+                if ( control->hb_cut > 0.1 && (ihb == 1 || ihb == 2) &&
+                        nbr_pj->d <= control->hb_cut )
+                {
                     jhb = sbp_j->p_hbond;
-                    if( ihb == 1 && jhb == 2 )
+                    if ( ihb == 1 && jhb == 2 )
                         ++hb_top[i];
-                    else if( ihb == 2 && jhb == 1 )
+                    else if ( ihb == 2 && jhb == 1 )
                         ++hb_top[j];
                 }
 
                 /* uncorrected bond orders */
-                if( nbr_pj->d <= control->nbr_cut ) {
+                if ( nbr_pj->d <= control->nbr_cut )
+                {
                     r_ij = nbr_pj->d;
                     r2 = SQR(r_ij);
 
-                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
+                    {
                         C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
                         BO_s = (1.0 + control->bo_cut) * EXP( C12 );
                     }
                     else BO_s = C12 = 0.0;
 
-                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
+                    {
                         C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
                         BO_pi = EXP( C34 );
                     }
                     else BO_pi = C34 = 0.0;
 
-                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
-                        BO_pi2= EXP( C56 );
+                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
+                    {
+                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
+                        BO_pi2 = EXP( C56 );
                     }
                     else BO_pi2 = C56 = 0.0;
 
                     /* Initially BO values are the uncorrected ones, page 1 */
                     BO = BO_s + BO_pi + BO_pi2;
 
-                    if( BO >= control->bo_cut ) {
+                    if ( BO >= control->bo_cut )
+                    {
                         ++bond_top[i];
                         ++bond_top[j];
                     }
@@ -836,8 +963,8 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
 
     *Htop += system->N;
     *Htop *= SAFE_ZONE;
-
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS );
         *num_3body += SQR(bond_top[i]);
         bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS );
@@ -846,49 +973,40 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
 }
 
 
-void Compute_Forces( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list** lists, output_controls *out_control )
+void Compute_Forces( reax_system *system, control_params *control,
+                     simulation_data *data, static_storage *workspace,
+                     list** lists, output_controls *out_control )
 {
     real t_start, t_elapsed;
 
     t_start = Get_Time( );
-    if( !control->tabulate )
+    if ( !control->tabulate )
+    {
         Init_Forces( system, control, data, workspace, lists, out_control );
-    else Init_Forces_Tab( system, control, data, workspace, lists, out_control );
+    }
+    else
+    {
+        Init_Forces_Tab( system, control, data, workspace, lists, out_control );
+    }
     t_elapsed = Get_Timing_Info( t_start );
     data->timing.init_forces += t_elapsed;
-
 #if defined(DEBUG_FOCUS)
-    print_sparse_matrix (system, workspace);
     fprintf( stderr, "init_forces - ");
 #endif
 
-
-    //analyze_hbonds (system, workspace, lists);
-
     t_start = Get_Time( );
     Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
     t_elapsed = Get_Timing_Info( t_start );
     data->timing.bonded += t_elapsed;
-
-    //print_bond_list (system, workspace, lists);
-    //exit (0);
-
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "bonded_forces - ");
 #endif
 
     t_start = Get_Time( );
-    Compute_NonBonded_Forces( system, control, data, workspace, 
-            lists, out_control );
+    Compute_NonBonded_Forces( system, control, data, workspace,
+                              lists, out_control );
     t_elapsed = Get_Timing_Info( t_start );
     data->timing.nonb += t_elapsed;
-
-#ifdef __DEBUG_CUDA__
-    fprintf( stderr, "non_bonded_forces - %lf \n", t_elapsed);
-#endif
-
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "nonbondeds - ");
 #endif
@@ -904,7 +1022,7 @@ void Compute_Forces( reax_system *system, control_params *control,
     Print_Total_Force( system, control, data, workspace, lists, out_control );
     Compare_Total_Forces( system, control, data, workspace, lists, out_control );
 #endif
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "forces - ");
 #endif
 }
diff --git a/PuReMD-GPU/src/forces.h b/PuReMD-GPU/src/forces.h
index 73323f0419baf383d6bf671158ef85584a710728..0ef8b117c78de5e7b3cdc2311b4a492b4615a859 100644
--- a/PuReMD-GPU/src/forces.h
+++ b/PuReMD-GPU/src/forces.h
@@ -23,12 +23,14 @@
 
 #include "mytypes.h"
 
+
 void Init_Bonded_Force_Functions( control_params* );
 
 void Compute_Forces( reax_system*, control_params*, simulation_data*,
-                     static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 
 void Estimate_Storage_Sizes( reax_system*, control_params*, list**,
-                             int*, int*, int*, int* );
+        int*, int*, int*, int* );
+
 
 #endif
diff --git a/PuReMD-GPU/src/four_body_interactions.c b/PuReMD-GPU/src/four_body_interactions.c
index c51601fa991203a77ec4840c10e74e15cfa42c87..25642871d23e389a96db943607f432aa68252a02 100644
--- a/PuReMD-GPU/src/four_body_interactions.c
+++ b/PuReMD-GPU/src/four_body_interactions.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -22,20 +23,21 @@
 
 #include "bond_orders.h"
 #include "box.h"
+#include "index_utils.h"
 #include "list.h"
 #include "lookup.h"
 #include "vector.h"
 #include "math.h"
-#include "index_utils.h"
 
+#define MIN_SINE 1e-10
 
 real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
-        rvec dvec_kl, real r_kl, rvec dvec_li, real r_li,
-        three_body_interaction_data *p_ijk, 
-        three_body_interaction_data *p_jkl, 
-        rvec dcos_omega_di, rvec dcos_omega_dj, 
-        rvec dcos_omega_dk, rvec dcos_omega_dl, 
-        output_controls *out_control )
+                      rvec dvec_kl, real r_kl, rvec dvec_li, real r_li,
+                      three_body_interaction_data *p_ijk,
+                      three_body_interaction_data *p_jkl,
+                      rvec dcos_omega_di, rvec dcos_omega_dj,
+                      rvec dcos_omega_dk, rvec dcos_omega_dl,
+                      output_controls *out_control )
 {
     real unnorm_cos_omega, unnorm_sin_omega, omega;
     real sin_ijk, cos_ijk, sin_jkl, cos_jkl;
@@ -49,11 +51,11 @@ real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
     cos_jkl = COS( p_jkl->theta );
 
     /* omega */
-    unnorm_cos_omega = -rvec_Dot( dvec_ij,dvec_jk )*rvec_Dot( dvec_jk,dvec_kl ) +
-        SQR( r_jk ) *  rvec_Dot( dvec_ij,dvec_kl );
+    unnorm_cos_omega = -rvec_Dot( dvec_ij, dvec_jk ) * rvec_Dot( dvec_jk, dvec_kl ) +
+                       SQR( r_jk ) *  rvec_Dot( dvec_ij, dvec_kl );
     rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl );
     unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl );
-    omega = atan2( unnorm_sin_omega, unnorm_cos_omega ); 
+    omega = atan2( unnorm_sin_omega, unnorm_cos_omega );
 
     /* derivatives */
     /* coef for adjusments to cos_theta's */
@@ -70,24 +72,25 @@ real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
     hnhd = r_ij * r_kl * cos_ijk * sin_jkl;
     hnhe = r_ij * r_kl * sin_ijk * cos_jkl;
 
+
     poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl;
-    if( poem < 1e-20 ) poem = 1e-20;
+    if ( poem < 1e-20 ) poem = 1e-20;
 
-    tel  = (SQR(r_ij) + SQR(r_jk) + SQR(r_kl) - SQR(r_li)) - 
-        2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl + 
-                r_jk * r_kl * cos_jkl );
+    tel  = (SQR(r_ij) + SQR(r_jk) + SQR(r_kl) - SQR(r_li)) -
+           2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl +
+                   r_jk * r_kl * cos_jkl );
 
     arg  = tel / poem;
-    if( arg >  1.0 )
+    if ( arg >  1.0 )
     {
         arg =  1.0;
     }
-    if( arg < -1.0 )
+    if ( arg < -1.0 )
     {
         arg = -1.0;
     }
 
-    /*fprintf( out_control->etor, 
+    /*fprintf( out_control->etor,
       "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
       htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe );
       fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
@@ -99,69 +102,72 @@ real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
       fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n",
       r_li, dvec_li[0], dvec_li[1], dvec_li[2] );
       fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n",
-      r_ij, r_jk, r_kl, r_li ); 
-      fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", 
-      cos_ijk, cos_jkl, sin_ijk, sin_jkl ); 
+      r_ij, r_jk, r_kl, r_li );
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n",
+      cos_ijk, cos_jkl, sin_ijk, sin_jkl );
       fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
       poem, tel, arg );*/
     /* fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
-       -p_ijk->dcos_dk[0]/sin_ijk, 
-       -p_ijk->dcos_dk[1]/sin_ijk, 
+       -p_ijk->dcos_dk[0]/sin_ijk,
+       -p_ijk->dcos_dk[1]/sin_ijk,
        -p_ijk->dcos_dk[2]/sin_ijk );
        fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
-       -p_jkl->dcos_dk[0]/sin_jkl, 
-       -p_jkl->dcos_dk[1]/sin_jkl, 
+       -p_jkl->dcos_dk[0]/sin_jkl,
+       -p_jkl->dcos_dk[1]/sin_jkl,
        -p_jkl->dcos_dk[2]/sin_jkl );*/
 
-    if( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
+    if ( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
     {
         sin_ijk = MIN_SINE;
     }
-    else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
+    else if ( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
     {
         sin_ijk = -MIN_SINE;
     }
-    if( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
+    if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
     {
         sin_jkl = MIN_SINE;
     }
-    else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
+    else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
     {
         sin_jkl = -MIN_SINE;
     }
 
     // dcos_omega_di
-    rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li );
-    rvec_ScaledAdd( dcos_omega_di,-(hthd - arg*hnhd)/sin_ijk, p_ijk->dcos_dk );
+    rvec_ScaledSum( dcos_omega_di, (htra - arg * hnra) / r_ij, dvec_ij, -1., dvec_li );
+    rvec_ScaledAdd( dcos_omega_di, -(hthd - arg * hnhd) / sin_ijk, p_ijk->dcos_dk );
     rvec_Scale( dcos_omega_di, 2.0 / poem, dcos_omega_di );
 
     // dcos_omega_dj
-    rvec_ScaledSum( dcos_omega_dj,-(htra-arg*hnra)/r_ij, dvec_ij, 
-            -htrb / r_jk, dvec_jk );
-    rvec_ScaledAdd( dcos_omega_dj,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_dj );
-    rvec_ScaledAdd( dcos_omega_dj,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_di );
+    rvec_ScaledSum( dcos_omega_dj, -(htra - arg * hnra) / r_ij, dvec_ij,
+                    -htrb / r_jk, dvec_jk );
+    rvec_ScaledAdd( dcos_omega_dj, -(hthd - arg * hnhd) / sin_ijk, p_ijk->dcos_dj );
+    rvec_ScaledAdd( dcos_omega_dj, -(hthe - arg * hnhe) / sin_jkl, p_jkl->dcos_di );
     rvec_Scale( dcos_omega_dj, 2.0 / poem, dcos_omega_dj );
 
     // dcos_omega_dk
-    rvec_ScaledSum( dcos_omega_dk,-(htrc-arg*hnrc) / r_kl, dvec_kl,  
-            htrb / r_jk, dvec_jk );
-    rvec_ScaledAdd( dcos_omega_dk,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_di );
-    rvec_ScaledAdd( dcos_omega_dk,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dj );
+    rvec_ScaledSum( dcos_omega_dk, -(htrc - arg * hnrc) / r_kl, dvec_kl,
+                    htrb / r_jk, dvec_jk );
+    rvec_ScaledAdd( dcos_omega_dk, -(hthd - arg * hnhd) / sin_ijk, p_ijk->dcos_di );
+    rvec_ScaledAdd( dcos_omega_dk, -(hthe - arg * hnhe) / sin_jkl, p_jkl->dcos_dj );
     rvec_Scale( dcos_omega_dk, 2.0 / poem, dcos_omega_dk );
 
     // dcos_omega_dl
-    rvec_ScaledSum( dcos_omega_dl, (htrc-arg*hnrc) / r_kl, dvec_kl, 1., dvec_li );
-    rvec_ScaledAdd( dcos_omega_dl,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dk );
+    rvec_ScaledSum( dcos_omega_dl, (htrc - arg * hnrc) / r_kl, dvec_kl, 1., dvec_li );
+    rvec_ScaledAdd( dcos_omega_dl, -(hthe - arg * hnhe) / sin_jkl, p_jkl->dcos_dk );
     rvec_Scale( dcos_omega_dl, 2.0 / poem, dcos_omega_dl );
 
-    return omega;  
+    return omega;
     //return arg;
 }
 
 
-void Four_Body_Interactions( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+
+
+
+void Four_Body_Interactions( reax_system *system, control_params *control,
+                             simulation_data *data, static_storage *workspace,
+                             list **lists, output_controls *out_control )
 {
     int i, j, k, l, pi, pj, pk, pl, pij, plk;
     int type_i, type_j, type_k, type_l;
@@ -212,31 +218,35 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
     list *thb_intrs = (*lists) + THREE_BODIES;
 
 
-    for( j = 0; j < system->N; ++j ) {
+    for ( j = 0; j < system->N; ++j )
+    {
         type_j = system->atoms[j].type;
         Delta_j = workspace->Delta_boc[j];
         start_j = Start_Index(j, bonds);
         end_j = End_Index(j, bonds);
 
 
-        for( pk = start_j; pk < end_j; ++pk ) {
+        for ( pk = start_j; pk < end_j; ++pk )
+        {
             pbond_jk = &( bonds->select.bond_list[pk] );
             k = pbond_jk->nbr;
             bo_jk = &( pbond_jk->bo_data );
             BOA_jk = bo_jk->BO - control->thb_cut;
 
             /* see if there are any 3-body interactions involving j&k
-               where j is the central atom. Otherwise there is no point in
-               trying to form a 4-body interaction out of this neighborhood */    
-            if( j < k && bo_jk->BO > control->thb_cut/*0*/ && 
-                    Num_Entries(pk, thb_intrs) ) {
+            where j is the central atom. Otherwise there is no point in
+             trying to form a 4-body interaction out of this neighborhood */
+            if ( j < k && bo_jk->BO > control->thb_cut/*0*/ &&
+                    Num_Entries(pk, thb_intrs) )
+            {
                 start_k = Start_Index(k, bonds);
-                end_k = End_Index(k, bonds);                   
+                end_k = End_Index(k, bonds);
                 pj = pbond_jk->sym_index; // pj points to j on k's list
 
-                /* do the same check as above: are there any 3-body interactions 
+                /* do the same check as above: are there any 3-body interactions
                    involving k&j where k is the central atom */
-                if( Num_Entries(pj, thb_intrs) ) {
+                if ( Num_Entries(pj, thb_intrs) )
+                {
                     type_k = system->atoms[k].type;
                     Delta_k = workspace->Delta_boc[k];
                     r_jk = pbond_jk->d;
@@ -244,7 +254,7 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                     start_pk = Start_Index(pk, thb_intrs );
                     end_pk = End_Index(pk, thb_intrs );
                     start_pj = Start_Index(pj, thb_intrs );
-                    end_pj = End_Index(pj, thb_intrs );        
+                    end_pj = End_Index(pj, thb_intrs );
 
                     exp_tor2_jk = EXP( -p_tor2 * BOA_jk );
                     exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) );
@@ -255,14 +265,16 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
 
 
                     /* pick i up from j-k interaction where j is the centre atom */
-                    for( pi = start_pk; pi < end_pk; ++pi ) {
+                    for ( pi = start_pk; pi < end_pk; ++pi )
+                    {
                         p_ijk = &( thb_intrs->select.three_body_list[pi] );
                         pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
                         pbond_ij = &( bonds->select.bond_list[pij] );
                         bo_ij = &( pbond_ij->bo_data );
 
 
-                        if( bo_ij->BO > control->thb_cut/*0*/ ) {
+                        if ( bo_ij->BO > control->thb_cut/*0*/ )
+                        {
                             i = p_ijk->thb;
                             type_i = system->atoms[i].type;
                             r_ij = pbond_ij->d;
@@ -272,17 +284,18 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                             sin_ijk = SIN( theta_ijk );
                             cos_ijk = COS( theta_ijk );
                             //tan_ijk_i = 1. / TAN( theta_ijk );
-                            if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) 
+                            if ( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
                                 tan_ijk_i = cos_ijk / MIN_SINE;
-                            else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) 
+                            else if ( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
                                 tan_ijk_i = cos_ijk / -MIN_SINE;
                             else tan_ijk_i = cos_ijk / sin_ijk;
 
                             exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
-                            exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) );
+                            exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij - 1.5) );
 
                             /* pick l up from j-k intr. where k is the centre */
-                            for( pl = start_pj; pl < end_pj; ++pl ) {
+                            for ( pl = start_pj; pl < end_pj; ++pl )
+                            {
                                 p_jkl = &( thb_intrs->select.three_body_list[pl] );
                                 l = p_jkl->thb;
                                 plk = p_jkl->pthb; //pointer to l on k's bond_list!
@@ -292,8 +305,9 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                 fbh = &(system->reaxprm.fbp[ index_fbp(type_i,type_j,type_k,type_l,system->reaxprm.num_atom_types ) ]);
                                 fbp = &(system->reaxprm.fbp[ index_fbp(type_i,type_j,type_k,type_l,system->reaxprm.num_atom_types )].prm[0]);
 
-                                if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ &&
-                                        bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){
+                                if ( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ &&
+                                        bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ )
+                                {
                                     ++num_frb_intrs;
                                     r_kl = pbond_kl->d;
                                     BOA_kl = bo_kl->BO - control->thb_cut;
@@ -302,77 +316,77 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                     sin_jkl = SIN( theta_jkl );
                                     cos_jkl = COS( theta_jkl );
                                     //tan_jkl_i = 1. / TAN( theta_jkl );
-                                    if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) 
+                                    if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
                                         tan_jkl_i = cos_jkl / MIN_SINE;
-                                    else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) 
+                                    else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
                                         tan_jkl_i = cos_jkl / -MIN_SINE;
-                                    else tan_jkl_i = cos_jkl /sin_jkl;
+                                    else tan_jkl_i = cos_jkl / sin_jkl;
 
-                                    Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x, 
-                                            &(system->box), dvec_li );
+                                    Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x,
+                                                       &(system->box), dvec_li );
                                     r_li = rvec_Norm( dvec_li );
 
 
                                     /* omega and its derivative */
-                                    //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, 
-                                    omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, 
-                                            r_jk, pbond_kl->dvec, r_kl,
-                                            dvec_li, r_li, p_ijk, p_jkl,
-                                            dcos_omega_di, dcos_omega_dj,
-                                            dcos_omega_dk, dcos_omega_dl,
-                                            out_control);
+                                    //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec,
+                                    omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec,
+                                                            r_jk, pbond_kl->dvec, r_kl,
+                                                            dvec_li, r_li, p_ijk, p_jkl,
+                                                            dcos_omega_di, dcos_omega_dj,
+                                                            dcos_omega_dk, dcos_omega_dl,
+                                                            out_control);
                                     cos_omega = COS( omega );
                                     cos2omega = COS( 2. * omega );
                                     cos3omega = COS( 3. * omega );
                                     /* end omega calculations */
 
                                     /* torsion energy */
-                                    exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk));
+                                    exp_tor1 = EXP(fbp->p_tor1 * SQR(2. - bo_jk->BO_pi - f11_DjDk));
                                     exp_tor2_kl = EXP( -p_tor2 * BOA_kl );
-                                    exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) );
-                                    fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * 
-                                        (1.0 - exp_tor2_kl);
-
-                                    CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + 
-                                            fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
-                                            fbp->V3 * (1.0 + cos3omega) );
-                                    //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + 
+                                    exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl - 1.5) );
+                                    fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) *
+                                           (1.0 - exp_tor2_kl);
+
+                                    CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) +
+                                                 fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
+                                                 fbp->V3 * (1.0 + cos3omega) );
+                                    //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) +
                                     //  fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) +
                                     //  fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega);
 
                                     data->E_Tor += e_tor = fn10 * sin_ijk * sin_jkl * CV;
 
                                     dfn11 = (-p_tor3 * exp_tor3_DjDk +
-                                            (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
-                                            (2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv;
+                                             (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
+                                             (2. + exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv;
 
                                     CEtors1 = sin_ijk * sin_jkl * CV;
 
-                                    CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * 
-                                        (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * 
-                                        sin_ijk * sin_jkl; 
+                                    CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 *
+                                              (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) *
+                                              sin_ijk * sin_jkl;
 
                                     CEtors3 = CEtors2 * dfn11;
 
-                                    CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * 
-                                        (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
+                                    CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij *
+                                              (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
 
-                                    CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * 
-                                        (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl);
+                                    CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk *
+                                              (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl);
 
                                     CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl *
-                                        (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk);
+                                              (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk);
 
                                     cmn = -fn10 * CV;
                                     CEtors7 = cmn * sin_jkl * tan_ijk_i;
                                     CEtors8 = cmn * sin_ijk * tan_jkl_i;
-                                    CEtors9 = fn10 * sin_ijk * sin_jkl * 
-                                        (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
-                                         1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega)));
+                                    CEtors9 = fn10 * sin_ijk * sin_jkl *
+                                              (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
+                                               1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega)));
                                     //cmn = -fn10 * CV;
                                     //CEtors7 = cmn * sin_jkl * cos_ijk;
                                     //CEtors8 = cmn * sin_ijk * cos_jkl;
-                                    //CEtors9 = fn10 * sin_ijk * sin_jkl * 
+                                    //CEtors9 = fn10 * sin_ijk * sin_jkl *
                                     //  (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
                                     //   fbp->V3 * (6*SQR(cos_omega) - 1.50));
                                     /* end  of torsion energy */
@@ -380,38 +394,38 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
 
                                     /* 4-body conjugation energy */
                                     fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
-                                    data->E_Con += e_con = fbp->p_cot1 * fn12 * 
-                                        (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
+                                    data->E_Con += e_con = fbp->p_cot1 * fn12 *
+                                                           (1. + (SQR(cos_omega) - 1.) * sin_ijk * sin_jkl);
 
-                                    Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * 
-                                        (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
+                                    Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 *
+                                            (1. + (SQR(cos_omega) - 1.) * sin_ijk * sin_jkl);
 
                                     CEconj1 = Cconj * (BOA_ij - 1.5e0);
                                     CEconj2 = Cconj * (BOA_jk - 1.5e0);
                                     CEconj3 = Cconj * (BOA_kl - 1.5e0);
 
-                                    CEconj4 = -fbp->p_cot1 * fn12 * 
-                                        (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
-                                    CEconj5 = -fbp->p_cot1 * fn12 * 
-                                        (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
-                                    //CEconj4 = -fbp->p_cot1 * fn12 * 
+                                    CEconj4 = -fbp->p_cot1 * fn12 *
+                                              (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
+                                    CEconj5 = -fbp->p_cot1 * fn12 *
+                                              (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
+                                    //CEconj4 = -fbp->p_cot1 * fn12 *
                                     //  (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk;
-                                    //CEconj5 = -fbp->p_cot1 * fn12 * 
+                                    //CEconj5 = -fbp->p_cot1 * fn12 *
                                     //  (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl;
-                                    CEconj6 = 2.0 * fbp->p_cot1 * fn12 * 
-                                        cos_omega * sin_ijk * sin_jkl;
+                                    CEconj6 = 2.0 * fbp->p_cot1 * fn12 *
+                                              cos_omega * sin_ijk * sin_jkl;
                                     /* end 4-body conjugation energy */
 
                                     //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ",
                                     //   workspace->orig_id[i], workspace->orig_id[j],
-                                    //       workspace->orig_id[k], workspace->orig_id[l], 
+                                    //       workspace->orig_id[k], workspace->orig_id[l],
                                     //    omega, cos_omega, cos2omega, cos3omega );
-                                    //fprintf(stdout, 
+                                    //fprintf(stdout,
                                     //    "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                    //    CEtors2, CEtors3, CEtors4, CEtors5, 
+                                    //    CEtors2, CEtors3, CEtors4, CEtors5,
                                     //    CEtors6, CEtors7, CEtors8, CEtors9 );
                                     //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                    //    theta_ijk, theta_jkl, sin_ijk, 
+                                    //    theta_ijk, theta_jkl, sin_ijk,
                                     //    sin_jkl, cos_jkl, tan_jkl_i );
 
                                     /* forces */
@@ -420,37 +434,38 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                     workspace->CdDelta[k] += CEtors3;
                                     bo_ij->Cdbo += (CEtors4 + CEconj1);
                                     bo_jk->Cdbo += (CEtors5 + CEconj2);
-
                                     bo_kl->Cdbo += (CEtors6 + CEconj3);
 
-                                    if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                                    if ( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT )
+                                    {
                                         /* dcos_theta_ijk */
-                                        rvec_ScaledAdd( system->atoms[i].f, 
-                                                CEtors7 + CEconj4, p_ijk->dcos_dk );
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors7 + CEconj4, p_ijk->dcos_dj );
-                                        rvec_ScaledAdd( system->atoms[k].f, 
-                                                CEtors7 + CEconj4, p_ijk->dcos_di );
+                                        rvec_ScaledAdd( system->atoms[i].f,
+                                                        CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                        rvec_ScaledAdd( system->atoms[j].f,
+                                                        CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                        rvec_ScaledAdd( system->atoms[k].f,
+                                                        CEtors7 + CEconj4, p_ijk->dcos_di );
 
                                         /* dcos_theta_jkl */
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors8 + CEconj5, p_jkl->dcos_di );
-                                        rvec_ScaledAdd( system->atoms[k].f, 
-                                                CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                        rvec_ScaledAdd( system->atoms[l].f, 
-                                                CEtors8 + CEconj5, p_jkl->dcos_dk );
+                                        rvec_ScaledAdd( system->atoms[j].f,
+                                                        CEtors8 + CEconj5, p_jkl->dcos_di );
+                                        rvec_ScaledAdd( system->atoms[k].f,
+                                                        CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                        rvec_ScaledAdd( system->atoms[l].f,
+                                                        CEtors8 + CEconj5, p_jkl->dcos_dk );
 
                                         /* dcos_omega */
-                                        rvec_ScaledAdd( system->atoms[i].f, 
-                                                CEtors9 + CEconj6, dcos_omega_di );
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors9 + CEconj6, dcos_omega_dj );
-                                        rvec_ScaledAdd( system->atoms[k].f, 
-                                                CEtors9 + CEconj6, dcos_omega_dk );
-                                        rvec_ScaledAdd( system->atoms[l].f, 
-                                                CEtors9 + CEconj6, dcos_omega_dl );
+                                        rvec_ScaledAdd( system->atoms[i].f,
+                                                        CEtors9 + CEconj6, dcos_omega_di );
+                                        rvec_ScaledAdd( system->atoms[j].f,
+                                                        CEtors9 + CEconj6, dcos_omega_dj );
+                                        rvec_ScaledAdd( system->atoms[k].f,
+                                                        CEtors9 + CEconj6, dcos_omega_dk );
+                                        rvec_ScaledAdd( system->atoms[l].f,
+                                                        CEtors9 + CEconj6, dcos_omega_dl );
                                     }
-                                    else {
+                                    else
+                                    {
                                         ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
 
                                         /* dcos_theta_ijk */
@@ -459,8 +474,8 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                         rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
                                         rvec_Add( data->ext_press, ext_press );
 
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                        rvec_ScaledAdd( system->atoms[j].f,
+                                                        CEtors7 + CEconj4, p_ijk->dcos_dj );
 
                                         rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
                                         rvec_Add( system->atoms[k].f, force );
@@ -469,8 +484,8 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
 
 
                                         /* dcos_theta_jkl */
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors8 + CEconj5, p_jkl->dcos_di );
+                                        rvec_ScaledAdd( system->atoms[j].f,
+                                                        CEtors8 + CEconj5, p_jkl->dcos_di );
 
                                         rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
                                         rvec_Add( system->atoms[k].f, force );
@@ -483,14 +498,14 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                         rvec_Add( data->ext_press, ext_press );
 
 
-                                        /* dcos_omega */                      
+                                        /* dcos_omega */
                                         rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
                                         rvec_Add( system->atoms[i].f, force );
                                         rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
                                         rvec_Add( data->ext_press, ext_press );
 
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors9 + CEconj6, dcos_omega_dj );
+                                        rvec_ScaledAdd( system->atoms[j].f,
+                                                        CEtors9 + CEconj6, dcos_omega_dj );
 
                                         rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
                                         rvec_Add( system->atoms[k].f, force );
@@ -504,39 +519,39 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
 
 
                                         /* This part is intended for a fully-flexible box */
-                                        /* rvec_ScaledSum( temp_rvec, 
-                                           CEtors7 + CEconj4, p_ijk->dcos_dk,      // i     
+                                        /* rvec_ScaledSum( temp_rvec,
+                                           CEtors7 + CEconj4, p_ijk->dcos_dk,      // i
                                            CEtors9 + CEconj6, dcos_omega_di );
-                                           rvec_OuterProduct( temp_rtensor, 
+                                           rvec_OuterProduct( temp_rtensor,
                                            temp_rvec, system->atoms[i].x );
                                            rtensor_Copy( total_rtensor, temp_rtensor );
 
-                                           rvec_ScaledSum( temp_rvec, 
+                                           rvec_ScaledSum( temp_rvec,
                                            CEtors7 + CEconj4, p_ijk->dcos_dj,      // j
                                            CEtors8 + CEconj5, p_jkl->dcos_di );
-                                           rvec_ScaledAdd( temp_rvec, 
+                                           rvec_ScaledAdd( temp_rvec,
                                            CEtors9 + CEconj6, dcos_omega_dj );
-                                           rvec_OuterProduct( temp_rtensor, 
+                                           rvec_OuterProduct( temp_rtensor,
                                            temp_rvec, system->atoms[j].x );
                                            rtensor_Add( total_rtensor, temp_rtensor );
 
-                                           rvec_ScaledSum( temp_rvec, 
+                                           rvec_ScaledSum( temp_rvec,
                                            CEtors7 + CEconj4, p_ijk->dcos_di,      // k
                                            CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                           rvec_ScaledAdd( temp_rvec, 
+                                           rvec_ScaledAdd( temp_rvec,
                                            CEtors9 + CEconj6, dcos_omega_dk );
-                                           rvec_OuterProduct( temp_rtensor, 
+                                           rvec_OuterProduct( temp_rtensor,
                                            temp_rvec, system->atoms[k].x );
                                            rtensor_Add( total_rtensor, temp_rtensor );
 
-                                           rvec_ScaledSum( temp_rvec, 
+                                           rvec_ScaledSum( temp_rvec,
                                            CEtors8 + CEconj5, p_jkl->dcos_dk,      // l
                                            CEtors9 + CEconj6, dcos_omega_dl );
-                                           rvec_OuterProduct( temp_rtensor, 
+                                           rvec_OuterProduct( temp_rtensor,
                                            temp_rvec, system->atoms[l].x );
                                            rtensor_Copy( total_rtensor, temp_rtensor );
 
-                                           if( pbond_ij->imaginary || pbond_jk->imaginary || 
+                                           if( pbond_ij->imaginary || pbond_jk->imaginary ||
                                            pbond_kl->imaginary )
                                            rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor );
                                            else
@@ -544,82 +559,82 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                     }
 
 #ifdef TEST_ENERGY
-                                    /*fprintf( out_control->etor, 
-                                    //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                    //r_ij, r_jk, r_kl, 
-                                    "%12.8f%12.8f%12.8f%12.8f\n",
-                                    cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/
+                                    /*fprintf( out_control->etor,
+                                       //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                       //r_ij, r_jk, r_kl,
+                                       "%12.8f%12.8f%12.8f%12.8f\n",
+                                       cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/
                                     // fprintf( out_control->etor, "%12.8f\n", dfn11 );
-                                    fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", 
-                                            fn10, cos_omega, CV );
+                                    fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n",
+                                             fn10, cos_omega, CV );
 
-                                    fprintf( out_control->etor, 
-                                            "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                            CEtors2, CEtors3, CEtors4, CEtors5, 
-                                            CEtors6, CEtors7, CEtors8, CEtors9 );
+                                    fprintf( out_control->etor,
+                                             "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                             CEtors2, CEtors3, CEtors4, CEtors5,
+                                             CEtors6, CEtors7, CEtors8, CEtors9 );
 
-                                    /* fprintf( out_control->etor, 
+                                    /* fprintf( out_control->etor,
                                        "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
                                        htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */
 
-                                    fprintf( out_control->etor, 
-                                            "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                            CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 );
+                                    fprintf( out_control->etor,
+                                             "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                             CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 );
                                     /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n",
                                        fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/
 
-                                    fprintf( out_control->etor, 
-                                            //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", 
-                                            "%6d%6d%6d%6d%12.8f%12.8f\n", 
-                                            workspace->orig_id[i], workspace->orig_id[j], 
-                                            workspace->orig_id[k], workspace->orig_id[l], 
-                                            e_tor, e_con );
+                                    fprintf( out_control->etor,
+                                             //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n",
+                                             "%6d%6d%6d%6d%12.8f%12.8f\n",
+                                             workspace->orig_id[i], workspace->orig_id[j],
+                                             workspace->orig_id[k], workspace->orig_id[l],
+                                             e_tor, e_con );
                                     //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor );
 
-                                    fprintf( out_control->econ, 
-                                            "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
-                                            workspace->orig_id[i], workspace->orig_id[j], 
-                                            workspace->orig_id[k], workspace->orig_id[l], 
-                                            RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, 
-                                            e_con,data->E_Con );
-
-                                    /* fprintf( out_control->etor, 
-                                       "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",       
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], 
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], 
+                                    fprintf( out_control->econ,
+                                             "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                             workspace->orig_id[i], workspace->orig_id[j],
+                                             workspace->orig_id[k], workspace->orig_id[l],
+                                             RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl,
+                                             e_con, data->E_Con );
+
+                                    /* fprintf( out_control->etor,
+                                       "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[0],
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[1],
                                        (CEtors7 + CEconj4)*p_ijk->dcos_dk[2],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], 
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[0],
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[1],
                                        (CEtors7 + CEconj4)*p_ijk->dcos_dj[2],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[0], 
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[1], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[0],
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[1],
                                        (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */
 
 
-                                    /* fprintf( out_control->etor, 
+                                    /* fprintf( out_control->etor,
                                        "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[0], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[1], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[2], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[0],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[1],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[2],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[0],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[1],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[2],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[0],
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[1],
                                        (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */
 
-                                    fprintf( out_control->etor, 
-                                            "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-                                            dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2], 
-                                            dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], 
-                                            dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2],
-                                            dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] );
+                                    fprintf( out_control->etor,
+                                             "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                                             dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2],
+                                             dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2],
+                                             dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2],
+                                             dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] );
 #endif
 
 #ifdef TEST_FORCES
-                                    // Torsion Forces 
-                                    Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., 
-                                            workspace->f_tor, workspace->f_tor);
+                                    /* Torsion Forces */
+                                    Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0.,
+                                                  workspace->f_tor, workspace->f_tor);
                                     Add_dDelta( system, lists, j, CEtors3, workspace->f_tor );
                                     Add_dDelta( system, lists, k, CEtors3, workspace->f_tor );
                                     Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor );
@@ -639,7 +654,7 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
                                     rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk );
                                     rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl );
 
-                                    // Conjugation Forces 
+                                    /* Conjugation Forces */
                                     Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con );
                                     Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con );
                                     Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con );
@@ -666,12 +681,12 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
         } // pk loop ends
     } // j loop
 
-    /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", 
+    /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n",
        data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
 
 #ifdef TEST_FORCES
     fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs );
-    fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", 
-            data->E_Tor, data->E_Con );
+    fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n",
+             data->E_Tor, data->E_Con );
 #endif
 }
diff --git a/PuReMD-GPU/src/four_body_interactions.h b/PuReMD-GPU/src/four_body_interactions.h
index 8e8dd7c0991a747000e77b2d460711e433db52ef..65e315a94f95239ad8c7081a00c32cc0d3264cd3 100644
--- a/PuReMD-GPU/src/four_body_interactions.h
+++ b/PuReMD-GPU/src/four_body_interactions.h
@@ -23,10 +23,9 @@
 
 #include "mytypes.h"
 
-#define MIN_SINE 1e-10
-
 
 void Four_Body_Interactions( reax_system*, control_params*, simulation_data*,
-    static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
+
 
 #endif
diff --git a/PuReMD-GPU/src/geo_tools.c b/PuReMD-GPU/src/geo_tools.c
new file mode 100644
index 0000000000000000000000000000000000000000..f3c3bd48140f9fd46ac3e460a0ab4bd33ade85b9
--- /dev/null
+++ b/PuReMD-GPU/src/geo_tools.c
@@ -0,0 +1,797 @@
+/*----------------------------------------------------------------------
+  SerialReax - Reax Force Field Simulator
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include <ctype.h>
+
+#include "geo_tools.h"
+#include "allocate.h"
+#include "box.h"
+#include "list.h"
+#include "restart.h"
+#include "tool_box.h"
+#include "vector.h"
+
+/********************* geo format routines ******************/
+/* Validate the atom section of a custom-format geo file and store the atom count in system->N */
+void Count_Geo_Atoms( FILE *geo, reax_system *system )
+{
+    int i, serial, ok;
+    rvec x;
+    char element[3], name[9], line[MAX_LINE + 1];
+
+    /* the atom count must parse and be non-negative, otherwise system->N is unusable */
+    ok = ( fscanf( geo, " %d", &(system->N) ) == 1 && system->N >= 0 );
+    /* trial parse of the N records (CUSTOM_ATOM_FORMAT layout; %2s/%8s fit element[3]/name[9]) */
+    for ( i = 0; ok && i < system->N; ++i )
+    {
+        ok = ( fscanf( geo, " %d %2s %8s %lf %lf %lf",
+                &serial, element, name, &x[0], &x[1], &x[2] ) == 6 );
+        if ( ok ) { Fit_to_Periodic_Box( &(system->box), &x ); }
+    }
+    if ( !ok ) { fprintf( stderr, "Count_Geo_Atoms: malformed geo file! terminating...\n" ); exit( INVALID_GEO ); }
+
+    fseek( geo, 0, SEEK_SET ); // set the pointer to the beginning of the file
+    fgets( line, MAX_LINE, geo ); /* skip the first two lines so reading resumes on line 3 */
+    fgets( line, MAX_LINE, geo );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "N = %d\n\n", system->N );
+#endif
+}
+
+/* Read box and atoms from a custom-format geo file into system/workspace; exits on open, parse or allocation errors */
+char Read_Geo( char* geo_file, reax_system* system, control_params *control,
+        simulation_data *data, static_storage *workspace )
+{
+    FILE *geo;
+    char descriptor[9];
+    int i, serial;
+    real box_x, box_y, box_z, alpha, beta, gamma;
+    rvec x;
+    char element[3], name[9];
+    reax_atom *atom;
+
+    /* open the geometry file */
+    if ( (geo = fopen(geo_file, "r")) == NULL )
+    {
+        fprintf( stderr, "Error opening the geo file! terminating...\n" );
+        exit( FILE_NOT_FOUND );
+    }
+
+    /* read box information (CUSTOM_BOXGEO_FORMAT layout; %8s fits descriptor[9]) */
+    if ( fscanf( geo, " %8s %lf %lf %lf %lf %lf %lf",
+            descriptor, &box_x, &box_y, &box_z, &alpha, &beta, &gamma ) != 7 )
+    {
+        fprintf( stderr, "Read_Geo: invalid box line in the geo file! terminating...\n" );
+        exit( INVALID_GEO );
+    }
+    Setup_Box( box_x, box_y, box_z, alpha, beta, gamma, &(system->box) );
+
+    /* count my atoms & allocate storage */
+    Count_Geo_Atoms( geo, system );
+    if ( PreAllocate_Space( system, control, workspace ) == FAILURE )
+    {
+        fprintf( stderr, "PreAllocate_Space: not enough memory!terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* read in my atom info; record i goes to slot i (%2s/%8s fit element[3]/name[9]) */
+    for ( i = 0; i < system->N; ++i )
+    {
+        if ( fscanf( geo, " %d %2s %8s %lf %lf %lf",
+                &serial, element, name, &x[0], &x[1], &x[2] ) != 6 )
+        {
+            fprintf( stderr, "Read_Geo: invalid atom record %d in the geo file! terminating...\n", i );
+            exit( INVALID_GEO );
+        }
+        Fit_to_Periodic_Box( &(system->box), &x );
+#if defined(DEBUG)
+        fprintf( stderr, "atom%d: %s %s %f %f %f\n",
+                 serial, element, name, x[0], x[1], x[2] );
+#endif
+        atom = &(system->atoms[i]);
+        workspace->orig_id[i] = serial;
+        atom->type = Get_Atom_Type( &(system->reaxprm), element );
+        strcpy( atom->name, name );
+        rvec_Copy( atom->x, x );
+        rvec_MakeZero( atom->v );
+        rvec_MakeZero( atom->f );
+        atom->q = 0.;
+    }
+
+    fclose( geo );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "finished reading the geo file\n" );
+#endif
+    return SUCCESS;
+}
+
+/* Locate the box record of geo and set up system->box; SUCCESS, or FAILURE if no usable record exists */
+int Read_Box_Info( reax_system *system, FILE *geo, int geo_format )
+{
+    char *cryst;
+    char  line[MAX_LINE + 1];
+    char  descriptor[9];
+    char  s_a[12], s_b[12], s_c[12], s_alpha[12], s_beta[12], s_gamma[12];
+    char  s_group[12], s_zValue[12];
+
+    fseek( geo, 0, SEEK_SET ); // set the pointer to the beginning of the file
+
+    switch ( geo_format )
+    {
+        case PDB:
+            cryst = "CRYST1";
+            break;
+        default:
+            cryst = "BOX";
+    }
+
+    /* locate the cryst line in the geo file, read it and
+       initialize the big box */
+    while ( fgets( line, MAX_LINE, geo ) )
+    {
+        /* compare exactly the tag's length: a fixed 6 runs past the end of the 3-letter "BOX" */
+        if ( strncmp( line, cryst, strlen( cryst ) ) == 0 )
+        {
+            /* only PDB has a parser here; without at least the six box fields the
+               s_* strings would be handed to atof() uninitialized */
+            if ( geo_format != PDB ||
+                    sscanf( line, PDB_CRYST1_FORMAT,
+                        &descriptor[0],
+                        &s_a[0], &s_b[0], &s_c[0],
+                        &s_alpha[0], &s_beta[0], &s_gamma[0],
+                        &s_group[0], &s_zValue[0] ) < 7 )
+            {
+                return FAILURE;
+            }
+            /* compute full volume tensor from the angles */
+            Setup_Box( atof(s_a),  atof(s_b), atof(s_c),
+                    atof(s_alpha), atof(s_beta), atof(s_gamma),
+                    &(system->box) );
+            return SUCCESS;
+        }
+    }
+    /* end of file or read error before a usable box record was found */
+    return FAILURE;
+}
+
+/* Tally the ATOM and HETATM records of a PDB file into system->N, scanning geo from its start */
+void Count_PDB_Atoms( FILE *geo, reax_system *system )
+{
+    char record[MAX_LINE + 1];
+    char x_txt[9], y_txt[9], z_txt[9];
+    char *stop = NULL;
+    rvec pos;
+
+    /* empty tally, file pointer back at the first byte */
+    system->N = 0;
+    fseek( geo, 0, SEEK_SET );
+
+    for ( ; fgets( record, MAX_LINE, geo ) != NULL; )
+    {
+        /* anything that is not an atom description is ignored */
+        if ( strncmp( record, "ATOM", 4 ) != 0 && strncmp( record, "HETATM", 6 ) != 0 )
+        {
+            continue;
+        }
+
+        ++(system->N);
+
+        /* coordinates sit in three 8-character fields starting at offset 30 */
+        strncpy( x_txt, record + 30, 8 );
+        strncpy( y_txt, record + 38, 8 );
+        strncpy( z_txt, record + 46, 8 );
+        x_txt[8] = y_txt[8] = z_txt[8] = 0;
+        Make_Point( strtod( x_txt, &stop ), strtod( y_txt, &stop ),
+                    strtod( z_txt, &stop ), &pos );
+        Fit_to_Periodic_Box( &(system->box), &pos );
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "count atoms:\n" );
+    fprintf( stderr, "N = %d\n\n", system->N );
+#endif
+}
+
+/* Read box and ATOM/HETATM records of a PDB file into system/workspace; FAILURE on a read error, else SUCCESS */
+char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
+               simulation_data *data, static_storage *workspace )
+{
+    FILE  *pdb;
+    char **tmp;
+    char  *s, *s1;
+    char   descriptor[9], serial[9];
+    char   atom_name[9], res_name[9], res_seq[9];
+    char   s_x[9], s_y[9], s_z[9];
+    char   occupancy[9], temp_factor[9];
+    char   seg_id[9], element[9], charge[9];
+    char   alt_loc, chain_id, icode;
+    char  *endptr = NULL;
+    int    i, c, c1, pdb_serial, top;
+    rvec   x;
+    reax_atom *atom;
+
+    /* open pdb file */
+    if ( (pdb = fopen(pdb_file, "r")) == NULL )
+    {
+        fprintf( stderr, "fopen: error opening the pdb file! terminating...\n" );
+        exit( FILE_NOT_FOUND );
+    }
+
+    /* allocate memory for tokenizing pdb lines */
+    if ( Allocate_Tokenizer_Space( &s, &s1, &tmp ) == FAILURE )
+    {
+        fprintf( stderr, "Allocate_Tokenizer_Space: not enough memory!" );
+        fprintf( stderr, "terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* read box information */
+    if ( Read_Box_Info( system, pdb, PDB ) == FAILURE )
+    {
+        fprintf( stderr, "Read_Box_Info: no CRYST line in the pdb file!" );
+        fprintf( stderr, "terminating...\n" );
+        exit( INVALID_GEO );
+    }
+
+    Count_PDB_Atoms( pdb, system );
+    if ( PreAllocate_Space( system, control, workspace ) == FAILURE )
+    {
+        fprintf( stderr, "PreAllocate_Space: not enough memory!" );
+        fprintf( stderr, "terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* start reading and processing the pdb file */
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "starting to read the pdb file\n" );
+#endif
+    fseek( pdb, 0, SEEK_SET );
+    c  = 0;
+    c1 = 0;
+    top = 0;
+    s[0] = 0;
+
+    while ( fgets( s, MAX_LINE, pdb ) )
+    {
+        /* read new line and tokenize it */
+        strncpy( s1, s, MAX_LINE - 1 );
+        c1 = Tokenize( s, &tmp );
+
+        /* process new line */
+        if ( strncmp(tmp[0], "ATOM", 4) == 0 || strncmp(tmp[0], "HETATM", 6) == 0 )
+        {
+            /* N comes from Count_PDB_Atoms (line-prefix match); stop rather than store more than N atoms */
+            if ( top >= system->N )
+            {
+                fprintf( stderr, "Read_PDB: more atom records than counted! terminating...\n" );
+                exit( INVALID_GEO );
+            }
+            if ( strncmp(tmp[0], "ATOM", 4) == 0 )
+            {
+                strncpy( &descriptor[0], s1, 6 );
+                descriptor[6] = 0;
+                strncpy( &serial[0], s1 + 6, 5 );
+                serial[5] = 0;
+                strncpy( &atom_name[0], s1 + 12, 4 );
+                atom_name[4] = 0;
+                alt_loc = s1[16];
+                strncpy( &res_name[0], s1 + 17, 3 );
+                res_name[3] = 0;
+                chain_id = s1[21];
+                strncpy( &res_seq[0], s1 + 22, 4 );
+                res_seq[4] = 0;
+                icode = s1[26];
+                strncpy( &s_x[0], s1 + 30, 8 );
+                s_x[8] = 0;
+                strncpy( &s_y[0], s1 + 38, 8 );
+                s_y[8] = 0;
+                strncpy( &s_z[0], s1 + 46, 8 );
+                s_z[8] = 0;
+                strncpy( &occupancy[0], s1 + 54, 6 );
+                occupancy[6] = 0;
+                strncpy( &temp_factor[0], s1 + 60, 6 );
+                temp_factor[6] = 0;
+                strncpy( &seg_id[0], s1 + 72, 4 );
+                seg_id[4] = 0;
+                strncpy( &element[0], s1 + 76, 2 );
+                element[2] = 0;
+                strncpy( &charge[0], s1 + 78, 2 );
+                charge[2] = 0;
+            }
+            else if (strncmp(tmp[0], "HETATM", 6) == 0)
+            {
+                strncpy( &descriptor[0], s1, 6 );
+                descriptor[6] = 0;
+                strncpy( &serial[0], s1 + 6, 5 );
+                serial[5] = 0;
+                strncpy( &atom_name[0], s1 + 12, 4 );
+                atom_name[4] = 0;
+                alt_loc = s1[16];
+                strncpy( &res_name[0], s1 + 17, 3 );
+                res_name[3] = 0;
+                chain_id = s1[21];
+                strncpy( &res_seq[0], s1 + 22, 4 );
+                res_seq[4] = 0;
+                icode = s1[26];
+                strncpy( &s_x[0], s1 + 30, 8 );
+                s_x[8] = 0;
+                strncpy( &s_y[0], s1 + 38, 8 );
+                s_y[8] = 0;
+                strncpy( &s_z[0], s1 + 46, 8 );
+                s_z[8] = 0;
+                strncpy( &occupancy[0], s1 + 54, 6 );
+                occupancy[6] = 0;
+                strncpy( &temp_factor[0], s1 + 60, 6 );
+                temp_factor[6] = 0;
+                //strncpy( &seg_id[0], s1+72, 4 );      seg_id[4] = 0;
+                strncpy( &element[0], s1 + 76, 2 );
+                element[2] = 0;
+                strncpy( &charge[0], s1 + 78, 2 );
+                charge[2] = 0;
+            }
+
+            /* if the point is inside my_box, add it to my lists */
+            Make_Point( strtod( &s_x[0], &endptr ),
+                        strtod( &s_y[0], &endptr ),
+                        strtod( &s_z[0], &endptr ), &x );
+            Fit_to_Periodic_Box( &(system->box), &x );
+
+            /* store orig_id, type, name and coord info of the new atom */
+            atom = &(system->atoms[top]);
+            pdb_serial = (int) strtod( &serial[0], &endptr );
+            workspace->orig_id[top] = pdb_serial;
+
+            Trim_Spaces( element );
+            atom->type = Get_Atom_Type( &(system->reaxprm), element );
+            strcpy( atom->name, atom_name );
+
+            rvec_Copy( atom->x, x );
+            rvec_MakeZero( atom->v );
+            rvec_MakeZero( atom->f );
+            atom->q = 0;
+
+            top++;
+            // fprintf( stderr, "p%d: %6d%2d x:%8.3f%8.3f%8.3f"
+            //                  "q:%8.3f occ:%s temp:%s seg:%s elmnt:%s\n",
+            //       system->my_rank,
+            //       c, system->my_atoms[top].type,
+            //       system->my_atoms[top].x[0],
+            //       system->my_atoms[top].x[1],
+            //       system->my_atoms[top].x[2],
+            //       system->my_atoms[top].q, occupancy, temp_factor,
+            //       seg_id, element );
+            //fprintf( stderr, "atom( %8.3f %8.3f %8.3f ) --> p%d\n",
+            // system->my_atoms[top].x[0], system->my_atoms[top].x[1],
+            // system->my_atoms[top].x[2], system->my_rank );
+
+            c++;
+        }
+
+        /* IMPORTANT: We do not check for the soundness of restrictions here.
+           When atom2 is on atom1's restricted list, and there is a restriction
+           on atom2, then atom1 has to be on atom2's restricted list, too.
+           However, we do not check if this is the case in the input file,
+           this is upto the user. */
+        else if (!strncmp( tmp[0], "CONECT", 6 ))
+        {
+            if ( control->restrict_bonds )
+            {
+                /* error check */
+                // Check_Input_Range( c1 - 2, 0, MAX_RESTRICT,
+                // "CONECT line exceeds max num restrictions allowed.\n" );
+
+                /* read bond restrictions */
+                // if( is_Valid_Serial( workspace, pdb_serial = atoi(tmp[1]) ) )
+                //   ratom = workspace->map_serials[ pdb_serial ];
+
+                // workspace->restricted[ ratom ] = c1 - 2;
+                // for( i = 2; i < c1; ++i )
+                //  {
+                //    if( is_Valid_Serial(workspace, pdb_serial = atoi(tmp[i])) )
+                //        workspace->restricted_list[ ratom ][ i-2 ] =
+                //          workspace->map_serials[ pdb_serial ];
+                //  }
+
+                // fprintf( stderr, "restriction on %d:", ratom );
+                // for( i = 0; i < workspace->restricted[ ratom ]; ++i )
+                // fprintf( stderr, "  %d",
+                //          workspace->restricted_list[ratom][i] );
+                // fprintf( stderr, "\n" );
+            }
+        }
+
+        /* clear previous input line */
+        s[0] = 0;
+        for ( i = 0; i < c1; ++i )
+            tmp[i][0] = 0;
+    }
+    if ( ferror( pdb ) )
+    {
+        fclose( pdb ); /* do not leak the stream when reporting a read error */
+        return FAILURE;
+    }
+
+    fclose( pdb ); /* NOTE(review): s, s1 and tmp from Allocate_Tokenizer_Space are not released here */
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "finished reading the pdb file\n" );
+#endif
+
+    return SUCCESS;
+}
+
+
+/* Write the box (CRYST1 record) and every atom (ATOM records) to the file "<sim_name>-<step>.pdb".
+   PDB serials are written without regard to the order, we'll see if this
+   causes trouble, if so we'll have to rethink this approach.
+   Also, we do not write connect lines yet. Returns SUCCESS, or FAILURE if the file cannot be opened. */
+char Write_PDB( reax_system* system, list* bonds, simulation_data *data,
+        control_params *control, static_storage *workspace, output_controls *out_control )
+{
+    int i, buffer_req, buffer_len;
+    //int j, connect[4];
+    char name[9]; /* 8 name characters + terminating NUL */
+    //real bo;
+    real alpha, beta, gamma;
+    reax_atom *p_atom;
+    char fname[MAX_STR];
+    char *line, *buffer;
+    FILE *pdb;
+
+    /* open the output file first: a failure then needs no cleanup */
+    snprintf( fname, sizeof(fname), "%s-%d.pdb", control->sim_name, data->step );
+    if ( (pdb = fopen( fname, "w" )) == NULL )
+    {
+        fprintf( stderr, "fopen: error opening the pdb file %s for writing!\n", fname );
+        return FAILURE;
+    }
+
+    /* Allocation: each buffer gets one extra byte for the terminating NUL */
+    line = (char*) smalloc( sizeof(char) * (PDB_ATOM_FORMAT_O_LENGTH + 1), "geo:line" );
+    buffer_req = system->N * PDB_ATOM_FORMAT_O_LENGTH + 1;
+    buffer = (char*) smalloc( sizeof(char) * buffer_req, "geo:buffer" );
+
+    /* Writing Box information */
+    gamma = ACOS( (system->box.box[0][0] * system->box.box[1][0] +
+                   system->box.box[0][1] * system->box.box[1][1] +
+                   system->box.box[0][2] * system->box.box[1][2]) /
+                  (system->box.box_norms[0] * system->box.box_norms[1]) );
+    beta  = ACOS( (system->box.box[0][0] * system->box.box[2][0] +
+                   system->box.box[0][1] * system->box.box[2][1] +
+                   system->box.box[0][2] * system->box.box[2][2]) /
+                  (system->box.box_norms[0] * system->box.box_norms[2]) );
+    alpha = ACOS( (system->box.box[2][0] * system->box.box[1][0] +
+                   system->box.box[2][1] * system->box.box[1][1] +
+                   system->box.box[2][2] * system->box.box[1][2]) /
+                  (system->box.box_norms[2] * system->box.box_norms[1]) );
+
+    fprintf( pdb, PDB_CRYST1_FORMAT_O,
+             "CRYST1",
+             system->box.box_norms[0], system->box.box_norms[1],
+             system->box.box_norms[2],
+             RAD2DEG(alpha), RAD2DEG(beta), RAD2DEG(gamma), " ", 0 );
+    fprintf( out_control->log, "Box written\n" );
+    fflush( out_control->log );
+
+    /*write atom lines to buffer*/
+    for ( i = 0; i < system->N; i++)
+    {
+        p_atom = &(system->atoms[i]);
+        strncpy(name, p_atom->name, 8);
+        name[8] = 0; /* strncpy does not terminate when the source fills all 8 bytes */
+        Trim_Spaces(name);
+        snprintf( line, PDB_ATOM_FORMAT_O_LENGTH + 1, PDB_ATOM_FORMAT_O,
+                 "ATOM  ", workspace->orig_id[i], p_atom->name, ' ', "REX", ' ', 1, ' ',
+                 p_atom->x[0], p_atom->x[1], p_atom->x[2],
+                 1.0, 0.0, "0", name, "  " );
+        fprintf( stderr, "PDB NAME <%s>\n", p_atom->name ); /* NOTE(review): unconditional per-atom debug output - confirm it is intended */
+        strncpy( buffer + i * PDB_ATOM_FORMAT_O_LENGTH, line,
+                 PDB_ATOM_FORMAT_O_LENGTH );
+    }
+
+    buffer_len = system->N * PDB_ATOM_FORMAT_O_LENGTH;
+    buffer[buffer_len] = 0; /* in bounds: buffer_req == buffer_len + 1 */
+    fprintf( pdb, "%s", buffer );
+    fclose( pdb );
+
+    /* Writing connect information */
+    /*
+    for(i=0; i < system->N; i++) {
+      count = 0;
+      for(j = Start_Index(i, bonds); j < End_Index(i, bonds); ++j) {
+        bo = bonds->bond_list[j].bo_data.BO;
+        if (bo > 0.3) {
+          connect[count] = bonds->bond_list[j].nbr+1;
+          count++;
+        }
+      }
+
+      fprintf( out_control->pdb, "%6s%5d", "CONECT", i+1 );
+      for( k=0; k < count; k++ )
+        fprintf( out_control->pdb, "%5d", connect[k] );
+      fprintf( out_control->pdb, "\n" );
+    }
+    */
+
+    free(buffer);
+    free(line);
+
+    return SUCCESS;
+}
+
+/* Release the line, backup and token buffers that Read_BGF allocates */
+static void Free_BGF_Buffers( char *line, char *backup, char **tokens )
+{
+    int i;
+
+    for ( i = 0; i < MAX_TOKENS; i++ )
+    {
+        free( tokens[i] );
+    }
+    free( tokens );
+    free( backup );
+    free( line );
+}
+
+
+/* Read atoms, box (CRYSTX) and bond restrictions (CONECT) from a bgf file; FAILURE on a read error, else SUCCESS */
+char Read_BGF( char* bgf_file, reax_system* system, control_params *control,
+               simulation_data *data, static_storage *workspace )
+{
+    FILE *bgf;
+    char **tokens;
+    char *line, *backup;
+    char descriptor[10], serial[10];
+    char atom_name[10], res_name[10], res_seq[10];
+    char s_x[12], s_y[12], s_z[12];
+    char occupancy[10], temp_factor[10];
+    char element[10], charge[10];
+    char chain_id;
+    char s_a[12], s_b[12], s_c[12], s_alpha[12], s_beta[12], s_gamma[12];
+    char *endptr = NULL;
+    int  i, atom_cnt, token_cnt, bgf_serial, ratom = 0, read_error;
+
+    /* open biograf file */
+    if ( (bgf = fopen( bgf_file, "r" )) == NULL )
+    {
+        fprintf( stderr, "Error opening the bgf file!\n" );
+        exit( FILE_NOT_FOUND );
+    }
+
+    /* allocate memory for tokenizing biograf file lines */
+    line   = (char*)  malloc( sizeof(char)  * MAX_LINE );
+    backup = (char*)  malloc( sizeof(char)  * MAX_LINE );
+    tokens = (char**) malloc( sizeof(char*) * MAX_TOKENS );
+    if ( line == NULL || backup == NULL || tokens == NULL )
+    {
+        fprintf( stderr, "Read_BGF: not enough memory! terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+    for ( i = 0; i < MAX_TOKENS; i++ )
+    {
+        if ( (tokens[i] = (char*) malloc( sizeof(char) * MAX_TOKEN_LEN )) == NULL )
+        {
+            fprintf( stderr, "Read_BGF: not enough memory! terminating...\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+    }
+
+    /* count number of atoms in the bgf file */
+    system->N = 0;
+    line[0] = 0;
+
+    while ( fgets( line, MAX_LINE, bgf ) )
+    {
+        tokens[0][0] = 0;
+        token_cnt = Tokenize( line, &tokens );
+
+        if ( !strcmp( tokens[0], "ATOM" ) || !strcmp( tokens[0], "HETATM" ) )
+        {
+            (system->N)++;
+        }
+
+        line[0] = 0;
+    }
+    if ( ferror ( bgf ) )
+    {
+        fclose( bgf );
+        Free_BGF_Buffers( line, backup, tokens );
+        return FAILURE;
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "system->N: %d\n", system->N );
+#endif
+
+    fclose( bgf );
+
+    /* memory allocations for atoms, atom maps, bond restrictions */
+//    system->atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
+//
+//    workspace->map_serials = (int*) calloc( MAX_ATOM_ID, sizeof(int) );
+//    for ( i = 0; i < MAX_ATOM_ID; ++i )
+//    {
+//        workspace->map_serials[i] = -1;
+//    }
+//
+//    workspace->orig_id = (int*) calloc( system->N, sizeof(int) );
+//    workspace->restricted  = (int*) calloc( system->N, sizeof(int) );
+//    workspace->restricted_list = (int**) calloc( system->N, sizeof(int*) );
+//    for ( i = 0; i < system->N; ++i )
+//    {
+//        workspace->restricted_list[i] = (int*) calloc( MAX_RESTRICT, sizeof(int) );
+//    }
+
+    //TODO: setup similar for BGF
+//    Count_PDB_Atoms( pdb, system );
+    if ( PreAllocate_Space( system, control, workspace ) == FAILURE )
+    {
+        fprintf( stderr, "PreAllocate_Space: not enough memory!" );
+        fprintf( stderr, "terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* start reading and processing bgf file */
+    if ( (bgf = fopen( bgf_file, "r" )) == NULL )
+    {
+        fprintf( stderr, "Error opening the bgf file!\n" );
+        exit( FILE_NOT_FOUND );
+    }
+    atom_cnt = 0;
+    token_cnt = 0;
+
+    while ( fgets( line, MAX_LINE, bgf ) )
+    {
+        /* read new line and tokenize it */
+        strncpy( backup, line, MAX_LINE - 1 );
+        backup[MAX_LINE - 1] = 0; /* strncpy leaves a maximum-length line unterminated */
+        token_cnt = Tokenize( line, &tokens );
+
+        /* process new line; the same exact-match test as the counting pass is used
+           so that atom_cnt can never exceed the system->N that sized the storage */
+        if ( !strcmp( tokens[0], "ATOM" ) || !strcmp( tokens[0], "HETATM" ) )
+        {
+            /* bgf atom and hetatm records are cut at the same columns:
+               (7x,i5,1x,a5,1x,a3,1x,a1,1x,a5,3f10.5,1x,a5,i3,i2,1x,f8.5) */
+            strncpy( &descriptor[0], backup, 6 );
+            descriptor[6] = 0;
+            strncpy( &serial[0], backup + 7, 5 );
+            serial[5] = 0;
+            strncpy( &atom_name[0], backup + 13, 5 );
+            atom_name[5] = 0;
+            strncpy( &res_name[0], backup + 19, 3 );
+            res_name[3] = 0;
+            chain_id = backup[23];
+            strncpy( &res_seq[0], backup + 25, 5 );
+            res_seq[5] = 0;
+            strncpy( &s_x[0], backup + 30, 10 );
+            s_x[10] = 0;
+            strncpy( &s_y[0], backup + 40, 10 );
+            s_y[10] = 0;
+            strncpy( &s_z[0], backup + 50, 10 );
+            s_z[10] = 0;
+            strncpy( &element[0], backup + 61, 5 );
+            element[5] = 0;
+            strncpy( &occupancy[0], backup + 66, 3 );
+            occupancy[3] = 0;
+            strncpy( &temp_factor[0], backup + 69, 2 );
+            temp_factor[2] = 0;
+            strncpy( &charge[0], backup + 72, 8 );
+            charge[8] = 0;
+
+            /* add to mapping */
+            bgf_serial = strtod( &serial[0], &endptr );
+            Check_Input_Range( bgf_serial, 0, MAX_ATOM_ID, "Invalid bgf serial" );
+            workspace->map_serials[ bgf_serial ] = atom_cnt;
+            workspace->orig_id[ atom_cnt ] = bgf_serial;
+            // fprintf( stderr, "map %d --> %d\n", bgf_serial, atom_cnt );
+
+            /* copy atomic positions */
+            system->atoms[atom_cnt].x[0] = strtod( &s_x[0], &endptr );
+            system->atoms[atom_cnt].x[1] = strtod( &s_y[0], &endptr );
+            system->atoms[atom_cnt].x[2] = strtod( &s_z[0], &endptr );
+
+            /* atom name and type */
+            strcpy( system->atoms[atom_cnt].name, atom_name );
+            Trim_Spaces( element );
+            system->atoms[atom_cnt].type =
+                Get_Atom_Type( &(system->reaxprm), element );
+
+            /* fprintf( stderr,
+            "a:%3d(%1d) c:%10.5f%10.5f%10.5f q:%10.5f occ:%s temp:%s seg_id:%s element:%s\n",
+             atom_cnt, system->atoms[ atom_cnt ].type,
+             system->atoms[ atom_cnt ].x[0],
+             system->atoms[ atom_cnt ].x[1], system->atoms[ atom_cnt ].x[2],
+             system->atoms[ atom_cnt ].q, occupancy, temp_factor,
+             seg_id, element ); */
+
+            atom_cnt++;
+        }
+        else if (!strncmp( tokens[0], "CRYSTX", 6 ))
+        {
+            sscanf( backup, BGF_CRYSTX_FORMAT,
+                    &descriptor[0],
+                    &s_a[0],
+                    &s_b[0],
+                    &s_c[0],
+                    &s_alpha[0],
+                    &s_beta[0],
+                    &s_gamma[0] );
+
+            /* Compute full volume tensor from the angles */
+            Setup_Box( atof(s_a),  atof(s_b), atof(s_c),
+                                 atof(s_alpha), atof(s_beta), atof(s_gamma),
+                                 &(system->box) );
+        }
+        else if (!strncmp( tokens[0], "CONECT", 6 ))
+        {
+            /* check number of restrictions */
+            Check_Input_Range( token_cnt - 2, 0, MAX_RESTRICT,
+                               "CONECT line exceeds max restrictions allowed.\n" );
+
+            /* read bond restrictions; a CONECT line whose own serial is unknown is
+               skipped, otherwise its restrictions would land on a stale ratom */
+            if ( is_Valid_Serial( workspace, bgf_serial = atoi(tokens[1]) ) )
+            {
+                ratom = workspace->map_serials[ bgf_serial ];
+
+                workspace->restricted[ ratom ] = token_cnt - 2;
+                for ( i = 2; i < token_cnt; ++i )
+                {
+                    if ( is_Valid_Serial( workspace, bgf_serial = atoi(tokens[i]) ) )
+                    {
+                        workspace->restricted_list[ ratom * system->N + (i - 2) ] = /* NOTE(review): row stride is system->N - confirm against the allocation */
+                            workspace->map_serials[ bgf_serial ];
+                    }
+                }
+            }
+
+            /* fprintf( stderr, "restriction on %d:", ratom );
+            for( i = 0; i < workspace->restricted[ ratom ]; ++i )
+             fprintf( stderr, "  %d", workspace->restricted_list[ratom][i] );
+             fprintf( stderr, "\n" ); */
+        }
+
+        /* clear previous input line */
+        line[0] = 0;
+
+        for ( i = 0; i < token_cnt; ++i )
+        {
+            tokens[i][0] = 0;
+        }
+    }
+    /* remember a read error, then release the file and the buffers on every path */
+    read_error = ferror( bgf );
+    fclose( bgf );
+    Free_BGF_Buffers( line, backup, tokens );
+    if ( read_error )
+    {
+        return FAILURE;
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "bgf file read\n" );
+#endif
+
+    return SUCCESS;
+}
diff --git a/PuReMD-GPU/src/pdb_tools.h b/PuReMD-GPU/src/geo_tools.h
similarity index 84%
rename from PuReMD-GPU/src/pdb_tools.h
rename to PuReMD-GPU/src/geo_tools.h
index 12518fc2daaecd735f5cb0f781a0f1c72e504aa7..4c44e3081e6105947b610916e95210e123b4a7d9 100644
--- a/PuReMD-GPU/src/pdb_tools.h
+++ b/PuReMD-GPU/src/geo_tools.h
@@ -1,9 +1,10 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
@@ -18,13 +19,20 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __PDB_TOOLS_H_
-#define __PDB_TOOLS_H_
+#ifndef __GEO_TOOLS_H_
+#define __GEO_TOOLS_H_
 
 #include "mytypes.h"
 
-/*
-PDB format :
+// CUSTOM_BOXGEO: BOXGEO box_x box_y box_z  angle1 angle2 angle3
+#define CUSTOM_BOXGEO_FORMAT " %s %lf %lf %lf %lf %lf %lf"
+// CUSTOM ATOM: serial element name x y z
+#define CUSTOM_ATOM_FORMAT " %d %s %s %lf %lf %lf"
+
+char Read_Geo( char*, reax_system*, control_params*,
+        simulation_data*, static_storage* );
+
+/* PDB format :
 http://www.rcsb.org/pdb/file_formats/pdb/pdbguide2.2/guide2.2_frame.html
 
 #define PDB_ATOM_FORMAT   "%6s%5d%4s%c%4s%c%4d%c%8s%8s%8s%6s%6s%4s%2s%2s\n"
@@ -94,24 +102,28 @@ COLUMNS       DATA TYPE       FIELD         DEFINITION
 67 - 70      Integer         z             Z value
 */
 
-//#define PDB_ATOM_FORMAT "ATOM  %4d%4s%c%3s%c%4d%c%8.3f%8.3f%8.3f%6.2f%6.2f%-4s%2s%2s\n"
+//#define PDB_ATOM_FORMAT
+//"ATOM  %4d%4s%c%3s%c%4d%c%8.3f%8.3f%8.3f%6.2f%6.2f%-4s%2s%2s\n"
 
 #define PDB_ATOM_FORMAT   "%6s%5d%4s%c%4s%c%4d%c%8s%8s%8s%6s%6s%4s%2s%2s\n"
+#define PDB_ATOM_FORMAT_LENGTH 71
 #define PDB_HETATM_FORMAT "%6s%5d%4s%c%4s%c%4d%c%8s%8s%8s%6s%6s%2s%2s\n"
 #define PDB_CONECT_FORMAT "%6s%5d%5d%5d%5d%5d\n"
 #define PDB_CRYST1_FORMAT "%6s%9s%9s%9s%7s%7s%7s%11s%4s\n"
 
 #define PDB_ATOM_FORMAT_O "%6s%5d %4s%c%3s %c%4d%c   %8.3f%8.3f%8.3f%6.2f%6.2f      %-4s%2s%2s\n"
+#define PDB_ATOM_FORMAT_O_LENGTH 81
 #define PDB_CRYST1_FORMAT_O "%6s%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f%11s%4d\n"
 
 #define BGF_CRYSTX_FORMAT "%8s%11s%11s%11s%11s%11s%11s"
 
 char Read_PDB( char*, reax_system*, control_params*,
-               simulation_data*, static_storage* );
+        simulation_data*, static_storage* );
+
 char Read_BGF( char*, reax_system*, control_params*,
-               simulation_data*, static_storage* );
+        simulation_data*, static_storage* );
 
-char Write_PDB( reax_system*, control_params*, simulation_data*,
-                static_storage*, list*, output_controls* );
+char Write_PDB( reax_system*, list*, simulation_data*,
+        control_params*, static_storage*, output_controls* );
 
 #endif
diff --git a/PuReMD-GPU/src/grid.c b/PuReMD-GPU/src/grid.c
index fb09b409194a84b1646da3b779aad8b547ff9db3..2077d56080799f3674eecb3038a0f30bd318bd67 100644
--- a/PuReMD-GPU/src/grid.c
+++ b/PuReMD-GPU/src/grid.c
@@ -1,28 +1,29 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
 #include "grid.h"
 
+#include "index_utils.h"
 #include "reset_utils.h"
 #include "vector.h"
-#include "index_utils.h"
 
 
 int Estimate_GCell_Population( reax_system* system )
@@ -34,36 +35,45 @@ int Estimate_GCell_Population( reax_system* system )
     g = &( system->g );
     Reset_Grid( g );
 
-    for( l = 0; l < system->N; l++ ) {
+    for ( l = 0; l < system->N; l++ )
+    {
         i = (int)(system->atoms[l].x[0] * g->inv_len[0]);
         j = (int)(system->atoms[l].x[1] * g->inv_len[1]);
         k = (int)(system->atoms[l].x[2] * g->inv_len[2]);
         g->top[index_grid_3d (i, j, k, g)]++;
-        // fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n", 
+        // fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n",
         // l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2],
         // i, j, k );
     }
 
     max_atoms = 0;
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ )
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
                 if( max_atoms < g->top[index_grid_3d (i, j, k, g)] )
+                {
                     max_atoms = g->top[index_grid_3d (i, j, k, g)];  
+                }
+            }
+        }
+    }
 
-    return MAX(max_atoms*SAFE_ZONE, MIN_GCELL_POPL); 
+    return MAX(max_atoms * SAFE_ZONE, MIN_GCELL_POPL);
 }
 
 
 void Allocate_Space_for_Grid( reax_system *system )
 {
     int i, j, k, l;
-    grid *g = &(system->g);
-
-    int total = g->ncell[0] * g->ncell[1] * g->ncell[2];
+    grid *g;
+    int total;
 
     g = &(system->g);
-    g->max_nbrs = (2*g->spread[0]+1) * (2*g->spread[1]+1) * (2*g->spread[2]+1)+3; 
+    g->max_nbrs = (2 * g->spread[0] + 1) * (2 * g->spread[1] + 1) * (2 * g->spread[2] + 1) + 3;
+    total = g->ncell[0] * g->ncell[1] * g->ncell[2];
 
     /* allocate space for the new grid */
     g->top = (int*) calloc( total, sizeof( int ));
@@ -73,10 +83,14 @@ void Allocate_Space_for_Grid( reax_system *system )
     g->nbrs = (ivec*) calloc( total * g->max_nbrs, sizeof( ivec ));
     g->nbrs_cp = (rvec*) calloc( total * g->max_nbrs, sizeof( rvec ));
 
-    for( i = 0; i < g->ncell[0]; i++ ) {
-        for( j = 0; j < g->ncell[1]; j++ ) {
-            for( k = 0; k < g->ncell[2]; k++ ) {
-                for( l = 0; l < g->max_nbrs; ++l ){ 
+    for( i = 0; i < g->ncell[0]; i++ )
+    {
+        for( j = 0; j < g->ncell[1]; j++ )
+        {
+            for( k = 0; k < g->ncell[2]; k++ )
+            {
+                for( l = 0; l < g->max_nbrs; ++l )
+                { 
                     g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][0] = -1;
                     g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][1] = -1;
                     g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][2] = -1;
@@ -110,49 +124,74 @@ int Shift(int p, int dp, int dim, grid *g )
     int dim_len = 0;
     int newp = p + dp;
 
-    switch( dim ) {
-        case 0: dim_len = g->ncell[0];
-            break;
-        case 1: dim_len = g->ncell[1];
-            break;
-        case 2: dim_len = g->ncell[2];
+    switch ( dim )
+    {
+    case 0:
+        dim_len = g->ncell[0];
+        break;
+    case 1:
+        dim_len = g->ncell[1];
+        break;
+    case 2:
+        dim_len = g->ncell[2];
+    }
+
+    while ( newp < 0 )
+    {
+        newp = newp + dim_len;
+    }
+    while ( newp >= dim_len )
+    {
+        newp = newp - dim_len;
     }
 
-    while( newp < 0 )        newp = newp + dim_len;
-    while( newp >= dim_len ) newp = newp - dim_len;
     return newp;
 }
 
 
 /* finds the closest point between two grid cells denoted by c1 and c2.
    periodic boundary conditions are taken into consideration as well. */
-void Find_Closest_Point( grid *g, int c1x, int c1y, int c1z, 
-        int c2x, int c2y, int c2z, rvec closest_point )
+void Find_Closest_Point( grid *g, int c1x, int c1y, int c1z,
+                         int c2x, int c2y, int c2z, rvec closest_point )
 {
     int  i, d;
     ivec c1 = { c1x, c1y, c1z };
     ivec c2 = { c2x, c2y, c2z };
 
-    for( i = 0; i < 3; i++ ) {
-        if( g->ncell[i] < 5 ) {
+    for ( i = 0; i < 3; i++ )
+    {
+        if ( g->ncell[i] < 5 )
+        {
             closest_point[i] = NEG_INF - 1.;
             continue;
         }
 
         d = c2[i] - c1[i];
-        if( abs(d) <= g->ncell[i] / 2 ) {
-            if( d > 0 )
+        if ( abs(d) <= g->ncell[i] / 2 )
+        {
+            if ( d > 0 )
+            {
                 closest_point[i] = c2[i] * g->len[i];
+            }
             else if ( d == 0 )
+            {
                 closest_point[i] = NEG_INF - 1.;
+            }
             else
+            {
                 closest_point[i] = ( c2[i] + 1 ) * g->len[i];
+            }
         }
-        else {
-            if( d > 0 )
+        else
+        {
+            if ( d > 0 )
+            {
                 closest_point[i] = ( c2[i] - g->ncell[i] + 1 ) * g->len[i];
-            else    
+            }
+            else
+            {
                 closest_point[i] = ( c2[i] + g->ncell[i] ) * g->len[i];
+            }
         }
     }
 }
@@ -168,29 +207,36 @@ void Find_Neighbor_GridCells( grid *g )
     rvec *cp_stack;
 
     /* pick up a cell in the grid */
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
                 nbrs_stack = &( g->nbrs[ index_grid_nbrs (i, j, k, 0, g) ] );
                 cp_stack = &( g->nbrs_cp[ index_grid_nbrs (i, j, k, 0, g) ] );
                 stack_top = 0;
                 //fprintf( stderr, "grid1: %d %d %d\n", i, j, k );
 
                 /* choose an unmarked neighbor cell*/
-                for( di = -g->spread[0]; di <= g->spread[0]; di++ ) {
+                for ( di = -g->spread[0]; di <= g->spread[0]; di++ )
+                {
                     x = Shift( i, di, 0, g );
 
-                    for( dj = -g->spread[1]; dj <= g->spread[1]; dj++ ) {
+                    for ( dj = -g->spread[1]; dj <= g->spread[1]; dj++ )
+                    {
                         y = Shift( j, dj, 1, g );
 
-                        for( dk = -g->spread[2]; dk <= g->spread[2]; dk++ ) {
+                        for ( dk = -g->spread[2]; dk <= g->spread[2]; dk++ )
+                        {
                             z = Shift( k, dk, 2, g );
                             //fprintf( stderr, "\tgrid2: %d %d %d\n", x, y, z );
 
-                            if( !g->mark[ index_grid_3d (x, y, z, g) ] ) {
+                            if( !g->mark[ index_grid_3d (x, y, z, g) ] )
+                            {
                                 /*(di < 0 || // 9 combinations
-                                  (di == 0 && dj < 0) || // 3 combinations
-                                  (di == 0 && dj == 0 && dk < 0) ) )*/ 
+                                 (di == 0 && dj < 0) || // 3 combinations
+                                 (di == 0 && dj == 0 && dk < 0) ) )*/
                                 /* put the neighbor cell into the stack and mark it */
                                 nbrs_stack[stack_top][0] = x;
                                 nbrs_stack[stack_top][1] = y;
@@ -198,8 +244,8 @@ void Find_Neighbor_GridCells( grid *g )
                                 g->mark[ index_grid_3d(x,y,z,g) ] = 1;
 
                                 Find_Closest_Point( g, i, j, k, x, y, z, cp_stack[stack_top] );
-                                //fprintf( stderr, "\tcp: %lf %lf %lf\n", 
-                                // cp_stack[stack_top][0], cp_stack[stack_top][1], 
+                                //fprintf( stderr, "\tcp: %lf %lf %lf\n",
+                                // cp_stack[stack_top][0], cp_stack[stack_top][1],
                                 // cp_stack[stack_top][2]);
                                 stack_top++;
                             }
@@ -220,6 +266,8 @@ void Find_Neighbor_GridCells( grid *g )
                 nbrs_stack[stack_top][2] = -1;
                 Reset_Marks( g, nbrs_stack, stack_top );
             }
+        }
+    }
 }
 
 
@@ -234,9 +282,13 @@ void Setup_Grid( reax_system* system )
     /* determine number of grid cells in each direction */
     ivec_rScale( ncell, 1. / g->cell_size, my_box->box_norms );
 
-    for( d = 0; d < 3; ++d )
-        if( ncell[d] <= 0 )
+    for ( d = 0; d < 3; ++d )
+    {
+        if ( ncell[d] <= 0 )
+        {
             ncell[d] = 1;
+        }
+    }
 
     /* find the number of grid cells */
     g->total = ncell[0] * ncell[1] * ncell[2];
@@ -270,25 +322,34 @@ void Update_Grid( reax_system* system )
     /* determine number of grid cells in each direction */
     ivec_rScale( ncell, 1. / g->cell_size, my_box->box_norms );
 
-    for( d = 0; d < 3; ++d )
-        if( ncell[d] == 0 )
+    for ( d = 0; d < 3; ++d )
+    {
+        if ( ncell[d] == 0 )
+        {
             ncell[d] = 1;
+        }
+    }
 
-    if( ivec_isEqual( ncell, g->ncell ) ) {/* ncell are unchanged */
+    if ( ivec_isEqual( ncell, g->ncell ) ) /* ncell are unchanged */
+    {
         /* update cell lengths */
         rvec_iDivide( g->len, my_box->box_norms, g->ncell );
         rvec_Invert( g->inv_len, g->len );
 
         /* update closest point distances between gcells */
-        for( i = 0; i < g->ncell[0]; i++ )
-            for( j = 0; j < g->ncell[1]; j++ )
-                for( k = 0; k < g->ncell[2]; k++ ) {
+        for ( i = 0; i < g->ncell[0]; i++ )
+        {
+            for ( j = 0; j < g->ncell[1]; j++ )
+            {
+                for ( k = 0; k < g->ncell[2]; k++ )
+                {
                     nbrs = &( g->nbrs[ index_grid_nbrs (i, j, k, 0, g) ] );
                     nbrs_cp = &( g->nbrs_cp[ index_grid_nbrs (i, j, k, 0, g) ] );
                     //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
 
                     itr = 0;
-                    while( nbrs[itr][0] >= 0 ){
+                    while ( nbrs[itr][0] >= 0 )
+                    {
                         x = nbrs[itr][0];
                         y = nbrs[itr][1];
                         z = nbrs[itr][2];
@@ -297,9 +358,12 @@ void Update_Grid( reax_system* system )
                         ++itr;
                     }
                 }
+            }
+        }
     }
-    else{  /* at least one of ncell has changed */
-        Deallocate_Grid_Space( g );    
+    else   /* at least one of ncell has changed */
+    {
+        Deallocate_Grid_Space( g );
         /* update number of grid cells */
         g->total = ncell[0] * ncell[1] * ncell[2];
         ivec_Copy( g->ncell, ncell );
@@ -311,10 +375,10 @@ void Update_Grid( reax_system* system )
         Find_Neighbor_GridCells( g );
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "updated grid: " );
-        fprintf( stderr, "ncell[%d %d %d] ", 
-                g->ncell[0], g->ncell[1], g->ncell[2] );
-        fprintf( stderr, "len[%5.2f %5.2f %5.2f] ", 
-                g->len[0], g->len[1], g->len[2] );
+        fprintf( stderr, "ncell[%d %d %d] ",
+                 g->ncell[0], g->ncell[1], g->ncell[2] );
+        fprintf( stderr, "len[%5.2f %5.2f %5.2f] ",
+                 g->len[0], g->len[1], g->len[2] );
         fprintf( stderr, "g->max_atoms = %d\n", g->max_atoms );
 #endif
     }
@@ -328,40 +392,59 @@ void Bin_Atoms( reax_system* system, static_storage *workspace )
     grid *g = &( system->g );
 
     Reset_Grid( g );
-
-    for( l = 0; l < system->N; l++ ) {
+    for ( l = 0; l < system->N; l++ )
+    {
         i = (int)(system->atoms[l].x[0] * g->inv_len[0]);
         j = (int)(system->atoms[l].x[1] * g->inv_len[1]);
         k = (int)(system->atoms[l].x[2] * g->inv_len[2]);
 
 #ifdef __BNVT_FIX__
-        if (i >= g->ncell[0]) i = g->ncell[0]-1;
-        if (j >= g->ncell[1]) j = g->ncell[1]-1;
-        if (k >= g->ncell[2]) k = g->ncell[2]-1;
+        if (i >= g->ncell[0])
+        {
+            i = g->ncell[0]-1;
+        }
+        if (j >= g->ncell[1])
+        {
+            j = g->ncell[1]-1;
+        }
+        if (k >= g->ncell[2])
+        {
+            k = g->ncell[2]-1;
+        }
 #endif
 
         g->atoms[ index_grid_atoms (i,j,k,g->top[ index_grid_3d (i,j,k,g) ], g) ] = l;
         g->top[index_grid_3d (i,j,k,g) ]++;
 
-        //fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n", 
-        //l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2],
-        //i, j, k );
+        // fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n",
+        // l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2],
+        // i, j, k );
     }
 
     max_atoms = 0;
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ )
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
                 if( max_atoms < g->top[ index_grid_3d (i, j, k, g) ] )
+                {
                     max_atoms = g->top[ index_grid_3d (i, j, k, g) ];  
+                }
+            }
+        }
+    }
 
     /* check if current gcell->max_atoms is safe */
-    if( max_atoms >= g->max_atoms * SAFE_ZONE ) 
-        workspace->realloc.gcell_atoms = MAX(max_atoms*SAFE_ZONE,MIN_GCELL_POPL); 
+    if ( max_atoms >= g->max_atoms * SAFE_ZONE )
+    {
+        workspace->realloc.gcell_atoms = MAX(max_atoms * SAFE_ZONE, MIN_GCELL_POPL);
+    }
 }
 
 
-inline void reax_atom_Copy( reax_atom *dest, reax_atom *src )
+static inline void reax_atom_Copy( reax_atom *dest, reax_atom *src )
 {
     dest->type = src->type;
     rvec_Copy( dest->x, src->x );
@@ -370,30 +453,37 @@ inline void reax_atom_Copy( reax_atom *dest, reax_atom *src )
 }
 
 
-void Copy_Storage( reax_system *system, static_storage *workspace, 
-        int top, int old_id, int old_type, 
-        int *num_H, real *v, real *s, real *t, 
-        int *orig_id, rvec *f_old )
+void Copy_Storage( reax_system *system, static_storage *workspace,
+                   int top, int old_id, int old_type,
+                   int *num_H, real *v, real *s, real *t,
+                   int *orig_id, rvec *f_old )
 {
     int i;
 
-    for( i = 0; i < RESTART+1; ++i )
+    for ( i = 0; i < RESTART + 1; ++i )
+    {
         v[ index_wkspace_sys (i,top, system->N) ] = workspace->v[ index_wkspace_sys (i,old_id, system->N) ];
+    }
 
-    for( i = 0; i < 3; ++i ) {
-        s[ index_wkspace_sys (i,top, system->N) ] = workspace->s[ index_wkspace_sys (i,old_id, system->N) ];
-        t[ index_wkspace_sys (i,top, system->N) ] = workspace->t[ index_wkspace_sys (i,old_id, system->N) ];
+    for ( i = 0; i < 3; ++i )
+    {
+        s[ index_wkspace_sys(i,top, system->N) ] = workspace->s[ index_wkspace_sys(i,old_id, system->N) ];
+        t[ index_wkspace_sys(i,top, system->N) ] = workspace->t[ index_wkspace_sys(i,old_id, system->N) ];
     }
 
     orig_id[top]  = workspace->orig_id[old_id];
 
-    workspace->Hdia_inv[top] = 1. / system->reaxprm.sbp[ old_type ].eta;
     workspace->b_s[top] = -system->reaxprm.sbp[ old_type ].chi;
-    workspace->b_t[top] = -1.0;          
+    workspace->b_t[top] = -1.0;
 
-    if( system->reaxprm.sbp[ old_type ].p_hbond == 1 ) // H atom
+    if ( system->reaxprm.sbp[ old_type ].p_hbond == 1 ) // H atom
+    {
         workspace->hbond_index[top] = (*num_H)++;
-    else workspace->hbond_index[top] = -1;
+    }
+    else
+    {
+        workspace->hbond_index[top] = -1;
+    }
 
     rvec_Copy( f_old[top], workspace->f_old[old_id] );
 }
@@ -404,12 +494,12 @@ void Free_Storage( static_storage *workspace )
     free( workspace->v );
     free( workspace->s );
     free( workspace->t );
-    free( workspace->orig_id );  
+    free( workspace->orig_id );
 }
 
 
-void Assign_New_Storage( static_storage *workspace, 
-        real *v, real *s, real *t, 
+void Assign_New_Storage( static_storage *workspace,
+        real *v, real *s, real *t,
         int *orig_id, rvec *f_old )
 {
     workspace->v = v;
@@ -425,14 +515,20 @@ void Assign_New_Storage( static_storage *workspace,
 
 void Cluster_Atoms( reax_system *system, static_storage *workspace )
 {
-    int         i, j, k, l, top, old_id, num_H = 0;
+    int         i, j, k, l, top, old_id, num_H;
     reax_atom  *old_atom;
-    grid       *g = &( system->g );
-    reax_atom  *new_atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
-    int        *orig_id = (int  *) calloc( system->N, sizeof( int ) );
+    grid       *g;
+    reax_atom  *new_atoms;
+    int        *orig_id ;
     real       *v;
     real       *s, *t;
-    rvec       *f_old = (rvec*) calloc( system->N, sizeof(rvec) );
+    rvec       *f_old;
+
+    num_H = 0;
+    g = &( system->g );
+    new_atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
+    orig_id = (int  *) calloc( system->N, sizeof( int ) );
+    f_old = (rvec*) calloc( system->N, sizeof(rvec) );
 
     s = (real*) calloc( 3, sizeof( real ) * system->N );
     t = (real*) calloc( 3, sizeof( real ) * system->N );
@@ -440,24 +536,30 @@ void Cluster_Atoms( reax_system *system, static_storage *workspace )
 
     top = 0;
 
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
                 g->start[ index_grid_3d (i, j, k, g) ] = top;
 
-                for( l = 0; l < g->top[ index_grid_3d (i, j, k, g) ]; ++l ) {
+                for( l = 0; l < g->top[ index_grid_3d (i, j, k, g) ]; ++l )
+                {
                     old_id   = g->atoms[ index_grid_atoms (i, j, k, l, g) ];
                     old_atom = &( system->atoms[old_id] );
                     // fprintf( stderr, "%d <-- %d\n", top, old_id );
 
                     reax_atom_Copy( &(new_atoms[top]), old_atom );
-                    Copy_Storage( system, workspace, top, old_id, old_atom->type, 
-                            &num_H, v, s, t, orig_id, f_old );
+                    Copy_Storage( system, workspace, top, old_id, old_atom->type,
+                                  &num_H, v, s, t, orig_id, f_old );
                     ++top;
                 }
 
                 g->end[ index_grid_3d (i, j, k, g) ] = top;
             }
+        }
+    }
 
 
     free( system->atoms );
diff --git a/PuReMD-GPU/src/init_md.c b/PuReMD-GPU/src/init_md.c
index 2a2ce1270e2c694722e489b9a3f38f8dd48177a1..0f873f440456316f86c038ad31098f966ad17aec 100644
--- a/PuReMD-GPU/src/init_md.c
+++ b/PuReMD-GPU/src/init_md.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -25,8 +26,8 @@
 #include "forces.h"
 #include "grid.h"
 #include "index_utils.h"
-#include "lin_alg.h"
 #include "integrate.h"
+#include "lin_alg.h"
 #include "neighbors.h"
 #include "list.h"
 #include "lookup.h"
@@ -34,21 +35,20 @@
 #include "reset_utils.h"
 #include "system_props.h"
 #include "traj.h"
+#include "tool_box.h"
 #include "vector.h"
 
 
-void Generate_Initial_Velocities(reax_system *system, real T )
+void Generate_Initial_Velocities( reax_system *system, real T )
 {
     int i;
     real scale, norm;
 
 
-    if( T <= 0.1 )
+    if ( T <= 0.1 )
     {
-        for ( i = 0; i < system->N; i++ )
-        {
+        for (i = 0; i < system->N; i++)
             rvec_MakeZero( system->atoms[i].v );
-        }
 
 #if defined(DEBUG)
         fprintf( stderr, "no random velocities...\n" );
@@ -56,73 +56,74 @@ void Generate_Initial_Velocities(reax_system *system, real T )
     }
     else
     {
-        for( i = 0; i < system->N; i++ )
+        for ( i = 0; i < system->N; i++ )
         {
             rvec_Random( system->atoms[i].v );
 
             norm = rvec_Norm_Sqr( system->atoms[i].v );
-            scale = SQRT( system->reaxprm.sbp[ system->atoms[i].type ].mass * 
-                    norm / (3.0 * K_B * T) );
+            scale = SQRT( system->reaxprm.sbp[ system->atoms[i].type ].mass *
+                          norm / (3.0 * K_B * T) );
 
-            rvec_Scale( system->atoms[i].v, 1.0/scale, system->atoms[i].v );
+            rvec_Scale( system->atoms[i].v, 1.0 / scale, system->atoms[i].v );
 
-            /*
-               fprintf( stderr, "v = %f %f %f\n", 
-               system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
-               fprintf( stderr, "scale = %f\n", scale );
-               fprintf( stderr, "v = %f %f %f\n",
-               system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
-             */
+            /*fprintf( stderr, "v = %f %f %f\n",
+            system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
+            fprintf( stderr, "scale = %f\n", scale );
+            fprintf( stderr, "v = %f %f %f\n",
+            system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);*/
         }
     }
 }
 
 
-void Init_System( reax_system *system, control_params *control, 
+void Init_System( reax_system *system, control_params *control,
         simulation_data *data )
 {
     int i;
     rvec dx;
 
-    if( !control->restart )
+    if ( !control->restart )
     {
         Reset_Atoms( system );
     }
 
     Compute_Total_Mass( system, data );
-
     Compute_Center_of_Mass( system, data, stderr );
 
     /* reposition atoms */
     // just fit the atoms to the periodic box
-    if( control->reposition_atoms == 0 )
+    if ( control->reposition_atoms == 0 )
     {
         rvec_MakeZero( dx );
     }
     // put the center of mass to the center of the box
-    else if( control->reposition_atoms == 1 )
+    else if ( control->reposition_atoms == 1 )
     {
         rvec_Scale( dx, 0.5, system->box.box_norms );
         rvec_ScaledAdd( dx, -1., data->xcm );
     }
     // put the center of mass to the origin
-    else if( control->reposition_atoms == 2 ) {
+    else if ( control->reposition_atoms == 2 )
+    {
         rvec_Scale( dx, -1., data->xcm );
     }
-    else {
+    else
+    {
         fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. Terminating...\n" );
         exit( UNKNOWN_OPTION );
     }
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
-        /*fprintf( stderr, "%6d%2d%8.3f%8.3f%8.3f\n", 
-          i, system->atoms[i].type, 
+        /*fprintf( stderr, "%6d%2d%8.3f%8.3f%8.3f\n",
+          i, system->atoms[i].type,
           system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );*/
     }
 
     /* Initialize velocities so that desired init T can be attained */
-    if( !control->restart || (control->restart && control->random_vel) )  {
+    if ( !control->restart || (control->restart && control->random_vel) )
+    {
         Generate_Initial_Velocities( system, control->T_init );
     }
 
@@ -130,96 +131,153 @@ void Init_System( reax_system *system, control_params *control,
 }
 
 
-void Init_Simulation_Data( reax_system *system, control_params *control, 
-        simulation_data *data, output_controls *out_control, 
-        evolve_function *Evolve )
+void Init_Simulation_Data( reax_system *system, control_params *control,
+        simulation_data *data, output_controls *out_control, evolve_function *Evolve )
 {
 
     Reset_Simulation_Data( data );
 
-    if( !control->restart )  
+    if ( !control->restart )
+    {
         data->step = data->prev_steps = 0;
+    }
 
-    switch( control->ensemble ) {
-        case NVE:
-            data->N_f = 3 * system->N;
-            *Evolve = Velocity_Verlet_NVE;
-            break;
+    switch ( control->ensemble )
+    {
+    case NVE:
+        data->N_f = 3 * system->N;
+        *Evolve = Velocity_Verlet_NVE;
+        break;
 
 
-        case NVT:
-            data->N_f = 3 * system->N + 1;
-            //control->Tau_T = 100 * data->N_f * K_B * control->T_final;
-            if( !control->restart || (control->restart && control->random_vel) ) {
-                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
-                        data->N_f * K_B * control->T );
-                data->therm.v_xi = data->therm.G_xi * control->dt;
-                data->therm.v_xi_old = 0;
-                data->therm.xi = 0;
+    case NVT:
+        data->N_f = 3 * system->N + 1;
+        //control->Tau_T = 100 * data->N_f * K_B * control->T_final;
+        if ( !control->restart || (control->restart && control->random_vel) )
+        {
+            data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin -
+                                                 data->N_f * K_B * control->T );
+            data->therm.v_xi = data->therm.G_xi * control->dt;
+            data->therm.v_xi_old = 0;
+            data->therm.xi = 0;
 #if defined(DEBUG_FOCUS)
-                fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
-                        data->therm.G_xi, control->Tau_T, data->E_Kin, 
-                        data->N_f, data->therm.v_xi );
+            fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
+                     data->therm.G_xi, control->Tau_T, data->E_Kin,
+                     data->N_f, data->therm.v_xi );
 #endif
-            }
+        }
 
-            *Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
-            break;
+        *Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
+        break;
 
 
-        case NPT: // Anisotropic NPT
-            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
-            exit( UNKNOWN_OPTION );
-            data->N_f = 3 * system->N + 9;
-            if( !control->restart ) {
-                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
-                        data->N_f * K_B * control->T );
-                data->therm.v_xi = data->therm.G_xi * control->dt;
-                data->iso_bar.eps = 0.33333 * log(system->box.volume);
-                //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P));
-                //Compute_Pressure( system, data, workspace );
-            }
-            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
-            break;
+    case NPT: // Anisotropic NPT
+        fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
+        exit( UNKNOWN_OPTION );
+        data->N_f = 3 * system->N + 9;
+        if ( !control->restart )
+        {
+            data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin -
+                                                 data->N_f * K_B * control->T );
+            data->therm.v_xi = data->therm.G_xi * control->dt;
+            data->iso_bar.eps = 0.33333 * log(system->box.volume);
+            //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P));
+            //Compute_Pressure( system, data, workspace );
+        }
+        *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
+        break;
 
 
-        case sNPT: // Semi-Isotropic NPT
-            data->N_f = 3 * system->N + 4;
-            *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
-            break;
+    case sNPT: // Semi-Isotropic NPT
+        data->N_f = 3 * system->N + 4;
+        *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
+        break;
 
 
-        case iNPT: // Isotropic NPT
-            data->N_f = 3 * system->N + 2;
-            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
-            break;
+    case iNPT: // Isotropic NPT
+        data->N_f = 3 * system->N + 2;
+        *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
+        break;
 
-        case bNVT: //berendensen NVT
-            data->N_f = 3 * system->N + 1; 
-            *Evolve = Velocity_Verlet_Berendsen_NVT;
-            break;
+    case bNVT:
+        data->N_f = 3 * system->N + 1;
+        *Evolve = Velocity_Verlet_Berendsen_NVT;
+        fprintf (stderr, " Initializing Velocity_Verlet_Berendsen_NVT .... \n");
+        break;
 
-        default:
-            break;
+    default:
+        break;
     }
 
     Compute_Kinetic_Energy( system, data );
 
-    /* init timing info for the host*/
+    /* init timing info */
     data->timing.start = Get_Time( );
     data->timing.total = data->timing.start;
     data->timing.nbrs = 0;
     data->timing.init_forces = 0;
     data->timing.bonded = 0;
     data->timing.nonb = 0;
-    data->timing.QEq = 0;
-    data->timing.matvecs = 0;
+    data->timing.QEq = ZERO;
+    data->timing.QEq_sort_mat_rows = ZERO;
+    data->timing.pre_comp = ZERO;
+    data->timing.pre_app = ZERO;
+    data->timing.solver_iters = 0;
+    data->timing.solver_spmv = ZERO;
+    data->timing.solver_vector_ops = ZERO;
+    data->timing.solver_orthog = ZERO;
+    data->timing.solver_tri_solve = ZERO;
 }
 
 
-void Init_Workspace( reax_system *system, control_params *control, 
+/* Compute the taper polynomial coefficients control->Tap0..Tap7 from r_low and r_cut */
+void Init_Taper( control_params *control )
+{
+    real d1, d7;
+    real swa, swa2, swa3;
+    real swb, swb2, swb3;
+    /* swa / swb: lower / upper taper radius, read from control->r_low / control->r_cut */
+    swa = control->r_low;
+    swb = control->r_cut;
+
+    if ( fabs( swa ) > 0.01 )
+    {
+        fprintf( stderr, "Warning: non-zero value for lower Taper-radius cutoff\n" );
+    }
+
+    if ( swb < 0 )
+    {
+        fprintf( stderr, "Negative value for upper Taper-radius cutoff\n" );
+        exit( INVALID_INPUT );
+    }
+    else if ( swb < 5 )
+    {
+        fprintf( stderr, "Warning: low value for upper Taper-radius cutoff:%f\n",
+                swb );
+    }
+    /* d7 = (swb - swa)^7 is the common denominator of every coefficient below */
+    d1 = swb - swa;
+    d7 = POW( d1, 7.0 );
+    swa2 = SQR( swa );
+    swa3 = CUBE( swa );
+    swb2 = SQR( swb );
+    swb3 = CUBE( swb );
+    /* Tap<k> multiplies r^k in (140 / d7) * Integral_r^swb (s - swa)^3 (swb - s)^3 ds */
+    control->Tap7 =  20.0 / d7;
+    control->Tap6 = -70.0 * (swa + swb) / d7;
+    control->Tap5 =  84.0 * (swa2 + 3.0 * swa * swb + swb2) / d7;
+    control->Tap4 = -35.0 * (swa3 + 9.0 * swa2 * swb + 9.0 * swa * swb2 + swb3 ) / d7;
+    control->Tap3 = 140.0 * (swa3 * swb + 3.0 * swa2 * swb2 + swa * swb3 ) / d7;
+    control->Tap2 = -210.0 * (swa3 * swb2 + swa2 * swb3) / d7;
+    control->Tap1 = 140.0 * swa3 * swb3 / d7;
+    control->Tap0 = (-35.0 * swa3 * swb2 * swb2 + 21.0 * swa2 * swb3 * swb2 -
+                     7.0 * swa * swb3 * swb3 + swb3 * swb3 * swb ) / d7; /* swa*swb^6 term is -7 */
+}
+
+
+void Init_Workspace( reax_system *system, control_params *control,
         static_storage *workspace )
-{  
+{
     int i;
 
     /* Allocate space for hydrogen bond list */
@@ -231,35 +289,27 @@ void Init_Workspace( reax_system *system, control_params *control,
     workspace->Deltap_boc       = (real *) malloc( system->N * sizeof( real ) );
     workspace->dDeltap_self     = (rvec *) malloc( system->N * sizeof( rvec ) );
 
-    workspace->Delta          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Delta_lp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Delta            = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Delta_lp         = (real *) malloc( system->N * sizeof( real ) );
     workspace->Delta_lp_temp    = (real *) malloc( system->N * sizeof( real ) );
-    workspace->dDelta_lp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->dDelta_lp        = (real *) malloc( system->N * sizeof( real ) );
     workspace->dDelta_lp_temp   = (real *) malloc( system->N * sizeof( real ) );
     workspace->Delta_e          = (real *) malloc( system->N * sizeof( real ) );
     workspace->Delta_boc        = (real *) malloc( system->N * sizeof( real ) );
-    workspace->nlp          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->nlp_temp          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Clp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->nlp              = (real *) malloc( system->N * sizeof( real ) );
+    workspace->nlp_temp         = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Clp              = (real *) malloc( system->N * sizeof( real ) );
     workspace->CdDelta          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->vlpex          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->vlpex            = (real *) malloc( system->N * sizeof( real ) );
 
     /* QEq storage */
-    //workspace->H        = NULL;
-    //workspace->L        = NULL;
-    //workspace->U        = NULL;
-    //
-    workspace->H.start        = NULL;
-    workspace->L.start        = NULL;
-    workspace->U.start        = NULL;
-
-    workspace->H.entries         = NULL;
-    workspace->L.entries         = NULL;
-    workspace->U.entries        = NULL;
-
+    workspace->H        = NULL;
+    workspace->H_sp     = NULL;
+    workspace->L        = NULL;
+    workspace->U        = NULL;
+    workspace->Hdia_inv = NULL;
     workspace->droptol  = (real *) calloc( system->N, sizeof( real ) );
     workspace->w        = (real *) calloc( system->N, sizeof( real ) );
-    workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) );
     workspace->b        = (real *) calloc( system->N * 2, sizeof( real ) );
     workspace->b_s      = (real *) calloc( system->N, sizeof( real ) );
     workspace->b_t      = (real *) calloc( system->N, sizeof( real ) );
@@ -273,25 +323,27 @@ void Init_Workspace( reax_system *system, control_params *control,
     // workspace->s_oldest = (real *) calloc( system->N, sizeof( real ) );
     // workspace->t_oldest = (real *) calloc( system->N, sizeof( real ) );
 
-    for( i = 0; i < system->N; ++i ) {
-        workspace->Hdia_inv[i] = 1./system->reaxprm.sbp[system->atoms[i].type].eta;
+    for ( i = 0; i < system->N; ++i )
+    {
         workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
         workspace->b_t[i] = -1.0;
 
         workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
-        workspace->b[i+system->N] = -1.0;
+        workspace->b[i + system->N] = -1.0;
     }
 
+    //TODO: conditionally allocate based on solver selection
     /* GMRES storage */
-    workspace->y  = (real *)  calloc( RESTART+1, sizeof( real ) );
-    workspace->z  = (real *)  calloc( RESTART+1, sizeof( real ) );
-    workspace->g  = (real *)  calloc( RESTART+1, sizeof( real ) );
-    workspace->hs = (real *)  calloc( RESTART+1, sizeof( real ) );
-    workspace->hc = (real *)  calloc( RESTART+1, sizeof( real ) );
-
-    workspace->rn = (real *) calloc( (RESTART+1)*system->N*2, sizeof( real) );
-    workspace->v  = (real *) calloc( (RESTART+1)*system->N, sizeof( real) );
-    workspace->h  = (real *) calloc( (RESTART+1)*(RESTART+1), sizeof( real) );
+    workspace->y  = (real *)  calloc( RESTART + 1, sizeof( real ) );
+    //TODO: unused?
+    workspace->z  = (real *)  calloc( RESTART + 1, sizeof( real ) );
+    workspace->g  = (real *)  calloc( RESTART + 1, sizeof( real ) );
+    workspace->h  = (real *)  calloc( (RESTART + 1) * (RESTART + 1), sizeof( real ) );
+    workspace->hs = (real *)  calloc( RESTART + 1, sizeof( real ) );
+    workspace->hc = (real *)  calloc( RESTART + 1, sizeof( real ) );
+    //TODO: unused?
+    workspace->rn = (real *)  calloc( (RESTART + 1) * system->N * 2, sizeof( real ) );
+    workspace->v  = (real *)  calloc( (RESTART + 1) * system->N, sizeof( real ) );
 
     /* CG storage */
     workspace->r = (real *) calloc( system->N, sizeof( real ) );
@@ -304,20 +356,25 @@ void Init_Workspace( reax_system *system, control_params *control,
     workspace->f_old = (rvec *) malloc( system->N * sizeof( rvec ) );
     workspace->v_const = (rvec *) malloc( system->N * sizeof( rvec ) );
 
-
     /* storage for analysis */
-    if( control->molec_anal || control->diffusion_coef )
+    if ( control->molec_anal || control->diffusion_coef )
     {
         workspace->mark = (int *) calloc( system->N, sizeof(int) );
         workspace->old_mark = (int *) calloc( system->N, sizeof(int) );
     }
-    else 
+    else
+    {
         workspace->mark = workspace->old_mark = NULL;
+    }
 
-    if( control->diffusion_coef )
+    if ( control->diffusion_coef )
+    {
         workspace->x_old = (rvec *) calloc( system->N, sizeof( rvec ) );
-    else workspace->x_old = NULL;
-
+    }
+    else
+    {
+        workspace->x_old = NULL;
+    }
 
 #ifdef TEST_FORCES
     workspace->dDelta = (rvec *) malloc( system->N * sizeof( rvec ) );
@@ -344,9 +401,14 @@ void Init_Workspace( reax_system *system, control_params *control,
     workspace->realloc.gcell_atoms = -1;
 
     Reset_Workspace( system, workspace );
+
+    /* Initialize Taper function */
+    Init_Taper( control );
 }
 
-void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *data, list *slist, int N)
+
+void compare_far_neighbors( int *test, int *start, int *end,
+        far_neighbor_data *data, list *slist, int N )
 {
     int index = 0;
     int count = 0;
@@ -369,16 +431,19 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
        }
      */
 
-
-    for (i = 0; i < N; i++){
-        index = Start_Index (i, slist);
+    for (i = 0; i < N; i++)
+    {
+        index = Start_Index( i, slist );
         //fprintf (stderr, "GPU : Neighbors of atom --> %d (start: %d , end: %d )\n", i, start[i], end[i]);
 
-
-        for (j = start[i]; j < end[i]; j++){
+        for (j = start[i]; j < end[i]; j++)
+        {
             gpu = data[j];
 
-            if (i < data[j].nbr) continue;
+            if (i < data[j].nbr)
+            {
+                continue;
+            }
             /*
                if (i < data[j].nbr) {
             //fprintf (stderr, " atom %d and neighbor %d @ index %d\n", i, data[j].nbr, j);
@@ -386,7 +451,6 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
             int dest = i;
             int x;
 
-
             for (x = start[src]; x < end[src]; x++) {
             if (data[x].nbr != dest) continue;
 
@@ -431,9 +495,11 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
             cpu = slist->select.far_nbr_list[index];
             //if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ){
             //if ( (gpu->d != cpu->d) ){
-            if (  (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ||
-                    (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
-                    (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
+            if (  (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d)
+                    ||(cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1])
+                    || (cpu.dvec[2] != gpu.dvec[2]) || (cpu.rel_box[0] != gpu.rel_box[0])
+                    || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2]))
+            {
                 //if ( (gpu.dvec[0] != i) || (gpu.dvec[1] != i) ||(gpu.dvec[2] != i) ||
                 //        (gpu.rel_box[0] != i) || (gpu.rel_box[1] != i) ||(gpu.rel_box[2] != i) ) {
                 //if (memcmp (&gpu, &cpu, FAR_NEIGHBOR_SIZE - RVEC_SIZE - INT_SIZE )){
@@ -457,16 +523,16 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
                 count ++;
             }
 
-        //fprintf (stderr, "GPU (neighbor %d , d %d )\n", gpu->nbr, gpu->d);
-        index ++;
+            //fprintf (stderr, "GPU (neighbor %d , d %d )\n", gpu->nbr, gpu->d);
+            index ++;
         }
 
-        if (index != End_Index (i, slist))
+        if (index != End_Index( i, slist ))
         {
             fprintf( stderr,
                 "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n",
                  i, index, Start_Index (i, slist), End_Index(i, slist),
-                    start[i], end[i]);
+                    start[i], end[i] );
             exit( 10 );
         }
     }
@@ -518,112 +584,119 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
 }
 
 
-void Init_Lists( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
+void Init_Lists( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace,
         list **lists, output_controls *out_control )
 {
     int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop;
     int *hb_top, *bond_top;
 
-    real t_start, t_elapsed;
-
     num_nbrs = Estimate_NumNeighbors( system, control, workspace, lists );
-
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, "Serial NumNeighbors ---> %d \n", num_nbrs);
-#endif
-
-    if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS ) ) {
-        fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-        exit( INIT_ERR );
+    if ( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists) + FAR_NBRS) )
+    {
+        fprintf( stderr, "Problem in initializing far nbrs list. Terminating!\n" );
+        exit( CANNOT_INITIALIZE );
     }
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
-            num_nbrs * sizeof(far_neighbor_data) / (1024*1024) );
-#endif
-
-    t_start = Get_Time ();
-    Generate_Neighbor_Lists(system,control,data,workspace,lists,out_control);
-    t_elapsed = Get_Timing_Info ( t_start );
-
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " Timing Generate Neighbors %lf \n", t_elapsed );
+    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n",
+             num_nbrs * sizeof(far_neighbor_data) / (1024 * 1024) );
 #endif
 
+    Generate_Neighbor_Lists(system, control, data, workspace, lists, out_control);
     Htop = 0;
     hb_top = (int*) calloc( system->N, sizeof(int) );
     bond_top = (int*) calloc( system->N, sizeof(int) );
     num_3body = 0;
-    Estimate_Storage_Sizes( system, control, lists, 
+    Estimate_Storage_Sizes( system, control, lists,
             &Htop, hb_top, bond_top, &num_3body );
 
-    Allocate_Matrix( &(workspace->H), system->N, Htop );
+    if ( Allocate_Matrix( &(workspace->H), system->N, Htop ) == FAILURE )
+    {
+        fprintf( stderr, "Not enough space for init matrices. Terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
 
+    /* TODO: better estimate for H_sp?
+     *   If so, need to refactor Estimate_Storage_Sizes
+     *   to use various cut-off distances as parameters
+     *   (non-bonded, hydrogen, 3body, etc.) */
+    if ( Allocate_Matrix( &(workspace->H_sp), system->N, Htop ) == FAILURE )
+    {
+        fprintf( stderr, "Not enough space for init matrices. Terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "estimated storage - Htop: %d\n", Htop );
-    fprintf( stderr, "memory allocated: H = %ldMB\n", 
-            Htop * sizeof(sparse_matrix_entry) / (1024*1024) );
+    fprintf( stderr, "memory allocated: H = %ldMB\n",
+            Htop * sizeof(sparse_matrix_entry) / (1024 * 1024) );
 #endif
 
     workspace->num_H = 0;
-    if( control->hb_cut > 0 ) {
+    if ( control->hb_cut > 0 )
+    {
         /* init H indexes */
-        for( i = 0; i < system->N; ++i )
-            if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ) // H atom
+        for ( i = 0; i < system->N; ++i )
+        {
+            // H atom
+            if ( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 )
+            {
                 workspace->hbond_index[i] = workspace->num_H++;
-            else workspace->hbond_index[i] = -1;
-
-        Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index, 
-                hb_top, (*lists)+HBONDS );
-        num_hbonds = hb_top[system->N-1];
+            }
+            else
+            {
+                workspace->hbond_index[i] = -1;
+            }
+        }
 
-#ifdef __DEBUG_CUDA__
-        fprintf( stderr, "Serial num_hbonds: %d\n", num_hbonds );
-#endif
+        Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index,
+                hb_top, (*lists) + HBONDS );
+        num_hbonds = hb_top[system->N - 1];
 
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "estimated storage - num_hbonds: %d\n", num_hbonds );
-        fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
-                num_hbonds * sizeof(hbond_data) / (1024*1024) );
+        fprintf( stderr, "memory allocated: hbonds = %ldMB\n",
+                 num_hbonds * sizeof(hbond_data) / (1024 * 1024) );
 #endif
     }
 
     /* bonds list */
-    Allocate_Bond_List( system->N, bond_top, (*lists)+BONDS );
-    num_bonds = bond_top[system->N-1];
+    Allocate_Bond_List( system->N, bond_top, (*lists) + BONDS );
+    num_bonds = bond_top[system->N - 1];
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "estimated storage - num_bonds: %d\n", num_bonds );
-    fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
-            num_bonds * sizeof(bond_data) / (1024*1024) );
+    fprintf( stderr, "memory allocated: bonds = %ldMB\n",
+             num_bonds * sizeof(bond_data) / (1024 * 1024) );
 #endif
 
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " host num_3body : %d \n", num_3body);
-    fprintf (stderr, " host num_bonds : %d \n", num_bonds);
-#endif
+//fprintf (stderr, " **** sizeof 3 body : %d \n", sizeof (three_body_interaction_data));
+//fprintf (stderr, " **** num_3body : %d \n", num_3body);
+//fprintf (stderr, " **** num_bonds : %d \n", num_bonds);
 
     /* 3bodies list */
-    if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES )) {
+    if (!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists) + THREE_BODIES))
+    {
         fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-        exit( INIT_ERR );
+        exit( CANNOT_INITIALIZE );
     }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "estimated storage - num_3body: %d\n", num_3body );
-    fprintf( stderr, "memory allocated: 3-body = %ldMB\n", 
-            num_3body * sizeof(three_body_interaction_data) / (1024*1024) );
+    fprintf( stderr, "memory allocated: 3-body = %ldMB\n",
+             num_3body * sizeof(three_body_interaction_data) / (1024 * 1024) );
 #endif
 
 #ifdef TEST_FORCES
-    if(!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA )) {
+    if (!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA ))
+    {
         fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" );
-        exit( INIT_ERR );
+        exit( CANNOT_INITIALIZE );
     }
 
-    if( !Make_List( num_bonds, num_bonds*MAX_BONDS*3, TYP_DBO, (*lists)+DBO ) ) {
+    if ( !Make_List( num_bonds, num_bonds * MAX_BONDS * 3, TYP_DBO, (*lists) + DBO ) )
+    {
         fprintf( stderr, "Problem in initializing dBO list. Terminating!\n" );
-        exit( INIT_ERR );
+        exit( CANNOT_INITIALIZE );
     }
 #endif
 
@@ -632,83 +705,91 @@ void Init_Lists( reax_system *system, control_params *control,
 }
 
 
-void Init_Out_Controls(reax_system *system, control_params *control, 
+void Init_Out_Controls(reax_system *system, control_params *control,
         static_storage *workspace, output_controls *out_control)
 {
     char temp[1000];
 
     /* Init trajectory file */
-    if( out_control->write_steps > 0 ) { 
+    if ( out_control->write_steps > 0 )
+    {
         strcpy( temp, control->sim_name );
         strcat( temp, ".trj" );
         out_control->trj = fopen( temp, "w" );
         out_control->write_header( system, control, workspace, out_control );
     }
 
-    if( out_control->energy_update_freq > 0 ) {
+    if ( out_control->energy_update_freq > 0 )
+    {
         /* Init out file */
         strcpy( temp, control->sim_name );
         strcat( temp, ".out" );
         out_control->out = fopen( temp, "w" );
         fprintf( out_control->out, "%-6s%16s%16s%16s%11s%11s%13s%13s%13s\n",
-                "step", "total energy", "poten. energy", "kin. energy", 
-                "temp.", "target", "volume", "press.", "target" );
+                 "step", "total energy", "poten. energy", "kin. energy",
+                 "temp.", "target", "volume", "press.", "target" );
         fflush( out_control->out );
 
         /* Init potentials file */
         strcpy( temp, control->sim_name );
         strcat( temp, ".pot" );
         out_control->pot = fopen( temp, "w" );
-        fprintf( out_control->pot, 
-                "%-6s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s\n",
-                "step", "ebond", "eatom", "elp", "eang", "ecoa", "ehb", 
-                "etor", "econj", "evdw","ecoul", "epol" );
+        fprintf( out_control->pot,
+                 "%-6s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s\n",
+                 "step", "ebond", "eatom", "elp", "eang", "ecoa", "ehb",
+                 "etor", "econj", "evdw", "ecoul", "epol" );
         fflush( out_control->pot );
 
         /* Init log file */
         strcpy( temp, control->sim_name );
         strcat( temp, ".log" );
         out_control->log = fopen( temp, "w" );
-        fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s\n", 
-                "step", "total", "neighbors", "init", "bonded", 
-                "nonbonded", "QEq", "matvec" );
+        fprintf( out_control->log, "%-6s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
+                 "step", "total", "neighbors", "init", "bonded",
+                 "nonbonded", "QEq", "QEq Sort", "S iters", "Pre Comp", "Pre App",
+                 "S spmv", "S vec ops", "S orthog", "S tsolve" );
     }
 
     /* Init pressure file */
-    if( control->ensemble == NPT || 
-            control->ensemble == iNPT || 
-            control->ensemble == sNPT ) {
+    if ( control->ensemble == NPT ||
+            control->ensemble == iNPT ||
+            control->ensemble == sNPT )
+    {
         strcpy( temp, control->sim_name );
         strcat( temp, ".prs" );
         out_control->prs = fopen( temp, "w" );
         fprintf( out_control->prs, "%-6s%13s%13s%13s%13s%13s%13s%13s%13s\n",
-                "step", "norm_x", "norm_y", "norm_z", 
-                "press_x", "press_y", "press_z", "target_p", "volume" );
+                 "step", "norm_x", "norm_y", "norm_z",
+                 "press_x", "press_y", "press_z", "target_p", "volume" );
         fflush( out_control->prs );
     }
 
     /* Init molecular analysis file */
-    if( control->molec_anal ) {
+    if ( control->molec_anal )
+    {
         sprintf( temp, "%s.mol", control->sim_name );
         out_control->mol = fopen( temp, "w" );
-        if( control->num_ignored ) {
+        if ( control->num_ignored )
+        {
             sprintf( temp, "%s.ign", control->sim_name );
             out_control->ign = fopen( temp, "w" );
-        } 
+        }
     }
 
     /* Init electric dipole moment analysis file */
-    if( control->dipole_anal ) {
+    if ( control->dipole_anal )
+    {
         strcpy( temp, control->sim_name );
         strcat( temp, ".dpl" );
         out_control->dpl = fopen( temp, "w" );
-        fprintf( out_control->dpl, 
-                "Step      Molecule Count  Avg. Dipole Moment Norm\n" );
+        fprintf( out_control->dpl,
+                 "Step      Molecule Count  Avg. Dipole Moment Norm\n" );
         fflush( out_control->dpl );
     }
 
     /* Init diffusion coef analysis file */
-    if( control->diffusion_coef ) {
+    if ( control->diffusion_coef )
+    {
         strcpy( temp, control->sim_name );
         strcat( temp, ".drft" );
         out_control->drft = fopen( temp, "w" );
@@ -836,21 +917,22 @@ void Init_Out_Controls(reax_system *system, control_params *control,
 #endif
 
     /* Error handling */
-    /* if ( out_control->out == NULL || out_control->pot == NULL || 
-       out_control->log == NULL || out_control->mol == NULL || 
-       out_control->dpl == NULL || out_control->drft == NULL ||       
+    /* if ( out_control->out == NULL || out_control->pot == NULL ||
+       out_control->log == NULL || out_control->mol == NULL ||
+       out_control->dpl == NULL || out_control->drft == NULL ||
        out_control->pdb == NULL )
        {
        fprintf( stderr, "FILE OPEN ERROR. TERMINATING..." );
-       exit( CANNOT_OPEN_OUTFILE );
+       exit( CANNOT_OPEN_FILE );
        }*/
 }
 
 
-void Initialize(reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, list **lists, 
+void Initialize(reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
         output_controls *out_control, evolve_function *Evolve)
 {
+    real start, end;
     Randomize();
 
     Init_System( system, control, data );
@@ -870,10 +952,16 @@ void Initialize(reax_system *system, control_params *control,
     Init_Force_Test_Functions( );
 #endif
 
-    if( control->tabulate )
+    if ( control->tabulate )
+    {
+        start = Get_Time ();
         Make_LR_Lookup_Table( system, control );
+        end = Get_Timing_Info (start);
+
+        //fprintf (stderr, "Time for LR Lookup Table calculation is %f \n", end );
+    }
 
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "data structures have been initialized...\n" ); 
+    fprintf( stderr, "data structures have been initialized...\n" );
 #endif
 }
diff --git a/PuReMD-GPU/src/init_md.h b/PuReMD-GPU/src/init_md.h
index 8c23806594a8f2b107ddb884efbf68e7b5fe27ff..947d81e6e50e96f325742c6d024c6011a900152c 100644
--- a/PuReMD-GPU/src/init_md.h
+++ b/PuReMD-GPU/src/init_md.h
@@ -31,10 +31,10 @@ extern "C"  {
 void Initialize( reax_system*, control_params*, simulation_data*,
         static_storage*, list**, output_controls*, evolve_function* );
 
-void Generate_Initial_Velocities(reax_system *, real );
+void Generate_Initial_Velocities( reax_system *, real );
 
-void Init_Out_Controls(reax_system *, control_params *, static_storage *,
-        output_controls *);
+void Init_Out_Controls( reax_system *, control_params *, static_storage *,
+        output_controls * );
 
 #ifdef __cplusplus
 }
diff --git a/PuReMD-GPU/src/integrate.c b/PuReMD-GPU/src/integrate.c
index 482a9c89a302c052e9ac44ae2de446c61b1c6a3e..d65406f8824697a396c65337bd197ad771320e81 100644
--- a/PuReMD-GPU/src/integrate.c
+++ b/PuReMD-GPU/src/integrate.c
@@ -1,32 +1,32 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
 #include "integrate.h"
-
 #include "allocate.h"
 #include "box.h"
 #include "forces.h"
 #include "grid.h"
 #include "neighbors.h"
 #include "print_utils.h"
-#include "QEq.h"
+#include "qeq.h"
 #include "reset_utils.h"
 #include "restart.h"
 #include "system_props.h"
@@ -34,9 +34,10 @@
 #include "list.h"
 
 
-void Velocity_Verlet_NVE(reax_system* system, control_params* control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+
+void Velocity_Verlet_NVE(reax_system* system, control_params* control,
+                         simulation_data *data, static_storage *workspace,
+                         list **lists, output_controls *out_control )
 {
     int i, steps, renbr;
     real inv_m, dt, dt_sqr;
@@ -46,53 +47,50 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control,
     dt_sqr = SQR(dt);
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
-
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "step%d: ", data->step );
 #endif
 
-    for( i = 0; i < system->N; i++ ) {
+    for ( i = 0; i < system->N; i++ )
+    {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
-        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
-                0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledSum( dx, dt, system->atoms[i].v,
+                        0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
         Inc_on_T3( system->atoms[i].x, dx, &( system->box ) );
 
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
     }
-
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet1 - ");
 #endif
 
     Reallocate( system, workspace, lists, renbr );
     Reset( system, control, data, workspace, lists );
-    if( renbr )
-    {
-        Generate_Neighbor_Lists( system, control, data, workspace, 
-                lists, out_control );  
-    }
+    if ( renbr )
+        Generate_Neighbor_Lists( system, control, data, workspace,
+                                 lists, out_control );
     Compute_Forces( system, control, data, workspace, lists, out_control );
 
-    for( i = 0; i < system->N; i++ )
+    for ( i = 0; i < system->N; i++ )
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
     }
-
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet2\n");
 #endif
 }
 
 
-void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, 
-        control_params* control, 
-        simulation_data *data, 
-        static_storage *workspace, 
-        list **lists, 
+
+void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
+        control_params* control,
+        simulation_data *data,
+        static_storage *workspace,
+        list **lists,
         output_controls *out_control )
 {
     int i, itr, steps, renbr;
@@ -106,22 +104,17 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     therm = &( data->therm );
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
-
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "step%d: ", data->step );
 #endif
 
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " Entering Velocity_Verlet_Nose_Hoover_NVT_Klein:  coef to update velocity --> %6.10f\n", therm->v_xi_old);
-#endif
-
     /* Compute x(t + dt) and copy old forces */
-    for (i=0; i < system->N; i++)
+    for (i = 0; i < system->N; i++)
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
         rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, system->atoms[i].v,
-                0.5 * dt_sqr * inv_m * -F_CONV, system->atoms[i].f );
+                        0.5 * dt_sqr * inv_m * -F_CONV, system->atoms[i].f );
 
         Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
 
@@ -129,105 +122,88 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     }
     /* Compute xi(t + dt) */
     therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
-
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet1 - " );
 #endif
 
     Reallocate( system, workspace, lists, renbr );
     Reset( system, control, data, workspace, lists );
-
-    if( renbr )
-    {
-        Generate_Neighbor_Lists( system, control, data, workspace, 
-                lists, out_control );
-    }
-
+    if ( renbr )
+        Generate_Neighbor_Lists( system, control, data, workspace,
+                                 lists, out_control );
     /* Calculate Forces at time (t + dt) */
-    Compute_Forces( system,control,data, workspace, lists, out_control );
+    Compute_Forces( system, control, data, workspace, lists, out_control );
 
     /* Compute iteration constants for each atom's velocity */
-    for( i = 0; i < system->N; ++i )
+    for ( i = 0; i < system->N; ++i )
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
-        rvec_Scale( workspace->v_const[i], 
-                1.0 - 0.5 * dt * therm->v_xi, system->atoms[i].v );
-        rvec_ScaledAdd( workspace->v_const[i], 
-                0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] );
-        rvec_ScaledAdd( workspace->v_const[i], 
-                0.5 * dt * inv_m * -F_CONV, system->atoms[i].f );
+        rvec_Scale( workspace->v_const[i],
+                    1.0 - 0.5 * dt * therm->v_xi, system->atoms[i].v );
+        rvec_ScaledAdd( workspace->v_const[i],
+                        0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] );
+        rvec_ScaledAdd( workspace->v_const[i],
+                        0.5 * dt * inv_m * -F_CONV, system->atoms[i].f );
 #if defined(DEBUG)
-        fprintf( stderr, "atom%d: inv_m=%f, C1=%f, C2=%f, v_const=%f %f %f\n", 
-                i, inv_m, 1.0 - 0.5 * dt * therm->v_xi, 
-                0.5 * dt * inv_m * -F_CONV, workspace->v_const[i][0], 
-                workspace->v_const[i][1], workspace->v_const[i][2] );  
+        fprintf( stderr, "atom%d: inv_m=%f, C1=%f, C2=%f, v_const=%f %f %f\n",
+                 i, inv_m, 1.0 - 0.5 * dt * therm->v_xi,
+                 0.5 * dt * inv_m * -F_CONV, workspace->v_const[i][0],
+                 workspace->v_const[i][1], workspace->v_const[i][2] );
 #endif
     }
 
     v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
     E_kin_new = G_xi_new = v_xi_old = 0;
     itr = 0;
-    do {
-        itr++;      
+    do
+    {
+        itr++;
 
         /* new values become old in this iteration */
         v_xi_old = v_xi_new;
         coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
         E_kin_new = 0;
-
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, " *********** coef to update velocity --> %6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old);
-        //print_sys_atoms (system);
-#endif
-
-        for( i = 0; i < system->N; ++i )
+        for ( i = 0; i < system->N; ++i )
         {
             rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] );
 
-            E_kin_new += ( 0.5*system->reaxprm.sbp[system->atoms[i].type].mass * 
-                    rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
+            E_kin_new += ( 0.5 * system->reaxprm.sbp[system->atoms[i].type].mass *
+                           rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
 #if defined(DEBUG)
-            fprintf( stderr, "itr%d-atom%d: coef_v = %f, v_xi_old = %f\n", 
-                    itr, i, coef_v, v_xi_old );
+            fprintf( stderr, "itr%d-atom%d: coef_v = %f, v_xi_old = %f\n",
+                     itr, i, coef_v, v_xi_old );
 #endif
         }
 
-        G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - 
-                data->N_f * K_B * control->T );
+        G_xi_new = control->Tau_T * ( 2.0 * E_kin_new -
+                                      data->N_f * K_B * control->T );
         v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
-
 #if defined(DEBUG)
         fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n",
-                itr, G_xi_new, v_xi_new, v_xi_old );
+                 itr, G_xi_new, v_xi_new, v_xi_old );
 #endif
     }
-    while( fabs(v_xi_new - v_xi_old ) > 1e-5 );
+    while ( fabs(v_xi_new - v_xi_old ) > 1e-5 );
 
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " Iteration Count in NVE --> %d \n", itr );
-#endif
-
-#ifndef __BUILD_DEBUG__
     therm->v_xi_old = therm->v_xi;
     therm->v_xi = v_xi_new;
-    therm->G_xi = G_xi_new;  
-#endif 
-
-#if defined(DEBUG_FOCUS)  
-    fprintf( stderr,"vel scale\n" );
-#endif 
+    therm->G_xi = G_xi_new;
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "vel scale\n" );
+#endif
 }
 
 
-/* uses Berendsen-type coupling for both T and P. 
-   All box dimensions are scaled by the same amount, 
+
+/* uses Berendsen-type coupling for both T and P.
+   All box dimensions are scaled by the same amount,
    there is no change in the angles between axes. */
-void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system, 
-        control_params* control, 
+void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system,
+        control_params* control,
         simulation_data *data,
-        static_storage *workspace, 
-        list **lists, 
+        static_storage *workspace,
+        list **lists,
         output_controls *out_control )
 {
     int i, steps, renbr;
@@ -237,94 +213,102 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system,
     dt = control->dt;
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
-
 #if defined(DEBUG_FOCUS)
-    //fprintf( out_control->prs, 
-    //         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n", 
+    //fprintf( out_control->prs,
+    //         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n",
     //control->Tau_T, control->Tau_P, dt / control->Tau_T, dt / control->Tau_P );
     fprintf( stderr, "step %d: ", data->step );
 #endif
 
     /* velocity verlet, 1st part */
-    for( i = 0; i < system->N; i++ )
+    for ( i = 0; i < system->N; i++ )
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
         /* Compute x(t + dt) */
-        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
-                0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f );
+        rvec_ScaledSum( dx, dt, system->atoms[i].v,
+                        0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f );
         Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
         /* Compute v(t + dt/2) */
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * -F_CONV * inv_m * dt, system->atoms[i].f );
-        /*fprintf( stderr, "%6d   %15.8f %15.8f %15.8f   %15.8f %15.8f %15.8f\n", 
-          workspace->orig_id[i], 
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * -F_CONV * inv_m * dt, system->atoms[i].f );
+        /*fprintf( stderr, "%6d   %15.8f %15.8f %15.8f   %15.8f %15.8f %15.8f\n",
+          workspace->orig_id[i],
           system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2],
-          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0], 
-          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], 
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0],
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1],
           0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */
     }
-
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet1 - " );
 #endif
 
-    Reallocate( system, workspace, lists, renbr );  
+    Reallocate( system, workspace, lists, renbr );
     Reset( system, control, data, workspace, lists );
-    if( renbr ) {
+    if ( renbr )
+    {
         Update_Grid( system );
         Generate_Neighbor_Lists( system, control, data, workspace,
-                lists, out_control );
+                                 lists, out_control );
     }
     Compute_Forces( system, control, data, workspace, lists, out_control );
 
     /* velocity verlet, 2nd part */
-    for( i = 0; i < system->N; i++ ) {
+    for ( i = 0; i < system->N; i++ )
+    {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
         /* Compute v(t + dt) */
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
-        /* fprintf( stderr, "%6d   %15f %15f %15f   %15.8f %15.8f %15.8f\n", 
-           workspace->orig_id[i], 
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        /* fprintf( stderr, "%6d   %15f %15f %15f   %15.8f %15.8f %15.8f\n",
+           workspace->orig_id[i],
            system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
-           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0], 
-           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1], 
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0],
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1],
            0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[2] );*/
     }
-    //Compute_Kinetic_Energy( system, data );   
+    //TODO: this call was commented out in the GPU version; find out why (hence the HAVE_CUDA guard)
+#ifndef HAVE_CUDA
+    Compute_Kinetic_Energy( system, data );
+#endif
     Compute_Pressure_Isotropic( system, control, data, out_control );
 
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet2 - " );
 #endif
 
     /* pressure scaler */
     mu = POW( 1.0 + (dt / control->Tau_P[0]) * (data->iso_bar.P - control->P[0]),
-            1.0 / 3 );
-    if( mu < MIN_dV ) 
+              1.0 / 3 );
+    if ( mu < MIN_dV )
         mu = MIN_dV;
-    else if( mu > MAX_dV )
+    else if ( mu > MAX_dV )
         mu = MAX_dV;
 
     /* temperature scaler */
     lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-    if( lambda < MIN_dT )
+    if ( lambda < MIN_dT )
         lambda = MIN_dT;
     else if (lambda > MAX_dT )
         lambda = MAX_dT;
     lambda = SQRT( lambda );
 
     /* Scale velocities and positions at t+dt */
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         rvec_Scale( system->atoms[i].v, lambda, system->atoms[i].v );
-        /* IMPORTANT: What Adri does with scaling positions first to 
-           unit coordinates and then back to cartesian coordinates essentially 
-           is scaling the coordinates with mu^2. However, this causes unphysical 
+        /* IMPORTANT: What Adri does with scaling positions first to
+           unit coordinates and then back to cartesian coordinates essentially
+           is scaling the coordinates with mu^2. However, this causes unphysical
            modifications on the system because box dimensions
            are being scaled with mu! We need to discuss this with Adri! */
         rvec_Scale( system->atoms[i].x, mu, system->atoms[i].x );
     }
-    //Compute_Kinetic_Energy( system, data );
-#if defined(DEBUG_FOCUS)  
+    //TODO: this call was commented out in the GPU version; find out why (hence the HAVE_CUDA guard)
+#ifndef HAVE_CUDA
+    Compute_Kinetic_Energy( system, data );
+#endif
+
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "scaling - " );
 #endif
 
@@ -335,14 +319,14 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system,
 }
 
 
-/* uses Berendsen-type coupling for both T and P. 
-   All box dimensions are scaled by the same amount, 
+/* uses Berendsen-type coupling for both T and P.
+   All box dimensions are scaled by the same amount,
    there is no change in the angles between axes. */
-void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system, 
-        control_params* control, 
+void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system,
+        control_params* control,
         simulation_data *data,
-        static_storage *workspace, 
-        list **lists, 
+        static_storage *workspace,
+        list **lists,
         output_controls *out_control )
 {
     int i, d, steps, renbr;
@@ -352,120 +336,139 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system,
     dt = control->dt;
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
-
 #if defined(DEBUG_FOCUS)
-    //fprintf( out_control->prs, 
-    //         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n", 
+    //fprintf( out_control->prs,
+    //         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n",
     //control->Tau_T, control->Tau_P, dt / control->Tau_T, dt / control->Tau_P );
     fprintf( stderr, "step %d: ", data->step );
 #endif
 
     /* velocity verlet, 1st part */
-    for( i = 0; i < system->N; i++ ) {
-        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; 
+    for ( i = 0; i < system->N; i++ )
+    {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
         /* Compute x(t + dt) */
-        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
-                0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f );
+        rvec_ScaledSum( dx, dt, system->atoms[i].v,
+                        0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f );
         Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
         /* Compute v(t + dt/2) */
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * -F_CONV * inv_m * dt, system->atoms[i].f );
-        /*fprintf( stderr, "%6d   %15.8f %15.8f %15.8f   %15.8f %15.8f %15.8f\n", 
-          workspace->orig_id[i], 
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * -F_CONV * inv_m * dt, system->atoms[i].f );
+        /*fprintf( stderr, "%6d   %15.8f %15.8f %15.8f   %15.8f %15.8f %15.8f\n",
+          workspace->orig_id[i],
           system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2],
-          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0], 
-          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], 
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0],
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1],
           0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */
     }
-
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet1 - " );
 #endif
 
-    Reallocate( system, workspace, lists, renbr ); 
+    Reallocate( system, workspace, lists, renbr );
     Reset( system, control, data, workspace, lists );
-    if( renbr ) {
+    if ( renbr )
+    {
         Update_Grid( system );
-        Generate_Neighbor_Lists( system, control, data, workspace, 
-                lists, out_control );
+        Generate_Neighbor_Lists( system, control, data, workspace,
+                                 lists, out_control );
     }
     Compute_Forces( system, control, data, workspace, lists, out_control );
 
     /* velocity verlet, 2nd part */
-    for( i = 0; i < system->N; i++ ) {
+    for ( i = 0; i < system->N; i++ )
+    {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
         /* Compute v(t + dt) */
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
-        /* fprintf( stderr, "%6d   %15f %15f %15f   %15.8f %15.8f %15.8f\n", 
-           workspace->orig_id[i], 
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        /* fprintf( stderr, "%6d   %15f %15f %15f   %15.8f %15.8f %15.8f\n",
+           workspace->orig_id[i],
            system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
-           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0], 
-           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1], 
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0],
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1],
            0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[2] );*/
     }
-    //Compute_Kinetic_Energy( system, data );   
+    //TODO: this call was commented out in the GPU version; find out why (hence the HAVE_CUDA guard)
+#ifndef HAVE_CUDA
+    Compute_Kinetic_Energy( system, data );
+#endif
     Compute_Pressure_Isotropic( system, control, data, out_control );
-
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet2 - " );
 #endif
 
     /* pressure scaler */
-    for( d = 0; d < 3; ++d ){
-        mu[d] = POW( 1.0+(dt/control->Tau_P[d])*(data->tot_press[d]-control->P[d]),
-                1.0 / 3 );
-        if( mu[d] < MIN_dV ) 
+    for ( d = 0; d < 3; ++d )
+    {
+        mu[d] = POW( 1.0 + (dt / control->Tau_P[d]) * (data->tot_press[d] - control->P[d]),
+                     1.0 / 3 );
+        if ( mu[d] < MIN_dV )
+        {
             mu[d] = MIN_dV;
-        else if( mu[d] > MAX_dV )
+        }
+        else if ( mu[d] > MAX_dV )
+        {
             mu[d] = MAX_dV;
+        }
     }
 
     /* temperature scaler */
     lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-    if( lambda < MIN_dT )
+    if ( lambda < MIN_dT )
+    {
         lambda = MIN_dT;
+    }
     else if (lambda > MAX_dT )
+    {
         lambda = MAX_dT;
+    }
     lambda = SQRT( lambda );
 
     /* Scale velocities and positions at t+dt */
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         rvec_Scale( system->atoms[i].v, lambda, system->atoms[i].v );
-        /* IMPORTANT: What Adri does with scaling positions first to 
-           unit coordinates and then back to cartesian coordinates essentially 
-           is scaling the coordinates with mu^2. However, this causes unphysical 
+        /* IMPORTANT: What Adri does with scaling positions first to
+           unit coordinates and then back to cartesian coordinates essentially
+           is scaling the coordinates with mu^2. However, this causes unphysical
            modifications on the system because box dimensions
            are being scaled with mu! We need to discuss this with Adri! */
-        for( d = 0; d < 3; ++d )
+        for ( d = 0; d < 3; ++d )
             system->atoms[i].x[d] = system->atoms[i].x[d] * mu[d];
     }
-    //Compute_Kinetic_Energy( system, data );
-#if defined(DEBUG_FOCUS)  
+    //TODO: this call was commented out in the GPU version; find out why (hence the HAVE_CUDA guard)
+#ifndef HAVE_CUDA
+    Compute_Kinetic_Energy( system, data );
+#endif
+
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "scaling - " );
 #endif
 
     Update_Box_SemiIsotropic( &(system->box), mu );
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "updated box & grid\n" );
 #endif
 }
 
 
+
 /************************************************/
 /* BELOW FUNCTIONS ARE NOT BEING USED ANYMORE!  */
 /*                                              */
 /*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
 /*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
 /************************************************/
+
 #ifdef ANISOTROPIC
 
-void Velocity_Verlet_Nose_Hoover_NVT(reax_system* system, 
-        control_params* control, 
-        simulation_data *data,
-        static_storage *workspace, 
-        list **lists, 
-        output_controls *out_control )
+void Velocity_Verlet_Nose_Hoover_NVT(reax_system* system,
+                                     control_params* control,
+                                     simulation_data *data,
+                                     static_storage *workspace,
+                                     list **lists,
+                                     output_controls *out_control )
 {
     int i;
     real inv_m;
@@ -473,73 +476,77 @@ void Velocity_Verlet_Nose_Hoover_NVT(reax_system* system,
     real dt_sqr = SQR(dt);
     rvec dx;
 
-    for (i=0; i < system->N; i++)
+    for (i = 0; i < system->N; i++)
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
         // Compute x(t + dt)
-        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
-                0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledSum( dx, dt, system->atoms[i].v,
+                        0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
         Inc_on_T3_Gen( system->atoms[i].x, dx, &(system->box) );
 
         // Compute v(t + dt/2)
-        rvec_ScaledAdd( system->atoms[i].v, 
-                -0.5 * dt * data->therm.xi, system->atoms[i].v );
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledAdd( system->atoms[i].v,
+                        -0.5 * dt * data->therm.xi, system->atoms[i].v );
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
     }
 
     // Compute zeta(t + dt/2), E_Kininetic(t + dt/2)
     // IMPORTANT: What will be the initial value of zeta? and what is g?
-    data->therm.xi += 0.5 * dt * control->Tau_T  * 
-        ( 2.0 * data->E_Kin - data->N_f * K_B * control->T );
+    data->therm.xi += 0.5 * dt * control->Tau_T  *
+                      ( 2.0 * data->E_Kin - data->N_f * K_B * control->T );
 
     Reset( system, control, data, workspace );
-    fprintf(out_control->log,"reset-"); fflush( out_control->log );
+    fprintf(out_control->log, "reset-");
+    fflush( out_control->log );
 
-    Generate_Neighbor_Lists( system, control, data, workspace, 
-            lists, out_control );
-    fprintf(out_control->log,"nbrs-"); fflush( out_control->log );
+    Generate_Neighbor_Lists( system, control, data, workspace,
+                             lists, out_control );
+    fprintf(out_control->log, "nbrs-");
+    fflush( out_control->log );
 
     /* QEq( system, control, workspace, lists[FAR_NBRS], out_control );
        fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */
 
     Compute_Forces( system, control, data, workspace, lists, out_control );
-    fprintf(out_control->log,"forces\n"); fflush( out_control->log );
+    fprintf(out_control->log, "forces\n");
+    fflush( out_control->log );
 
-    //Compute_Kinetic_Energy( system, data );
+    //TODO: this call was commented out in the GPU version; find out why (hence the HAVE_CUDA guard)
+#ifndef HAVE_CUDA
+    Compute_Kinetic_Energy( system, data );
+#endif
 
-    for( i = 0; i < system->N; i++ )
+    for ( i = 0; i < system->N; i++ )
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
         // compute v(t + dt)
-        rvec_ScaledAdd( system->atoms[i].v, 
-                -0.5 * dt * data->therm.xi, system->atoms[i].v );
-        rvec_ScaledAdd( system->atoms[i].v, 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledAdd( system->atoms[i].v,
+                        -0.5 * dt * data->therm.xi, system->atoms[i].v );
+        rvec_ScaledAdd( system->atoms[i].v,
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
     }
 
     // Compute zeta(t + dt)
-    data->therm.xi += 0.5*dt * control->Tau_T  * ( 2.0 * data->E_Kin - 
-            data->N_f * K_B * control->T );
+    data->therm.xi += 0.5 * dt * control->Tau_T  * ( 2.0 * data->E_Kin -
+                      data->N_f * K_B * control->T );
 
-    fprintf( out_control->log,"Xi: %8.3f %8.3f %8.3f\n", 
-            data->therm.xi, data->E_Kin, data->N_f * K_B * control->T ); 
+    fprintf( out_control->log, "Xi: %8.3f %8.3f %8.3f\n",
+             data->therm.xi, data->E_Kin, data->N_f * K_B * control->T );
     fflush( out_control->log );
 }
 
 
-void Velocity_Verlet_Isotropic_NPT( reax_system* system, 
-        control_params* control, 
-        simulation_data *data,
-        static_storage *workspace, 
-        list **lists, 
+
+void Velocity_Verlet_Isotropic_NPT( reax_system* system, control_params* control,
+        simulation_data *data, static_storage *workspace, list **lists,
         output_controls *out_control )
 {
     int i, itr;
-    real deps, v_eps_new=0, v_eps_old=0, G_xi_new;
-    real dxi, v_xi_new=0, v_xi_old=0, a_eps_new;
+    real deps, v_eps_new = 0, v_eps_old = 0, G_xi_new;
+    real dxi, v_xi_new = 0, v_xi_old = 0, a_eps_new;
     real inv_m, exp_deps, inv_3V;
     real E_kin, P_int, P_int_const;
     real coef_v, coef_v_eps;
@@ -552,37 +559,37 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system,
 
     // Here we just calculate how much to increment eps, xi, v_eps, v_xi.
     // Commits are done after positions and velocities of atoms are updated
-    // because position, velocity updates uses v_eps, v_xi terms; 
-    // yet we need EXP( deps ) to be able to calculate 
-    // positions and velocities accurately.  
-    iso_bar->a_eps = control->Tau_P * 
-        ( 3.0 * box->volume * (iso_bar->P - control->P) + 
-          6.0 * data->E_Kin / data->N_f ) - iso_bar->v_eps * therm->v_xi;
+    // because position, velocity updates uses v_eps, v_xi terms;
+    // yet we need EXP( deps ) to be able to calculate
+    // positions and velocities accurately.
+    iso_bar->a_eps = control->Tau_P *
+            ( 3.0 * box->volume * (iso_bar->P - control->P) +
+            6.0 * data->E_Kin / data->N_f ) - iso_bar->v_eps * therm->v_xi;
     deps = dt * iso_bar->v_eps + 0.5 * dt_sqr * iso_bar->a_eps;
     exp_deps = EXP( deps );
 
-    therm->G_xi = control->Tau_T * ( 2.0 * data->E_Kin + 
-            SQR( iso_bar->v_eps ) / control->Tau_P - 
-            (data->N_f +1) * K_B * control->T );
+    therm->G_xi = control->Tau_T * ( 2.0 * data->E_Kin +
+            SQR( iso_bar->v_eps ) / control->Tau_P -
+            (data->N_f + 1) * K_B * control->T );
     dxi = therm->v_xi * dt + 0.5 * therm->G_xi * dt_sqr;
 
-    fprintf(out_control->log, "a: %12.6f   eps: %12.6f   deps: %12.6f\n", 
+    fprintf(out_control->log, "a: %12.6f   eps: %12.6f   deps: %12.6f\n",
             iso_bar->a_eps, iso_bar->v_eps, iso_bar->eps);
-    fprintf(out_control->log, "G: %12.6f   xi : %12.6f   dxi : %12.6f\n", 
+    fprintf(out_control->log, "G: %12.6f   xi : %12.6f   dxi : %12.6f\n",
             therm->G_xi, therm->v_xi, therm->xi );
 
     // Update positions and velocities
-    // NOTE: v_old, v_xi_old, v_eps_old are meant to be the old values 
+    // NOTE: v_old, v_xi_old, v_eps_old are meant to be the old values
     // in the iteration not the old values at time t or before!
-    for (i=0; i < system->N; i++)
+    for (i = 0; i < system->N; i++)
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
         // Compute x(t + dt)
-        rvec_ScaledSum( workspace->a[i], -F_CONV * inv_m, system->atoms[i].f, 
-                -( (2.0 + 3.0/data->N_f) * iso_bar->v_eps + therm->v_xi ),
-                system->atoms[i].v );
-        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
+        rvec_ScaledSum( workspace->a[i], -F_CONV * inv_m, system->atoms[i].f,
+                        -( (2.0 + 3.0 / data->N_f) * iso_bar->v_eps + therm->v_xi ),
+                        system->atoms[i].v );
+        rvec_ScaledSum( dx, dt, system->atoms[i].v,
                 0.5 * dt_sqr, workspace->a[i] );
         Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
         rvec_Scale( system->atoms[i].x, exp_deps, system->atoms[i].x );
@@ -597,39 +604,40 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system,
 
     // Calculate new forces, f(t + dt)
     Reset( system, control, data, workspace );
-    fprintf(out_control->log,"reset-"); fflush( out_control->log );
+    fprintf(out_control->log, "reset-");
+    fflush( out_control->log );
 
-    Generate_Neighbor_Lists( system, control, data, workspace, 
-            lists, out_control );
-    fprintf(out_control->log,"nbrs-"); fflush( out_control->log );
+    Generate_Neighbor_Lists( system, control, data, workspace,
+                             lists, out_control );
+    fprintf(out_control->log, "nbrs-");
+    fflush( out_control->log );
 
     /* QEq( system, control, workspace, lists[FAR_NBRS], out_control );
        fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */
 
     Compute_Forces( system, control, data, workspace, lists, out_control );
-    fprintf(out_control->log,"forces\n"); fflush( out_control->log );
-
+    fprintf(out_control->log, "forces\n");
+    fflush( out_control->log );
 
     // Compute iteration constants for each atom's velocity and for P_internal
     // Compute kinetic energy for initial velocities of the iteration
     P_int_const = E_kin = 0;
-    for( i = 0; i < system->N; ++i )
+    for ( i = 0; i < system->N; ++i )
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
-        rvec_ScaledSum( dv, 0.5 * dt, workspace->a[i], 
-                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_ScaledSum( dv, 0.5 * dt, workspace->a[i],
+                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
         rvec_Add( dv, system->atoms[i].v );
         rvec_Scale( workspace->v_const[i], exp_deps, dv );
 
-        P_int_const += ( -F_CONV * 
-                rvec_Dot( system->atoms[i].f, system->atoms[i].x ) );
+        P_int_const += ( -F_CONV *
+                         rvec_Dot( system->atoms[i].f, system->atoms[i].x ) );
 
-        E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass * 
-                rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
+        E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass *
+                  rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
     }
 
-
     // Compute initial p_int
     inv_3V = 1.0 / (3.0 * system->box.volume);
     P_int = inv_3V * ( 2.0 * E_kin + P_int_const );
@@ -645,42 +653,38 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system,
         v_xi_old = v_xi_new;
         v_eps_old = v_eps_new;
 
-
-        for( i = 0; i < system->N; ++i )
+        for ( i = 0; i < system->N; ++i )
         {
-            coef_v = 1.0 / (1.0 + 0.5 * dt * exp_deps * 
-                    ( (2.0 + 3.0/data->N_f) * v_eps_old + v_xi_old ) );
+            coef_v = 1.0 / (1.0 + 0.5 * dt * exp_deps *
+                    ( (2.0 + 3.0 / data->N_f) * v_eps_old + v_xi_old ) );
             rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] );
         }
 
-
         coef_v_eps = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
-        a_eps_new = 3.0 * control->Tau_P * 
-            ( system->box.volume * (P_int - control->P) + 2.0 * E_kin / data->N_f );
-        v_eps_new = coef_v_eps * ( iso_bar->v_eps + 
+        a_eps_new = 3.0 * control->Tau_P *
+                ( system->box.volume * (P_int - control->P) + 2.0 * E_kin / data->N_f );
+        v_eps_new = coef_v_eps * ( iso_bar->v_eps +
                 0.5 * dt * ( iso_bar->a_eps + a_eps_new ) );
 
-
-        G_xi_new = control->Tau_T * ( 2.0 * E_kin + 
-                SQR( v_eps_old ) / control->Tau_P - 
+        G_xi_new = control->Tau_T * ( 2.0 * E_kin +
+                SQR( v_eps_old ) / control->Tau_P -
                 (data->N_f + 1) * K_B * control->T );
         v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
 
-
         E_kin = 0;
-        for( i = 0; i < system->N; ++i )
-            E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass * 
-                    rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
-
-        P_int = inv_3V * ( 2.0*E_kin + P_int_const );
+        for ( i = 0; i < system->N; ++i )
+        {
+            E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass *
+                      rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
+        }
 
+        P_int = inv_3V * ( 2.0 * E_kin + P_int_const );
 
-        fprintf( out_control->log, 
-                "itr %d E_kin: %8.3f veps_n:%8.3f veps_o:%8.3f vxi_n:%8.3f vxi_o: %8.3f\n", 
-                itr, E_kin, v_eps_new, v_eps_old, v_xi_new, v_xi_old );
+        fprintf( out_control->log,
+               "itr %d E_kin: %8.3f veps_n:%8.3f veps_o:%8.3f vxi_n:%8.3f vxi_o: %8.3f\n",
+               itr, E_kin, v_eps_new, v_eps_old, v_xi_new, v_xi_old );
     }
-    while( fabs(v_eps_new - v_eps_old) + fabs(v_xi_new - v_xi_old) > 2e-3 );
-
+    while ( FABS(v_eps_new - v_eps_old) + fabs(v_xi_new - v_xi_old) > 2e-3 );
 
     therm->v_xi_old = therm->v_xi;
     therm->v_xi = v_xi_new;
@@ -690,36 +694,30 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system,
     iso_bar->v_eps = v_eps_new;
     iso_bar->a_eps = a_eps_new;
 
-    fprintf( out_control->log, "V: %8.3ff\tsides{%8.3f, %8.3f, %8.3f}\n", 
-            system->box.volume, 
-            system->box.box[0][0],system->box.box[1][1],system->box.box[2][2] );
-    fprintf(out_control->log,"eps:\ta- %8.3f  v- %8.3f  eps- %8.3f\n", 
+    fprintf( out_control->log, "V: %8.3ff\tsides{%8.3f, %8.3f, %8.3f}\n",
+             system->box.volume,
+             system->box.box[0][0], system->box.box[1][1], system->box.box[2][2] );
+    fprintf(out_control->log, "eps:\ta- %8.3f  v- %8.3f  eps- %8.3f\n",
             iso_bar->a_eps, iso_bar->v_eps, iso_bar->eps);
-    fprintf(out_control->log,"xi: \tG- %8.3f  v- %8.3f  xi - %8.3f\n", 
+    fprintf(out_control->log, "xi: \tG- %8.3f  v- %8.3f  xi - %8.3f\n",
             therm->G_xi, therm->v_xi, therm->xi);
 }
 
 #endif
 
 
-/* uses Berendsen-type coupling for both T and P. 
-   All box dimensions are scaled by the same amount, 
+/* uses Berendsen-type coupling for both T and P.
+   All box dimensions are scaled by the same amount,
    there is no change in the angles between axes. */
-void Velocity_Verlet_Berendsen_NVT( reax_system* system,
-        control_params* control,
-        simulation_data *data,
-        static_storage *workspace,
-        list **lists,
-        output_controls *out_control
-        )
+void Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
     int i, steps, renbr;
     real inv_m, dt, lambda;
     rvec dx;
     reax_atom *atom;
 
-    fprintf (stderr, " Velocity_Verlet_Berendsen_NVT: step :%d \n", data->step);
-
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "step%d\n", data->step );
 #endif
@@ -729,12 +727,19 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system,
     renbr = (steps % control->reneighbor == 0);
 
     /* velocity verlet, 1st part */
-    for( i = 0; i < system->N; i++ ) {
+    for ( i = 0; i < system->N; i++ )
+    {
         atom = &(system->atoms[i]);
         inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
         /* Compute x(t + dt) */
         rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
-        rvec_Add( atom->x, dx );
+
+        //TODO: used rvec_Add in GPU version -- which is correct?
+        /* bNVT fix - Metin's suggestion */
+        /* ORIGINAL CHANGE -- CHECK THE branch serial-bnvt for the fix */
+        //rvec_Add( atom->x, dx );
+        Inc_on_T3( atom->x, dx, &( system->box ) );
+
         /* Compute v(t + dt/2) */
         rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
     }
@@ -746,42 +751,50 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system,
     Reallocate( system, workspace, lists, renbr );
     Reset( system, control, data, workspace, lists );
 
-    if( renbr )
+    if ( renbr )
+    {
         Generate_Neighbor_Lists( system, control, data, workspace, lists, out_control );
+    }
 
-    Compute_Forces( system, control, data, workspace,
-            lists, out_control );
+    Compute_Forces( system, control, data, workspace, lists, out_control );
 
     /* velocity verlet, 2nd part */
-    for( i = 0; i < system->N; i++ ) {
+    for ( i = 0; i < system->N; i++ )
+    {
         atom = &(system->atoms[i]);
         inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
         /* Compute v(t + dt) */
         rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
     }
 
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf(stderr, "step%d: verlet2 done\n", data->step);
 #endif
 
     /* temperature scaler */
     Compute_Kinetic_Energy( system, data );
     lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-    if( lambda < MIN_dT )
+    if ( lambda < MIN_dT )
+    {
         lambda = MIN_dT;
+    }
     else if (lambda > MAX_dT )
+    {
         lambda = MAX_dT;
+    }
     lambda = SQRT( lambda );
 
+    fprintf( stderr, "step:%d lambda -> %f \n", data->step, lambda );
+
     /* Scale velocities and positions at t+dt */
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         atom = &(system->atoms[i]);
         rvec_Scale( atom->v, lambda, atom->v );
     }
     Compute_Kinetic_Energy( system, data );
 
-#if defined(DEBUG_FOCUS)  
-    fprintf( stderr, "step%d: scaled velocities\n",
-            data->step );
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "step%d: scaled velocities\n", data->step );
 #endif
 }
diff --git a/PuReMD-GPU/src/integrate.h b/PuReMD-GPU/src/integrate.h
index 6f5848f0de84e8a50ef2c5090194618b61f185fc..55b36c559f96d5e3d45ab69177e26f0d4136d8e1 100644
--- a/PuReMD-GPU/src/integrate.h
+++ b/PuReMD-GPU/src/integrate.h
@@ -23,29 +23,30 @@
 
 #include "mytypes.h"
 
+
 void Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*,
         static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Nose_Hoover_NVT( reax_system*, control_params*,
-        simulation_data*, static_storage*,
-        list**, output_controls* );
+        simulation_data*, static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*,
-        simulation_data*, static_storage*,
-        list**, output_controls* );
+        simulation_data*, static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Flexible_NPT( reax_system*, control_params*,
-        simulation_data*, static_storage*,
-        list**, output_controls* );
+        simulation_data*, static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Isotropic_NPT( reax_system*, control_params*,
-        simulation_data*, static_storage*,
-        list**, output_controls* );
+        simulation_data*, static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system*, control_params*,
-        simulation_data*, static_storage*,
-        list**, output_controls* );
+        simulation_data*, static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system*, control_params*,
-        simulation_data*,
-        static_storage*, list**,
-        output_controls* );
+        simulation_data*, static_storage*, list**, output_controls* );
+
 void Velocity_Verlet_Berendsen_NVT( reax_system* , control_params* ,
-        simulation_data *, static_storage *,
-        list **, output_controls * );
+        simulation_data *, static_storage *, list **, output_controls * );
+
 
 #endif
diff --git a/PuReMD-GPU/src/lin_alg.c b/PuReMD-GPU/src/lin_alg.c
index cb141d475b0e2cf702901ed551287e0e238cdcd6..1fc79f99f43e770a9e798082fec1e22c5ca4177f 100644
--- a/PuReMD-GPU/src/lin_alg.c
+++ b/PuReMD-GPU/src/lin_alg.c
@@ -1,319 +1,1654 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
 #include "lin_alg.h"
 
+#include "allocate.h"
 #include "list.h"
+#include "print_utils.h"
+#include "tool_box.h"
 #include "vector.h"
-#include "index_utils.h"
 
 
-void Sparse_MatVec( sparse_matrix *A, real *x, real *b )
+typedef enum
+{
+    LOWER = 0,
+    UPPER = 1,
+} TRIANGULARITY;
+
+
+/* global to make OpenMP shared (Sparse_MatVec) */
+#ifdef _OPENMP
+real *b_local = NULL;
+#endif
+/* global to make OpenMP shared (apply_preconditioner) */
+real *Dinv_L = NULL, *Dinv_U = NULL;
+/* global to make OpenMP shared (tri_solve_level_sched) */
+int levels = 1;
+int levels_L = 1, levels_U = 1;
+unsigned int *row_levels_L = NULL, *level_rows_L = NULL, *level_rows_cnt_L = NULL;
+unsigned int *row_levels_U = NULL, *level_rows_U = NULL, *level_rows_cnt_U = NULL;
+unsigned int *row_levels, *level_rows, *level_rows_cnt;
+unsigned int *top = NULL;
+/* global to make OpenMP shared (graph_coloring) */
+unsigned int *color = NULL;
+unsigned int *to_color = NULL;
+unsigned int *conflict = NULL;
+unsigned int *temp_ptr;
+unsigned int *recolor = NULL;
+unsigned int recolor_cnt;
+unsigned int *color_top = NULL;
+/* global to make OpenMP shared (sort_colors) */
+unsigned int *permuted_row_col = NULL;
+unsigned int *permuted_row_col_inv = NULL;
+real *y_p = NULL;
+/* global to make OpenMP shared (permute_vector) */
+real *x_p = NULL;
+unsigned int *mapping = NULL;
+sparse_matrix *H_full;
+sparse_matrix *H_p;
+/* global to make OpenMP shared (jacobi_iter) */
+real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL;
+
+
+/* sparse matrix-vector product Ax=b for symmetric A with only one triangle stored;
+ * with OpenMP, each thread accumulates into its own slice of b_local, then slices are summed into b
+ *   A: lower triangular matrix, stored in CSR format (diagonal last in each row)
+ *   x: vector
+ *   b: vector (result) */
+static void Sparse_MatVec( const sparse_matrix * const A,
+        const real * const x, real * const b )
 {
     int i, j, k, n, si, ei;
     real H;
+#ifdef _OPENMP
+    unsigned int tid;
+#endif
 
     n = A->n;
-    for( i = 0; i < n; ++i )
-        b[i] = 0;
+    Vector_MakeZero( b, n );
+
+#ifdef _OPENMP
+    tid = omp_get_thread_num();
+
+    #pragma omp master
+    {
+
+        /* keep b_local for program duration to avoid allocate/free
+         * overhead per Sparse_MatVec call */
+        if ( b_local == NULL )
+        {
+            if ( (b_local = (real*) malloc( omp_get_num_threads() * n * sizeof(real))) == NULL )
+            {
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+    }
+
+    #pragma omp barrier
+    /* each thread clears only its own slice, so no thread can wipe another's partial sums */
+    Vector_MakeZero( (real * const)(b_local + tid * n), n );
+
+#endif
+    #pragma omp for schedule(static)
+    for ( i = 0; i < n; ++i )
+    {
+        si = A->start[i];
+        ei = A->start[i + 1] - 1;
+
+        for ( k = si; k < ei; ++k )
+        {
+            j = A->j[k];
+            H = A->val[k];
+#ifdef _OPENMP
+            b_local[tid * n + j] += H * x[i];
+            b_local[tid * n + i] += H * x[j];
+#else
+            b[j] += H * x[i];
+            b[i] += H * x[j];
+#endif
+        }
+
+        // the diagonal entry is the last one in the row (k == ei here); add it once
+#ifdef _OPENMP
+        b_local[tid * n + i] += A->val[k] * x[i];
+#else
+        b[i] += A->val[k] * x[i];
+#endif
+    }
+#ifdef _OPENMP
+    #pragma omp for schedule(static)
+    for ( i = 0; i < n; ++i )
+    {
+        for ( j = 0; j < omp_get_num_threads(); ++j )
+        {
+            b[i] += b_local[j * n + i];
+        }
+    }
+#endif
+
+}
+
+
+/* Transpose A and copy into A^T (counting pass, prefix sum, then scatter)
+ *
+ * A: stored in CSR
+ * A_t: stored in CSR; the arrays behind its start, j, and val pointers are overwritten
+ */
+void Transpose( const sparse_matrix * const A, const sparse_matrix * const A_t )
+{
+    unsigned int i, j, pj, *A_t_top;
+
+    if ( (A_t_top = (unsigned int*) calloc( A->n + 1, sizeof(unsigned int))) == NULL )
+    {
+        fprintf( stderr, "Not enough space for matrix tranpose. Terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    memset( A_t->start, 0, (A->n + 1) * sizeof(unsigned int) );
+
+    /* count nonzeros in each column of A (i.e., each row of A^T), stored one row greater (see next loop) */
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            ++A_t->start[A->j[pj] + 1];
+        }
+    }
+
+    /* setup the row pointers for A^T; A_t_top keeps a per-row insertion cursor (A_t_top[0] is 0 from calloc) */
+    for ( i = 1; i <= A->n; ++i )
+    {
+        A_t_top[i] = A_t->start[i] = A_t->start[i] + A_t->start[i - 1];
+    }
+
+    /* fill in A^T */
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            j = A->j[pj];
+            A_t->j[A_t_top[j]] = i;
+            A_t->val[A_t_top[j]] = A->val[pj];
+            ++A_t_top[j];
+        }
+    }
+
+    free( A_t_top );
+}
+
+
+/* Transpose A in-place by transposing into a temporary matrix and copying back;
+ * exits with INSUFFICIENT_MEMORY if the temporary matrix cannot be allocated
+ * A: stored in CSR; the arrays behind its start, j, and val pointers are overwritten
+ */
+void Transpose_I( sparse_matrix * const A )
+{
+    sparse_matrix * A_t;
+
+    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for transposing matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    Transpose( A, A_t );
+    /* copy the transposed structure back; start[n] is the number of stored entries */
+    memcpy( A->start, A_t->start, sizeof(int) * (A_t->n + 1) );
+    memcpy( A->j, A_t->j, sizeof(int) * (A_t->start[A_t->n]) );
+    memcpy( A->val, A_t->val, sizeof(real) * (A_t->start[A_t->n]) );
+
+    Deallocate_Matrix( A_t );
+}
+
+
+/* Apply diagonal inverse (Jacobi) preconditioner to system residual
+ * (element-wise product x[i] = y[i] * Hdia_inv[i], work-shared via "omp for")
+ * Hdia_inv: diagonal inverse preconditioner (constructed using H)
+ * y: current residual
+ * x: preconditioned residual
+ * N: length of preconditioner and vectors (# rows in H)
+ */
+static void diag_pre_app( const real * const Hdia_inv, const real * const y,
+                          real * const x, const int N )
+{
+    int i; /* signed, to match N and avoid a signed/unsigned comparison */
+
+    #pragma omp for schedule(static)
+    for ( i = 0; i < N; ++i )
+    {
+        x[i] = y[i] * Hdia_inv[i];
+    }
+}
+
+
+/* Solve triangular system LU*x = y by sequential forward (lower) or
+ * backward (upper) substitution; the work is done inside "omp master" only
+ * LU: lower/upper triangular, stored in CSR
+ * y: constants in linear system (RHS)
+ * x: solution
+ * tri: triangularity of LU (lower/upper)
+ *
+ * Assumptions:
+ *   LU has non-zero diagonals
+ *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
+static void tri_solve( const sparse_matrix * const LU, const real * const y,
+                       real * const x, const TRIANGULARITY tri )
+{
+    int i, pj, j, si, ei;
+    real val;
+
+    #pragma omp master
+    {
+        if ( tri == LOWER )
+        {
+            for ( i = 0; i < LU->n; ++i )
+            {
+                x[i] = y[i];
+                si = LU->start[i];
+                ei = LU->start[i + 1];
+                for ( pj = si; pj < ei - 1; ++pj ) /* all entries except the last in the row */
+                {
+                    j = LU->j[pj];
+                    val = LU->val[pj];
+                    x[i] -= val * x[j];
+                }
+                x[i] /= LU->val[pj]; /* last entry of the row is used as the diagonal */
+            }
+        }
+        else
+        {
+            for ( i = LU->n - 1; i >= 0; --i )
+            {
+                x[i] = y[i];
+                si = LU->start[i];
+                ei = LU->start[i + 1];
+                for ( pj = si + 1; pj < ei; ++pj ) /* all entries except the first in the row */
+                {
+                    j = LU->j[pj];
+                    val = LU->val[pj];
+                    x[i] -= val * x[j];
+                }
+                x[i] /= LU->val[si]; /* first entry of the row is used as the diagonal */
+            }
+        }
+    }
+}
+
+
+/* Solve triangular system LU*x = y using level scheduling
+ * (rows within one level are work-shared via "omp for"; levels are processed in order)
+ * LU: lower/upper triangular, stored in CSR
+ * y: constants in linear system (RHS)
+ * x: solution
+ * tri: triangularity of LU (lower/upper)
+ * find_levels: perform level search if TRUE, otherwise reuse the levels saved by an earlier call
+ *
+ * Assumptions:
+ *   LU has non-zero diagonals
+ *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
+static void tri_solve_level_sched( const sparse_matrix * const LU, const real * const y,
+                                   real * const x, const TRIANGULARITY tri, int find_levels )
+{
+    int i, j, pj, local_row, local_level;
+
+    #pragma omp master
+    {
+        if ( tri == LOWER )
+        {
+            row_levels = row_levels_L;
+            level_rows = level_rows_L;
+            level_rows_cnt = level_rows_cnt_L;
+            levels = levels_L;
+        }
+        else
+        {
+            row_levels = row_levels_U;
+            level_rows = level_rows_U;
+            level_rows_cnt = level_rows_cnt_U;
+            levels = levels_U;
+        }
+
+        if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL )
+        {
+            if ( (row_levels = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL
+                    || (level_rows = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL
+                    || (level_rows_cnt = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL )
+            {
+                fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+
+        if ( top == NULL )
+        {
+            if ( (top = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL )
+            {
+                fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+
+        /* find levels (row dependencies in substitutions) */
+        if ( find_levels == TRUE )
+        {
+            memset( row_levels, 0, LU->n * sizeof(unsigned int) );
+            memset( level_rows_cnt, 0, (LU->n + 1) * sizeof(unsigned int) ); /* n + 1: levels are 1-based and may reach n */
+            memset( top, 0, (LU->n + 1) * sizeof(unsigned int) );
+            levels = 1;
+
+            if ( tri == LOWER )
+            {
+                for ( i = 0; i < LU->n; ++i )
+                {
+                    local_level = 1;
+                    for ( pj = LU->start[i]; pj < LU->start[i + 1] - 1; ++pj )
+                    {
+                        local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
+                    }
+
+                    levels = MAX( levels, local_level );
+                    row_levels[i] = local_level;
+                    ++level_rows_cnt[local_level];
+                }
+
+//#if defined(DEBUG)
+                fprintf(stderr, "levels(L): %d\n", levels);
+                fprintf(stderr, "NNZ(L): %d\n", LU->start[LU->n]);
+//#endif
+            }
+            else
+            {
+                for ( i = LU->n - 1; i >= 0; --i )
+                {
+                    local_level = 1;
+                    for ( pj = LU->start[i] + 1; pj < LU->start[i + 1]; ++pj )
+                    {
+                        local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
+                    }
+
+                    levels = MAX( levels, local_level );
+                    row_levels[i] = local_level;
+                    ++level_rows_cnt[local_level];
+                }
+
+//#if defined(DEBUG)
+                fprintf(stderr, "levels(U): %d\n", levels);
+                fprintf(stderr, "NNZ(U): %d\n", LU->start[LU->n]);
+//#endif
+            }
+            /* prefix sum: level_rows_cnt[l] becomes the end offset of level l in level_rows */
+            for ( i = 1; i < levels + 1; ++i )
+            {
+                level_rows_cnt[i] += level_rows_cnt[i - 1];
+                top[i] = level_rows_cnt[i];
+            }
+            /* bucket rows by level; top[l - 1] is the insertion cursor for level l */
+            for ( i = 0; i < LU->n; ++i )
+            {
+                level_rows[top[row_levels[i] - 1]] = i;
+                ++top[row_levels[i] - 1];
+            }
+        }
+    }
+
+    #pragma omp barrier
+
+    /* perform substitutions by level */
+    if ( tri == LOWER )
+    {
+        for ( i = 0; i < levels; ++i )
+        {
+            #pragma omp for schedule(static)
+            for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
+            {
+                local_row = level_rows[j];
+                x[local_row] = y[local_row];
+                for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj )
+                {
+                    x[local_row] -= LU->val[pj] * x[LU->j[pj]];
+
+                }
+                x[local_row] /= LU->val[pj];
+            }
+        }
+    }
+    else
+    {
+        for ( i = 0; i < levels; ++i )
+        {
+            #pragma omp for schedule(static)
+            for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
+            {
+                local_row = level_rows[j];
+                x[local_row] = y[local_row];
+                for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj )
+                {
+                    x[local_row] -= LU->val[pj] * x[LU->j[pj]];
+
+                }
+                x[local_row] /= LU->val[LU->start[local_row]];
+            }
+        }
+    }
+
+    #pragma omp master
+    {
+        /* save level info for re-use if performing repeated triangular solves via preconditioning */
+        if ( tri == LOWER )
+        {
+            row_levels_L = row_levels;
+            level_rows_L = level_rows;
+            level_rows_cnt_L = level_rows_cnt;
+            levels_L = levels;
+        }
+        else
+        {
+            row_levels_U = row_levels;
+            level_rows_U = level_rows;
+            level_rows_cnt_U = level_rows_cnt;
+            levels_U = levels;
+        }
+    }
+
+    #pragma omp barrier
+}
+
+/* Fill the global H_full with both triangles of symmetric H (rows of H followed by rows of H^T minus the diagonal) */
+static void compute_H_full( const sparse_matrix * const H )
+{
+    int count, i, pj;
+    sparse_matrix *H_t;
+
+    if ( Allocate_Matrix( &H_t, H->n, H->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for full H. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* H_t = H^T supplies the upper triangular entries of each row */
+    Transpose( H, H_t );
+
+    count = 0;
+    for ( i = 0; i < H->n; ++i )
+    {
+        H_full->start[i] = count;
+
+        /* H: symmetric, lower triangular portion only stored */
+        for ( pj = H->start[i]; pj < H->start[i + 1]; ++pj )
+        {
+            H_full->val[count] = H->val[pj];
+            H_full->j[count] = H->j[pj];
+            ++count;
+        }
+        /* H^T: symmetric, upper triangular portion only stored;
+         * skip diagonal from H^T (first entry of the row), as included from H above */
+        for ( pj = H_t->start[i] + 1; pj < H_t->start[i + 1]; ++pj )
+        {
+            H_full->val[count] = H_t->val[pj];
+            H_full->j[count] = H_t->j[pj];
+            ++count;
+        }
+    }
+    H_full->start[i] = count; /* i == H->n here: closing row pointer */
+
+    Deallocate_Matrix( H_t );
+}
+
+
+/* Iterative greedy shared-memory parallel graph coloring
+ *
+ * A: matrix to use for coloring, stored in CSR format;
+ *   rows represent vertices, columns of entries within a row represent adjacent vertices
+ *   (i.e., dependent rows for elimination during LU factorization)
+ * tri: triangularity of LU (lower/upper)
+ * color: vertex color (1-based); global array written by this routine (0 marks "uncolored")
+ *
+ * Reference:
+ * Umit V. Catalyurek et al.
+ * Graph Coloring Algorithms for Multi-core
+ *  and Massively Threaded Architectures
+ * Parallel Computing, 2012
+ */
+void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
+{
+    #pragma omp parallel
+    {
+#define MAX_COLOR (500)
+        int i, pj, v;
+        unsigned int temp;
+        int *fb_color;
+
+        #pragma omp master
+        {
+            memset( color, 0, sizeof(unsigned int) * A->n );
+            recolor_cnt = A->n;
+        }
+
+        /* ordering of vertices to color depends on triangularity of factor
+         * for which coloring is to be used for */
+        if ( tri == LOWER )
+        {
+            #pragma omp for schedule(static)
+            for ( i = 0; i < A->n; ++i )
+            {
+                to_color[i] = i;
+            }
+        }
+        else
+        {
+            #pragma omp for schedule(static)
+            for ( i = 0; i < A->n; ++i )
+            {
+                to_color[i] = A->n - 1 - i;
+            }
+        }
+        /* per-thread table: fb_color[c] == v means color c is forbidden for vertex v */
+        if ( (fb_color = (int*) malloc(sizeof(int) * MAX_COLOR)) == NULL )
+        {
+            fprintf( stderr, "not enough memory for graph coloring. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+
+        #pragma omp barrier
+
+        while ( recolor_cnt > 0 )
+        {
+            memset( fb_color, -1, sizeof(int) * MAX_COLOR );
+
+            /* color vertices */
+            #pragma omp for schedule(static)
+            for ( i = 0; i < recolor_cnt; ++i )
+            {
+                v = to_color[i];
+
+                /* colors of adjacent vertices are forbidden */
+                for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
+                {
+                    if ( v != A->j[pj] )
+                    {
+                        fb_color[color[A->j[pj]]] = v;
+                    }
+                }
+
+                /* search for min. color not forbidden for v; start at 1 since 0 is the default (invalid) color;
+                 * NOTE(review): pj and the fb_color index above are not bounded by MAX_COLOR -- confirm colors stay below it */
+                for ( pj = 1; fb_color[pj] == v; ++pj );
+
+                /* assign discovered color (no conflict in neighborhood of adjacent vertices) */
+                color[v] = pj;
+            }
+
+            /* determine if recoloring required */
+            //TODO: switch to reduction on recolor_cnt (+) via parallel scan through recolor
+            #pragma omp master
+            {
+                temp = recolor_cnt;
+                recolor_cnt = 0;
+
+                for ( i = 0; i < temp; ++i )
+                {
+                    v = to_color[i];
+
+                    /* search for color conflicts with adjacent vertices;
+                     * of two conflicting vertices, only the higher-numbered one is recolored */
+                    for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
+                    {
+                        if ( color[v] == color[A->j[pj]] && v > A->j[pj] )
+                        {
+                            conflict[recolor_cnt] = v;
+                            color[v] = 0;
+                            ++recolor_cnt;
+                            break;
+                        }
+                    }
+                }
+                /* swap work lists: conflicting vertices become the next round's to_color */
+                temp_ptr = to_color;
+                to_color = conflict;
+                conflict = temp_ptr;
+            }
+
+            #pragma omp barrier
+        }
+
+        free( fb_color );
+
+//#if defined(DEBUG)
+//    #pragma omp master
+//    {
+//        for ( i = 0; i < A->n; ++i )
+//            printf("Vertex: %5d, Color: %5d\n", i, color[i] );
+//    }
+//#endif
+
+        #pragma omp barrier
+    }
+}
+
+
+/* Sort coloring: counting sort of vertices by the global color array, filling the
+ * globals permuted_row_col and permuted_row_col_inv (color_top is used as scratch)
+ * n: number of entries in coloring
+ * tri: coloring to triangular factor to use (lower/upper); not read in this routine
+ */
+void sort_colors( const unsigned int n, const TRIANGULARITY tri )
+{
+    unsigned int i;
+
+    memset( color_top, 0, sizeof(unsigned int) * (n + 1) );
+
+    /* sort vertices by color (ascending within a color)
+     *  1) count colors
+     *  2) determine offsets of color ranges
+     *  3) sort by color
+     *
+     *  note: color is 1-based */
+    for ( i = 0; i < n; ++i )
+    {
+        ++color_top[color[i]];
+    }
+    for ( i = 1; i < n + 1; ++i )
+    {
+        color_top[i] += color_top[i - 1];
+    }
+    for ( i = 0; i < n; ++i )
+    {
+        permuted_row_col[color_top[color[i] - 1]] = i; /* color_top[c - 1] is the insertion cursor for color c */
+        ++color_top[color[i] - 1];
+    }
+
+    /* invert mapping to get map from current row/column to permuted (new) row/column */
+    for ( i = 0; i < n; ++i )
+    {
+        permuted_row_col_inv[permuted_row_col[i]] = i;
+    }
+}
+
+
+/* Apply permutation Q^T*x or Q*x based on graph coloring
+ * (gathers x through the chosen mapping into the global scratch x_p, then copies back)
+ * color: vertex color (1-based); vertices represent matrix rows/columns
+ * x: vector to permute (in-place)
+ * n: number of entries in x
+ * invert_map: if TRUE, use permuted_row_col_inv (Q^T), otherwise permuted_row_col (Q)
+ * tri: coloring to triangular factor to use (lower/upper); not read in this routine
+ */
+static void permute_vector( real * const x, const unsigned int n, const int invert_map,
+       const TRIANGULARITY tri )
+{
+    unsigned int i;
+
+    #pragma omp master
+    {
+        if ( x_p == NULL ) /* scratch buffer is allocated once and kept; sized by the first call's n */
+        {
+            if ( (x_p = (real*) malloc(sizeof(real) * n)) == NULL )
+            {
+                fprintf( stderr, "not enough memory for permuting vector. terminating.\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+
+        if ( invert_map == TRUE )
+        {
+            mapping = permuted_row_col_inv;
+        }
+        else
+        {
+            mapping = permuted_row_col;
+        }
+    }
+
+    #pragma omp barrier
+
+    #pragma omp for schedule(static)
+    for ( i = 0; i < n; ++i )
+    {
+        x_p[i] = x[mapping[i]];
+    }
+
+    #pragma omp master
+    {
+        memcpy( x, x_p, sizeof(real) * n );
+    }
+
+    #pragma omp barrier
+}
+
+
+/* Apply permutation Q^T*(LU)*Q based on graph coloring
+ * (entries that would leave the triangle after permutation are stored transposed)
+ * color: vertex color (1-based); vertices represent matrix rows/columns
+ * LU: matrix to permute, stored in CSR format; overwritten with the permuted matrix
+ * tri: triangularity of LU (lower/upper)
+ */
+void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
+{
+    int i, pj, nr, nc;
+    sparse_matrix *LUtemp;
+
+    if ( Allocate_Matrix( &LUtemp, LU->n, LU->m ) == FAILURE )
+    {
+        fprintf( stderr, "Not enough space for graph coloring (factor permutation). Terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* count nonzeros in each row of permuted factor (re-use color_top for counting) */
+    memset( color_top, 0, sizeof(unsigned int) * (LU->n + 1) );
+
+    if ( tri == LOWER )
+    {
+        for ( i = 0; i < LU->n; ++i )
+        {
+            nr = permuted_row_col_inv[i]; /* permuted (new) row index */
+
+            for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
+            {
+                nc = permuted_row_col_inv[LU->j[pj]]; /* permuted (new) column index */
+
+                if ( nc <= nr )
+                {
+                    ++color_top[nr + 1];
+                }
+                /* correct entries to maintain triangularity (lower) */
+                else
+                {
+                    ++color_top[nc + 1];
+                }
+            }
+        }
+    }
+    else
+    {
+        for ( i = LU->n - 1; i >= 0; --i )
+        {
+            nr = permuted_row_col_inv[i];
+
+            for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
+            {
+                nc = permuted_row_col_inv[LU->j[pj]];
+
+                if ( nc >= nr )
+                {
+                    ++color_top[nr + 1];
+                }
+                /* correct entries to maintain triangularity (upper) */
+                else
+                {
+                    ++color_top[nc + 1];
+                }
+            }
+        }
+    }
+    /* prefix sum turns per-row counts into row start offsets */
+    for ( i = 1; i < LU->n + 1; ++i )
+    {
+        color_top[i] += color_top[i - 1];
+    }
+
+    memcpy( LUtemp->start, color_top, sizeof(unsigned int) * (LU->n + 1) );
+
+    /* permute factor; color_top[r] now serves as the insertion cursor for row r */
+    if ( tri == LOWER )
+    {
+        for ( i = 0; i < LU->n; ++i )
+        {
+            nr = permuted_row_col_inv[i];
+
+            for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
+            {
+                nc = permuted_row_col_inv[LU->j[pj]];
+
+                if ( nc <= nr )
+                {
+                    LUtemp->j[color_top[nr]] = nc;
+                    LUtemp->val[color_top[nr]] = LU->val[pj];
+                    ++color_top[nr];
+                }
+                /* correct entries to maintain triangularity (lower) */
+                else
+                {
+                    LUtemp->j[color_top[nc]] = nr;
+                    LUtemp->val[color_top[nc]] = LU->val[pj];
+                    ++color_top[nc];
+                }
+            }
+        }
+    }
+    else
+    {
+        for ( i = LU->n - 1; i >= 0; --i )
+        {
+            nr = permuted_row_col_inv[i];
+
+            for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
+            {
+                nc = permuted_row_col_inv[LU->j[pj]];
+
+                if ( nc >= nr )
+                {
+                    LUtemp->j[color_top[nr]] = nc;
+                    LUtemp->val[color_top[nr]] = LU->val[pj];
+                    ++color_top[nr];
+                }
+                /* correct entries to maintain triangularity (upper) */
+                else
+                {
+                    LUtemp->j[color_top[nc]] = nr;
+                    LUtemp->val[color_top[nc]] = LU->val[pj];
+                    ++color_top[nc];
+                }
+            }
+        }
+    }
+    /* copy the permuted structure back into LU */
+    memcpy( LU->start, LUtemp->start, sizeof(unsigned int) * (LU->n + 1) );
+    memcpy( LU->j, LUtemp->j, sizeof(unsigned int) * LU->start[LU->n] );
+    memcpy( LU->val, LUtemp->val, sizeof(real) * LU->start[LU->n] );
+
+    Deallocate_Matrix( LUtemp );
+}
+
+
+/* Build the graph-coloring-permuted copy of the QEq matrix H which is
+ *  used for preconditioning (incomplete factorizations are computed based
+ *  on the permuted matrix)
+ *
+ * On the first call (detected by color == NULL) the file-scope work arrays
+ *  and the matrices H_p and H_full are allocated; the program terminates
+ *  with INSUFFICIENT_MEMORY if any of these allocations fails
+ *
+ * H: symmetric, lower triangular portion only, stored in CSR format;
+ *  its start, j, and val arrays are copied into H_p and the copy is permuted
+ *
+ * returns: H_p, the permuted copy of H
+ */
+sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
+{
+    size_t uint_bytes, nnz;
+
+    if ( color == NULL )
+    {
+        uint_bytes = sizeof(unsigned int) * H->n;
+
+        /* internal storage for graph coloring (kept global so that all
+         * OpenMP threads can access it simultaneously) */
+        color = (unsigned int*) malloc( uint_bytes );
+        to_color = (unsigned int*) malloc( uint_bytes );
+        conflict = (unsigned int*) malloc( uint_bytes );
+        recolor = (unsigned int*) malloc( uint_bytes );
+        color_top = (unsigned int*) malloc( sizeof(unsigned int) * (H->n + 1) );
+        permuted_row_col = (unsigned int*) malloc( uint_bytes );
+        permuted_row_col_inv = (unsigned int*) malloc( uint_bytes );
+        y_p = (real*) malloc( sizeof(real) * H->n );
+
+        /* the matrices are only requested once every work array exists */
+        if ( color == NULL || to_color == NULL || conflict == NULL
+                || recolor == NULL || color_top == NULL
+                || permuted_row_col == NULL || permuted_row_col_inv == NULL
+                || y_p == NULL
+                || Allocate_Matrix( &H_p, H->n, H->m ) == FAILURE
+                || Allocate_Matrix( &H_full, H->n, 2 * H->m - H->n ) == FAILURE )
+        {
+            fprintf( stderr, "not enough memory for graph coloring. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+    }
+
+    /* color the full (both triangles) matrix and order the colors */
+    compute_H_full( H );
+    graph_coloring( H_full, LOWER );
+    sort_colors( H_full->n, LOWER );
+
+    /* duplicate H into H_p, then permute the duplicate */
+    nnz = H->start[H->n];
+    memcpy( H_p->start, H->start, sizeof(int) * (H->n + 1) );
+    memcpy( H_p->j, H->j, sizeof(int) * nnz );
+    memcpy( H_p->val, H->val, sizeof(real) * nnz );
+    permute_matrix( H_p, LOWER );
+
+    return H_p;
+}
+
+
+/* Approximate the solution of the triangular system R*x = b with Jacobi
+ * sweeps (truncated Neumann series): x_{k+1} = Gx_k + D^{-1}b
+ * where:
+ *   G = I - D^{-1}R
+ *   R = triangular matrix
+ *   D = diagonal matrix, diagonals from R
+ *
+ * R: triangular matrix; within each row the diagonal entry is skipped as
+ *   the last stored entry (LOWER) or the first stored entry (UPPER)
+ * Dinv: inverses of the diagonal entries of R
+ * b: right-hand side
+ * x: receives a copy of the final iterate
+ * tri: triangularity of R
+ * maxiter: number of sweeps; one sweep is performed even if this is zero
+ *
+ * Note: used during the backsolves when applying preconditioners with
+ * triangular factors in iterative linear solvers
+ *
+ * Note: the Neumann series arises from the series expansion of the inverse
+ * of the coefficient matrix in the triangular system */
+static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
+        const real * const b, real * const x, const TRIANGULARITY tri, const
+        unsigned int maxiter )
+{
+    unsigned int row, nz, first = 0, last = 0, sweeps = 0;
+
+    /* the shared work vectors are allocated on first use by the master thread */
+    #pragma omp master
+    {
+        if ( Dinv_b == NULL
+                && (Dinv_b = (real*) malloc(sizeof(real) * R->n)) == NULL )
+        {
+            fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+        if ( rp == NULL
+                && (rp = (real*) malloc(sizeof(real) * R->n)) == NULL )
+        {
+            fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+        if ( rp2 == NULL
+                && (rp2 = (real*) malloc(sizeof(real) * R->n)) == NULL )
+        {
+            fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+    }
+
+    /* no thread may touch the work vectors before they exist */
+    #pragma omp barrier
+
+    Vector_MakeZero( rp, R->n );
+
+    /* D^{-1}b does not change between sweeps, so compute it once */
+    #pragma omp for schedule(static)
+    for ( row = 0; row < R->n; ++row )
+    {
+        Dinv_b[row] = Dinv[row] * b[row];
+    }
+
+    do
+    {
+        /* rp2 = G*rp + D^{-1}b */
+        #pragma omp for schedule(guided)
+        for ( row = 0; row < R->n; ++row )
+        {
+            /* range of the off-diagonal entries of this row */
+            first = (tri == LOWER) ? R->start[row] : R->start[row] + 1;
+            last = (tri == LOWER) ? R->start[row + 1] - 1 : R->start[row + 1];
+
+            rp2[row] = 0.;
+
+            for ( nz = first; nz < last; ++nz )
+            {
+                rp2[row] += R->val[nz] * rp[R->j[nz]];
+            }
+
+            rp2[row] *= -Dinv[row];
+            rp2[row] += Dinv_b[row];
+        }
+
+        /* exchange the roles of the current and the next iterate */
+        #pragma omp master
+        {
+            rp3 = rp;
+            rp = rp2;
+            rp2 = rp3;
+        }
+
+        #pragma omp barrier
+    }
+    while ( ++sweeps < maxiter );
+
+    Vector_Copy( x, rp, R->n );
+}
+
+
+/* Apply the preconditioner to the vector y and store the result in x
+ *
+ * The application method is selected by control->pre_app_type and is
+ * dispatched on the preconditioner type in control->pre_comp_type;
+ * unsupported or unrecognized combinations terminate with INVALID_INPUT.
+ * For NONE_PA nothing is done (x is left untouched).
+ *
+ * workspace: data struct containing matrices, lower/upper triangular, stored in CSR
+ * control: data struct containing parameters
+ * y: vector to which the preconditioner is applied
+ * x: result; GMRES in this file passes the same buffer for y and x
+ * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner
+ *
+ * Assumptions:
+ *   Matrices have non-zero diagonals
+ *   Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */
+static void apply_preconditioner( const static_storage * const workspace,
+        const control_params * const control, const real * const y,
+        real * const x, const int fresh_pre )
+{
+    int i, si;
+
+    switch ( control->pre_app_type )
+    {
+    case NONE_PA:
+        break;
+    case TRI_SOLVE_PA:
+        switch ( control->pre_comp_type )
+        {
+        case DIAG_PC:
+            diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
+            break;
+        case ICHOLT_PC:
+        case ILU_PAR_PC:
+        case ILUT_PAR_PC:
+            tri_solve( workspace->L, y, x, LOWER );
+            tri_solve( workspace->U, x, x, UPPER );
+            break;
+        default:
+            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+        }
+        break;
+    case TRI_SOLVE_LEVEL_SCHED_PA:
+        switch ( control->pre_comp_type )
+        {
+        case DIAG_PC:
+            diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
+            break;
+        case ICHOLT_PC:
+        case ILU_PAR_PC:
+        case ILUT_PAR_PC:
+            tri_solve_level_sched( workspace->L, y, x, LOWER, fresh_pre );
+            tri_solve_level_sched( workspace->U, x, x, UPPER, fresh_pre );
+            break;
+        default:
+            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+        }
+        break;
+    case TRI_SOLVE_GC_PA:
+        switch ( control->pre_comp_type )
+        {
+        case DIAG_PC:
+            fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+        case ICHOLT_PC:
+        case ILU_PAR_PC:
+        case ILUT_PAR_PC:
+            /* work on a copy of y, as the permutation is applied to the copy */
+            #pragma omp master
+            {
+                memcpy( y_p, y, sizeof(real) * workspace->H->n );
+            }
 
-    for( i = 0; i < n; ++i ) {
-        si = A->start[i];
-        ei = A->start[i+1]-1;
+            #pragma omp barrier
 
-        for( k = si; k < ei; ++k ) {
-            j = A->entries[k].j;
-            H = A->entries[k].val;
-            b[j] += H * x[i]; 
-            b[i] += H * x[j];
+            permute_vector( y_p, workspace->H->n, FALSE, LOWER );
+            tri_solve_level_sched( workspace->L, y_p, x, LOWER, fresh_pre );
+            tri_solve_level_sched( workspace->U, x, x, UPPER, fresh_pre );
+            permute_vector( x, workspace->H->n, TRUE, UPPER );
+            break;
+        default:
+            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
         }
+        break;
+    case JACOBI_ITER_PA:
+        switch ( control->pre_comp_type )
+        {
+        case DIAG_PC:
+            fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+        case ICHOLT_PC:
+        case ILU_PAR_PC:
+        case ILUT_PAR_PC:
+            /* lazily allocate storage for the inverted diagonal of L */
+            #pragma omp master
+            {
+                if ( Dinv_L == NULL )
+                {
+                    if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL )
+                    {
+                        fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                        exit( INSUFFICIENT_MEMORY );
+                    }
+                }
+            }
 
-        // the diagonal entry is the last one in
-        b[i] += A->entries[k].val * x[i]; 
-    }
-}
+            #pragma omp barrier
 
+            /* construct D^{-1}_L (diagonal is the last entry of each row) */
+            if ( fresh_pre == TRUE )
+            {
+                #pragma omp for schedule(static)
+                for ( i = 0; i < workspace->L->n; ++i )
+                {
+                    si = workspace->L->start[i + 1] - 1;
+                    Dinv_L[i] = 1. / workspace->L->val[si];
+                }
+            }
 
-void Forward_Subs( sparse_matrix *L, real *b, real *y )
-{
-    int i, pj, j, si, ei;
-    real val;
+            jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->pre_app_jacobi_iters );
 
-    for( i = 0; i < L->n; ++i ) {
-        y[i] = b[i];
-        si = L->start[i];
-        ei = L->start[i+1];
-        for( pj = si; pj < ei-1; ++pj ){
-            j = L->entries[pj].j;
-            val = L->entries[pj].val;
-            y[i] -= val * y[j];
-        }
-        y[i] /= L->entries[pj].val;
-    }
-}
+            /* lazily allocate storage for the inverted diagonal of U */
+            #pragma omp master
+            {
+                if ( Dinv_U == NULL )
+                {
+                    if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL )
+                    {
+                        fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                        exit( INSUFFICIENT_MEMORY );
+                    }
+                }
+            }
 
+            #pragma omp barrier
 
-void Backward_Subs( sparse_matrix *U, real *y, real *x )
-{
-    int i, pj, j, si, ei;
-    real val;
+            /* construct D^{-1}_U (diagonal is the first entry of each row) */
+            if ( fresh_pre == TRUE )
+            {
+                #pragma omp for schedule(static)
+                for ( i = 0; i < workspace->U->n; ++i )
+                {
+                    si = workspace->U->start[i];
+                    Dinv_U[i] = 1. / workspace->U->val[si];
+                }
+            }
 
-    for( i = U->n-1; i >= 0; --i ) {
-        x[i] = y[i];
-        si = U->start[i];
-        ei = U->start[i+1];
-        for( pj = si+1; pj < ei; ++pj ){
-            j = U->entries[pj].j;
-            val = U->entries[pj].val;
-            x[i] -= val * x[j];
+            /* the upper solve takes the result of the lower solve (held in x)
+             * as its right-hand side, as in the tri_solve cases above;
+             * jacobi_iter caches D^{-1}*b before it writes x, so passing x
+             * as both input and output is safe */
+            jacobi_iter( workspace->U, Dinv_U, x, x, UPPER, control->pre_app_jacobi_iters );
+            break;
+        default:
+            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
         }
-        x[i] /= U->entries[si].val;
+        break;
+    default:
+        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+
     }
+
+    return;
 }
 
 
-int GMRES( static_storage *workspace, sparse_matrix *H, 
-        real *b, real tol, real *x, FILE *fout, reax_system* system)
+/* Generalized minimal residual (GMRES) iterative solver for the sparse
+ * linear system H*x = b, restarted after RESTART inner iterations and
+ * limited to MAX_ITR outer iterations
+ *
+ * workspace: storage for the solver work arrays (b_prc, b_prm, v, h, hc, hs, g, y, p)
+ * control: parameters selecting the preconditioner type and application method
+ * data: simulation data; solver timings are accumulated in data->timing
+ * H: coefficient matrix
+ * b: right-hand side
+ * tol: tolerance compared against |g[j]| divided by the norm of b
+ * x: initial guess on entry, updated with the computed correction
+ * fout: unused
+ * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner
+ *
+ * returns: g_itr * (RESTART + 1) + g_j + 1, where g_itr and g_j are the
+ *   outer and inner loop counters at termination */
+int GMRES( const static_storage * const workspace, const control_params * const control,
+           simulation_data * const data, const sparse_matrix * const H,
+           const real * const b, const real tol, real * const x,
+           const FILE * const fout, const int fresh_pre )
 {
-    int i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
+    int i, j, k, itr, N, g_j, g_itr;
+    real cc, tmp1, tmp2, temp, ret_temp, bnorm, time_start;
 
     N = H->n;
-    bnorm = Norm( b, N );
-
-    /* apply the diagonal pre-conditioner to rhs */
-    for( i = 0; i < N; ++i )
-        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];  
-
-    /* GMRES outer-loop */
-    for( itr = 0; itr < MAX_ITR; ++itr ) {
-        /* calculate r0 */
-        Sparse_MatVec( H, x, workspace->b_prm );      
 
-        for( i = 0; i < N; ++i )
-            workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */    
+    /* the const-qualified parameters are listed as firstprivate: this is
+     * accepted both by compilers which treat them as predetermined shared
+     * (OpenMP <= 3.1) and by compilers which require an explicit
+     * data-sharing attribute under default(none) (OpenMP >= 4.0) */
+    #pragma omp parallel default(none) private(i, j, k, itr, bnorm, ret_temp) \
+        firstprivate(workspace, control, data, H, b, tol, x, fresh_pre) \
+        shared(N, cc, tmp1, tmp2, temp, time_start, g_itr, g_j, stderr)
+    {
+        #pragma omp master
+        {
+            time_start = Get_Time( );
+        }
+        bnorm = Norm( b, N );
+        #pragma omp master
+        {
+            data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+        }
 
+        if ( control->pre_comp_type == DIAG_PC )
+        {
+            /* apply preconditioner to RHS */
+            #pragma omp master
+            {
+                time_start = Get_Time( );
+            }
+            apply_preconditioner( workspace, control, b, workspace->b_prc, fresh_pre );
+            #pragma omp master
+            {
+                data->timing.pre_app += Get_Timing_Info( time_start );
+            }
+        }
 
-        Vector_Sum(&workspace->v[ index_wkspace_sys (0,0,system->N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
-        workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system->N)], N );
-        Vector_Scale( &workspace->v[ index_wkspace_sys (0,0,system->N) ], 1.0/workspace->g[0], &workspace->v[index_wkspace_sys(0,0,system->N)], N );
+        /* GMRES outer-loop */
+        for ( itr = 0; itr < MAX_ITR; ++itr )
+        {
+            /* calculate r0 */
+            #pragma omp master
+            {
+                time_start = Get_Time( );
+            }
+            Sparse_MatVec( H, x, workspace->b_prm );
+            #pragma omp master
+            {
+                data->timing.solver_spmv += Get_Timing_Info( time_start );
+            }
 
-        /* GMRES inner-loop */
-        for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) {
-            /* matvec */
-            Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] );
+            if ( control->pre_comp_type == DIAG_PC )
+            {
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                apply_preconditioner( workspace, control, workspace->b_prm, workspace->b_prm, FALSE );
+                #pragma omp master
+                {
+                    data->timing.pre_app += Get_Timing_Info( time_start );
+                }
+            }
 
-            for( k = 0; k < N; ++k )  
-                workspace->v[ index_wkspace_sys (j+1,k,system->N)] *= workspace->Hdia_inv[k]; /*pre-conditioner*/ 
+            /* residual: against the preconditioned RHS for DIAG_PC,
+             * otherwise against b (preconditioned afterwards) */
+            if ( control->pre_comp_type == DIAG_PC )
+            {
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                Vector_Sum( workspace->v, 1., workspace->b_prc, -1., workspace->b_prm, N );
+                #pragma omp master
+                {
+                    data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                }
+            }
+            else
+            {
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                Vector_Sum( workspace->v, 1., b, -1., workspace->b_prm, N );
+                #pragma omp master
+                {
+                    data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                }
+            }
 
-            /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for( i = 0; i <= j; i++ ) {
-                workspace->h[ index_wkspace_res (i,j) ] = Dot( &workspace->v[index_wkspace_sys(i,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
-                Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system->N)], 
-                        -workspace->h[index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system->N)], N );
+            if ( control->pre_comp_type != DIAG_PC )
+            {
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                /* only the first application may see a fresh preconditioner */
+                apply_preconditioner( workspace, control, workspace->v, workspace->v,
+                        itr == 0 ? fresh_pre : FALSE );
+                #pragma omp master
+                {
+                    data->timing.pre_app += Get_Timing_Info( time_start );
+                }
             }
 
+            #pragma omp master
+            {
+                time_start = Get_Time( );
+            }
+            ret_temp = Norm( workspace->v, N );
+            #pragma omp single
+            {
+                workspace->g[0] = ret_temp;
+            }
+            Vector_Scale( workspace->v, 1. / workspace->g[0], workspace->v, N );
+            #pragma omp master
+            {
+                data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+            }
 
-            workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
-            Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system->N)], 
-                    1. / workspace->h[ index_wkspace_res (j+1,j) ], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
-            // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
+            /* GMRES inner-loop */
+            for ( j = 0; j < RESTART && FABS(workspace->g[j]) / bnorm > tol; j++ )
+            {
+                /* matvec */
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                Sparse_MatVec( H, workspace->v + j * N, workspace->v + (j + 1) * N );
+                #pragma omp master
+                {
+                    data->timing.solver_spmv += Get_Timing_Info( time_start );
+                }
 
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                apply_preconditioner( workspace, control,
+                        workspace->v + (j + 1) * N, workspace->v + (j + 1) * N, FALSE );
+                #pragma omp master
+                {
+                    data->timing.pre_app += Get_Timing_Info( time_start );
+                }
 
-            /* Givens rotations on the upper-Hessenberg matrix to make it U */
-            for( i = 0; i <= j; i++ )    {
-                if( i == j ) {
-                    cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) );
-                    workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc;
-                    workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc;
+                if ( control->pre_comp_type == DIAG_PC )
+                {
+                    /* apply modified Gram-Schmidt to orthogonalize the new residual */
+                    #pragma omp master
+                    {
+                        time_start = Get_Time( );
+                    }
+                    for ( i = 0; i <= j; i++ )
+                    {
+                        ret_temp = Dot( workspace->v + i * N, workspace->v + (j + 1) * N, N );
+                        /* a single thread stores the shared entry (same
+                         * pattern as the non-DIAG_PC branch below), rather
+                         * than all threads writing it concurrently */
+                        #pragma omp single
+                        {
+                            workspace->h[(RESTART + 1) * i + j] = ret_temp;
+                        }
+                        Vector_Add( workspace->v + (j + 1) * N, -workspace->h[(RESTART + 1) * i + j],
+                                workspace->v + i * N, N );
+                    }
+                    #pragma omp master
+                    {
+                        data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                    }
+                }
+                else
+                {
+                    //TODO: investigate correctness of not explicitly orthogonalizing first few vectors
+                    /* apply modified Gram-Schmidt to orthogonalize the new residual */
+                    #pragma omp master
+                    {
+                        time_start = Get_Time( );
+                        for ( i = 0; i < j - 1; i++ )
+                        {
+                            workspace->h[(RESTART + 1) * i + j] = 0;
+                        }
+                    }
+
+                    for ( i = MAX(j - 1, 0); i <= j; i++ )
+                    {
+                        ret_temp = Dot( workspace->v + i * N, workspace->v + (j + 1) * N, N );
+                        #pragma omp single
+                        {
+                            workspace->h[(RESTART + 1) * i + j] = ret_temp;
+                        }
+                        Vector_Add( workspace->v + (j + 1) * N,
+                                -workspace->h[(RESTART + 1) * i + j], workspace->v + i * N, N );
+                    }
+                    #pragma omp master
+                    {
+                        data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                    }
                 }
 
-                tmp1 =  workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + 
-                    workspace->hs[i] * workspace->h[ index_wkspace_res (i+1,j) ];
-                tmp2 = -workspace->hs[i] * workspace->h[ index_wkspace_res (i,j) ] + 
-                    workspace->hc[i] * workspace->h[ index_wkspace_res (i+1,j) ];
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                }
+                ret_temp = Norm( workspace->v + (j + 1) * N, N );
+                #pragma omp single
+                {
+                    workspace->h[(RESTART + 1) * (j + 1) + j] = ret_temp;
+                }
+                Vector_Scale( workspace->v + (j + 1) * N,
+                              1. / workspace->h[(RESTART + 1) * (j + 1) + j],
+                              workspace->v + (j + 1) * N, N );
+                #pragma omp master
+                {
+                    data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                }
+#if defined(DEBUG)
+                fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
+#endif
 
-                workspace->h[ index_wkspace_res (i,j) ] = tmp1;
-                workspace->h[ index_wkspace_res (i+1,j) ] = tmp2;
-            } 
+                /* the small Hessenberg update is done by the master thread only */
+                #pragma omp master
+                {
+                    time_start = Get_Time( );
+                    if ( control->pre_comp_type == DIAG_PC )
+                    {
+                        /* Givens rotations on the upper-Hessenberg matrix to make it U */
+                        for ( i = 0; i <= j; i++ )
+                        {
+                            if ( i == j )
+                            {
+                                cc = SQRT( SQR(workspace->h[(RESTART + 1) * j + j])
+                                        + SQR(workspace->h[(RESTART + 1) * (j + 1) + j]) );
+                                workspace->hc[j] = workspace->h[(RESTART + 1) * j + j] / cc;
+                                workspace->hs[j] = workspace->h[(RESTART + 1) * (j + 1) + j] / cc;
+                            }
+
+                            tmp1 =  workspace->hc[i] * workspace->h[(RESTART + 1) * i + j] +
+                                workspace->hs[i] * workspace->h[(RESTART + 1) * (i + 1) + j];
+                            tmp2 = -workspace->hs[i] * workspace->h[(RESTART + 1) * i + j] +
+                                workspace->hc[i] * workspace->h[(RESTART + 1) * (i + 1) + j];
+
+                            workspace->h[(RESTART + 1) * i + j] = tmp1;
+                            workspace->h[(RESTART + 1) * (i + 1) + j] = tmp2;
+                        }
+                    }
+                    else
+                    {
+                        //TODO: investigate correctness of not explicitly orthogonalizing first few vectors
+                        /* Givens rotations on the upper-Hessenberg matrix to make it U */
+                        for ( i = MAX(j - 1, 0); i <= j; i++ )
+                        {
+                            if ( i == j )
+                            {
+                                cc = SQRT( SQR(workspace->h[(RESTART + 1) * j + j])
+                                        + SQR(workspace->h[(RESTART + 1) * (j + 1) + j]) );
+                                workspace->hc[j] = workspace->h[(RESTART + 1) * j + j] / cc;
+                                workspace->hs[j] = workspace->h[(RESTART + 1) * (j + 1) + j] / cc;
+                            }
+
+                            tmp1 =  workspace->hc[i] * workspace->h[(RESTART + 1) * i + j] +
+                                    workspace->hs[i] * workspace->h[(RESTART + 1) * (i + 1) + j];
+                            tmp2 = -workspace->hs[i] * workspace->h[(RESTART + 1) * i + j] +
+                                   workspace->hc[i] * workspace->h[(RESTART + 1) * (i + 1) + j];
+
+                            workspace->h[(RESTART + 1) * i + j] = tmp1;
+                            workspace->h[(RESTART + 1) * (i + 1) + j] = tmp2;
+                        }
+                    }
+
+                    /* apply Givens rotations to the rhs as well */
+                    tmp1 =  workspace->hc[j] * workspace->g[j];
+                    tmp2 = -workspace->hs[j] * workspace->g[j];
+                    workspace->g[j] = tmp1;
+                    workspace->g[j + 1] = tmp2;
+                    data->timing.solver_orthog += Get_Timing_Info( time_start );
+                }
 
-            /* apply Givens rotations to the rhs as well */
-            tmp1 =  workspace->hc[j] * workspace->g[j];
-            tmp2 = -workspace->hs[j] * workspace->g[j];
-            workspace->g[j] = tmp1;
-            workspace->g[j+1] = tmp2;
+                /* all threads wait for g[j + 1] before re-testing the loop condition */
+                #pragma omp barrier
 
-            // fprintf( stderr, "h: " );
-            // for( i = 0; i <= j+1; ++i )
-            //  fprintf( stderr, "%.6f ", workspace->h[i][j] );
-            // fprintf( stderr, "\n" );
-            //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
-        }
+                //fprintf( stderr, "h: " );
+                //for( i = 0; i <= j+1; ++i )
+                //fprintf( stderr, "%.6f ", workspace->h[i][j] );
+                //fprintf( stderr, "\n" );
+                //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
+            }
 
+            /* solve Hy = g: H is now upper-triangular, do back-substitution */
+            #pragma omp master
+            {
+                time_start = Get_Time( );
+                for ( i = j - 1; i >= 0; i-- )
+                {
+                    temp = workspace->g[i];
+                    for ( k = j - 1; k > i; k-- )
+                    {
+                        temp -= workspace->h[(RESTART + 1) * i + k] * workspace->y[k];
+                    }
 
-        /* solve Hy = g.
-           H is now upper-triangular, do back-substitution */
-        for( i = j-1; i >= 0; i-- ) {
-            temp = workspace->g[i];      
-            for( k = j-1; k > i; k-- )
-                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
+                    workspace->y[i] = temp / workspace->h[(RESTART + 1) * i + i];
+                }
+                data->timing.solver_tri_solve += Get_Timing_Info( time_start );
 
-            workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ];
-        }
+                /* update x = x_0 + Vy */
+                time_start = Get_Time( );
+            }
+            Vector_MakeZero( workspace->p, N );
+            for ( i = 0; i < j; i++ )
+            {
+                Vector_Add( workspace->p, workspace->y[i], workspace->v + i * N, N );
+            }
 
+            Vector_Add( x, 1., workspace->p, N );
+            #pragma omp master
+            {
+                data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+            }
 
-        /* update x = x_0 + Vy */
-        for( i = 0; i < j; i++ )
-            Vector_Add( x, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system->N)], N );
+            /* stopping condition */
+            if ( FABS(workspace->g[j]) / bnorm <= tol )
+            {
+                break;
+            }
+        }
 
-        /* stopping condition */
-        if( fabs(workspace->g[j]) / bnorm <= tol )
-            break;
+        /* export the (thread-private) loop counters for use after the region */
+        #pragma omp master
+        {
+            g_itr = itr;
+            g_j = j;
+        }
     }
 
     // Sparse_MatVec( H, x, workspace->b_prm );
     // for( i = 0; i < N; ++i )
-    // workspace->b_prm[i] *= workspace->Hdia_inv[i];    
+    // workspace->b_prm[i] *= workspace->Hdia_inv[i];
     // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
     // for( i = 0; i < N; ++i )
-    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
+    // fprintf( fout, "%10.5f%15.12f%15.12f\n",
     // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
 
-    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
+    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n",
     //          itr, j, fabs( workspace->g[j] ) / bnorm );
-    // data->timing.matvec += itr * RESTART + j;
+    // data->timing.solver_iters += itr * RESTART + j;
 
-    if( itr >= MAX_ITR ) {
+    if ( g_itr >= MAX_ITR )
+    {
         fprintf( stderr, "GMRES convergence failed\n" );
         // return -1;
-        return itr * (RESTART+1) + j + 1;
+        return g_itr * (RESTART + 1) + g_j + 1;
     }
 
-    return itr * (RESTART+1) + j + 1;
+    return g_itr * (RESTART + 1) + g_j + 1;
 }
 
 
-int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H, 
-        real *b, real tol, real *x, FILE *fout, reax_system *system)
+int GMRES_HouseHolder( const static_storage * const workspace, const control_params * const control,
+                       simulation_data * const data, const sparse_matrix * const H,
+                       const real * const b, real tol, real * const x,
+                       const FILE * const fout, const int fresh_pre )
 {
     int  i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
-    real v[10000], z[RESTART+2][10000], w[RESTART+2];
-    real u[RESTART+2][10000];
+    real v[10000], z[RESTART + 2][10000], w[RESTART + 2];
+    real u[RESTART + 2][10000];
 
     N = H->n;
     bnorm = Norm( b, N );
 
     /* apply the diagonal pre-conditioner to rhs */
-    for( i = 0; i < N; ++i )
-        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];  
+    for ( i = 0; i < N; ++i )
+    {
+        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];
+    }
 
     // memset( x, 0, sizeof(real) * N );
 
     /* GMRES outer-loop */
-    for( itr = 0; itr < MAX_ITR; ++itr ) {
+    for ( itr = 0; itr < MAX_ITR; ++itr )
+    {
         /* compute z = r0 */
-        Sparse_MatVec( H, x, workspace->b_prm );      
-        for( i = 0; i < N; ++i )
+        Sparse_MatVec( H, x, workspace->b_prm );
+        for ( i = 0; i < N; ++i )
+        {
             workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */
+        }
         Vector_Sum( z[0], 1.,  workspace->b_prc, -1., workspace->b_prm, N );
 
-        Vector_MakeZero( w, RESTART+1 );
+        Vector_MakeZero( w, RESTART + 1 );
         w[0] = Norm( z[0], N );
 
         Vector_Copy( u[0], z[0], N );
         u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0];
         Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N );
 
-        w[0]    *= ( u[0][0] < 0.0 ?  1 :-1 );
+        w[0] *= ( u[0][0] < 0.0 ?  1 : -1 );
         // fprintf( stderr, "\n\n%12.6f\n", w[0] );
 
         /* GMRES inner-loop */
-        for( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ ) {
+        for ( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ )
+        {
             /* compute v_j */
             Vector_Scale( z[j], -2 * u[j][j], u[j], N );
             z[j][j] += 1.; /* due to e_j */
 
-            for( i = j-1; i >= 0; --i )
-                Vector_Add( z[j]+i, -2 * Dot( u[i]+i, z[j]+i, N-i ), u[i]+i, N-i );
-
+            for ( i = j - 1; i >= 0; --i )
+            {
+                Vector_Add( z[j] + i, -2 * Dot( u[i] + i, z[j] + i, N - i ), u[i] + i, N - i );
+            }
 
             /* matvec */
             Sparse_MatVec( H, z[j], v );
 
-            for( k = 0; k < N; ++k )
+            for ( k = 0; k < N; ++k )
+            {
                 v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */
+            }
 
-            for( i = 0; i <= j; ++i )
-                Vector_Add( v+i, -2 * Dot( u[i]+i, v+i, N-i ), u[i]+i, N-i );
-
+            for ( i = 0; i <= j; ++i )
+            {
+                Vector_Add( v + i, -2 * Dot( u[i] + i, v + i, N - i ), u[i] + i, N - i );
+            }
 
-            if( !Vector_isZero( v + (j+1), N - (j+1) ) ) {
+            if ( !Vector_isZero( v + (j + 1), N - (j + 1) ) )
+            {
                 /* compute the HouseHolder unit vector u_j+1 */
-                for( i = 0; i <= j; ++i )  
-                    u[j+1][i] = 0;
+                for ( i = 0; i <= j; ++i )
+                {
+                    u[j + 1][i] = 0;
+                }
 
-                Vector_Copy( u[j+1] + (j+1), v + (j+1), N - (j+1) );
+                Vector_Copy( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) );
 
-                u[j+1][j+1] += ( v[j+1]<0.0 ? -1:1 ) * Norm( v+(j+1), N-(j+1) );
+                u[j + 1][j + 1] += ( v[j + 1] < 0.0 ? -1 : 1 ) * Norm( v + (j + 1), N - (j + 1) );
 
-                Vector_Scale( u[j+1], 1 / Norm( u[j+1], N ), u[j+1], N );
+                Vector_Scale( u[j + 1], 1 / Norm( u[j + 1], N ), u[j + 1], N );
 
                 /* overwrite v with P_m+1 * v */
-                v[j+1] -= 2 * Dot( u[j+1]+(j+1), v+(j+1), N-(j+1) ) * u[j+1][j+1];
-                Vector_MakeZero( v + (j+2), N - (j+2) );
+                v[j + 1] -= 2 * Dot( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ) * u[j + 1][j + 1];
+                Vector_MakeZero( v + (j + 2), N - (j + 2) );
                 // Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N );
             }
 
 
             /* prev Givens rots on the upper-Hessenberg matrix to make it U */
-            for( i = 0; i < j; i++ ) {
-                tmp1 =  workspace->hc[i] * v[i] + workspace->hs[i] * v[i+1];
-                tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i+1];
+            for ( i = 0; i < j; i++ )
+            {
+                tmp1 =  workspace->hc[i] * v[i] + workspace->hs[i] * v[i + 1];
+                tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i + 1];
 
                 v[i]   = tmp1;
-                v[i+1] = tmp2;
+                v[i + 1] = tmp2;
             }
 
             /* apply the new Givens rotation to H and right-hand side */
-            if( fabs(v[j+1]) >= ALMOST_ZERO )    {
-                cc = SQRT( SQR( v[j] ) + SQR( v[j+1] ) );
+            if ( fabs(v[j + 1]) >= ALMOST_ZERO )
+            {
+                cc = SQRT( SQR( v[j] ) + SQR( v[j + 1] ) );
                 workspace->hc[j] = v[j] / cc;
-                workspace->hs[j] = v[j+1] / cc;
+                workspace->hs[j] = v[j + 1] / cc;
 
-                tmp1 =  workspace->hc[j] * v[j] + workspace->hs[j] * v[j+1];
-                tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j+1];
+                tmp1 =  workspace->hc[j] * v[j] + workspace->hs[j] * v[j + 1];
+                tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j + 1];
 
                 v[j]   = tmp1;
-                v[j+1] = tmp2;
+                v[j + 1] = tmp2;
 
                 /* Givens rotations to rhs */
                 tmp1 =  workspace->hc[j] * w[j];
                 tmp2 = -workspace->hs[j] * w[j];
                 w[j]   = tmp1;
-                w[j+1] = tmp2;
+                w[j + 1] = tmp2;
             }
 
             /* extend R */
-            for( i = 0; i <= j; ++i )
-                workspace->h[ index_wkspace_res (i,j) ] = v[i];
+            for ( i = 0; i <= j; ++i )
+            {
+                workspace->h[(RESTART + 1) * i + j] = v[i];
+            }
 
 
             // fprintf( stderr, "h:" );
@@ -326,12 +1661,15 @@ int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H,
 
         /* solve Hy = w.
            H is now upper-triangular, do back-substitution */
-        for( i = j-1; i >= 0; i-- ) {
-            temp = w[i];      
-            for( k = j-1; k > i; k-- )
-                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
+        for ( i = j - 1; i >= 0; i-- )
+        {
+            temp = w[i];
+            for ( k = j - 1; k > i; k-- )
+            {
+                temp -= workspace->h[(RESTART + 1) * i + k] * workspace->y[k];
+            }
 
-            workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ];
+            workspace->y[i] = temp / workspace->h[(RESTART + 1) * i + i];
         }
 
         // fprintf( stderr, "y: " );
@@ -345,9 +1683,9 @@ int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H,
         //   {
         //     Vector_Copy( v, z, N );
         //     v[i] += workspace->y[i];
-        //    
+        //
         //     Vector_Sum( z, 1., v, -2 * Dot( u[i], v, N ), u[i], N );
-        //   }      
+        //   }
         //
         // fprintf( stderr, "\nz: " );
         // for( k = 0; k < N; ++k )
@@ -358,16 +1696,20 @@ int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H,
         //   fprintf( stderr, "%6.2f ", x[i] );
 
         // Vector_Add( x, 1, z, N );
-        for( i = j-1; i >= 0; i-- )
+        for ( i = j - 1; i >= 0; i-- )
+        {
             Vector_Add( x, workspace->y[i], z[i], N );
+        }
 
         // fprintf( stderr, "\nx_aft: " );
         // for( i = 0; i < N; ++i )
         //   fprintf( stderr, "%6.2f ", x[i] );
 
         /* stopping condition */
-        if( fabs( w[j] ) / bnorm <= tol )
+        if ( fabs( w[j] ) / bnorm <= tol )
+        {
             break;
+        }
     }
 
     // Sparse_MatVec( H, x, workspace->b_prm );
@@ -376,152 +1718,26 @@ int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H,
 
     // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
     // for( i = 0; i < N; ++i )
-    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
+    // fprintf( fout, "%10.5f%15.12f%15.12f\n",
     // workspace->b_prc[i], workspace->b_prm[i], x[i] );
 
-    //fprintf( fout,"GMRES outer:%d, inner:%d iters - residual norm: %15.10f\n", 
+    //fprintf( fout,"GMRES outer:%d, inner:%d iters - residual norm: %15.10f\n",
     //         itr, j, fabs( workspace->g[j] ) / bnorm );
 
-    if( itr >= MAX_ITR ) {
-        fprintf( stderr, "GMRES convergence failed\n" );
-        // return -1;
-        return itr * (RESTART+1) + j + 1;
-    }
-
-    return itr * (RESTART+1) + j + 1;
-}
-
-
-int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, 
-        sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system *system )
-{
-    int i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
-
-    N = H->n;
-    bnorm = Norm( b, N );
-
-    /* GMRES outer-loop */
-    for( itr = 0; itr < MAX_ITR; ++itr )
+    if ( itr >= MAX_ITR )
     {
-        /* calculate r0 */
-        Sparse_MatVec( H, x, workspace->b_prm );      
-        Vector_Sum( &workspace->v[index_wkspace_sys(0,0,system->N)], 1., b, -1., workspace->b_prm, N );
-        Forward_Subs( L, &workspace->v[index_wkspace_sys(0,0,system->N)], &workspace->v[index_wkspace_sys(0,0,system->N)] );
-        Backward_Subs( U, &workspace->v[index_wkspace_sys(0,0,system->N)], &workspace->v[index_wkspace_sys(0,0,system->N)] );
-        workspace->g[0] = Norm( &workspace->v[index_wkspace_sys(0,0,system->N)], N );
-        Vector_Scale( &workspace->v[index_wkspace_sys(0,0,system->N)], 1. / workspace->g[0], &workspace->v[index_wkspace_sys (0,0,system->N)], N );
-        //fprintf( stderr, "res: %.15e\n", workspace->g[0] );
-
-        /* GMRES inner-loop */
-        for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ )
-        {
-            /* matvec */
-            Sparse_MatVec( H, &workspace->v[index_wkspace_sys (j,0,system->N)], &workspace->v[index_wkspace_sys (j+1,0,system->N)] );
-            Forward_Subs( L, &workspace->v[index_wkspace_sys(j+1,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] );
-            Backward_Subs( U, &workspace->v[index_wkspace_sys(j+1,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] );
-
-            /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for( i = 0; i < j-1; i++ )
-            {
-                workspace->h[ index_wkspace_res (i,j)] = 0;
-            }
-
-            //for( i = 0; i <= j; i++ ) {
-            for( i = MAX(j-1,0); i <= j; i++ ) {
-                workspace->h[index_wkspace_res (i,j)] = Dot( &workspace->v[index_wkspace_sys (i,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
-                Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system->N)],-workspace->h[ index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system->N)], N );
-            }
-
-            workspace->h[index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys (j+1,0,system->N)], N );
-            Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system->N)], 
-                    1. / workspace->h[ index_wkspace_res (j+1,j)], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
-            // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
-
-            /* Givens rotations on the upper-Hessenberg matrix to make it U */
-            for( i = MAX(j-1,0); i <= j; i++ )
-            {
-                if( i == j )
-                {
-                    cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) );
-                    workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc;
-                    workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc;
-                }
-
-                tmp1 =  workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + 
-                    workspace->hs[i] * workspace->h[index_wkspace_res (i+1,j) ];
-                tmp2 = -workspace->hs[i] * workspace->h[index_wkspace_res (i,j)] + 
-                    workspace->hc[i] * workspace->h[index_wkspace_res (i+1,j) ];
-
-                workspace->h[ index_wkspace_res (i,j) ] = tmp1;
-                workspace->h[ index_wkspace_res (i+1,j) ] = tmp2;
-            } 
-
-            /* apply Givens rotations to the rhs as well */
-            tmp1 =  workspace->hc[j] * workspace->g[j];
-            tmp2 = -workspace->hs[j] * workspace->g[j];
-            workspace->g[j] = tmp1;
-            workspace->g[j+1] = tmp2;
-
-            //fprintf( stderr, "h: " );
-            //for( i = 0; i <= j+1; ++i )
-            //fprintf( stderr, "%.6f ", workspace->h[i][j] );
-            //fprintf( stderr, "\n" );
-            //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
-        }
-
-
-        /* solve Hy = g: H is now upper-triangular, do back-substitution */
-        for( i = j-1; i >= 0; i-- )
-        {
-            temp = workspace->g[i];      
-            for( k = j-1; k > i; k-- )
-            {
-                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
-            }
-
-            workspace->y[i] = temp / workspace->h[index_wkspace_res (i,i)];
-        }
-
-        /* update x = x_0 + Vy */
-        Vector_MakeZero( workspace->p, N );
-        for( i = 0; i < j; i++ )
-            Vector_Add( workspace->p, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system->N)], N );
-        //Backward_Subs( U, workspace->p, workspace->p );
-        //Forward_Subs( L, workspace->p, workspace->p );
-        Vector_Add( x, 1., workspace->p, N );
-
-        /* stopping condition */
-        if( fabs(workspace->g[j]) / bnorm <= tol )
-        {
-            break;
-        }
-    }
-
-    // Sparse_MatVec( H, x, workspace->b_prm );
-    // for( i = 0; i < N; ++i )
-    // workspace->b_prm[i] *= workspace->Hdia_inv[i];    
-    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-    // for( i = 0; i < N; ++i )
-    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
-    // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
-
-    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
-    //          itr, j, fabs( workspace->g[j] ) / bnorm );
-    // data->timing.matvec += itr * RESTART + j;
-
-    if( itr >= MAX_ITR ) {
         fprintf( stderr, "GMRES convergence failed\n" );
         // return -1;
-        return itr * (RESTART+1) + j + 1;
+        return itr * (RESTART + 1) + j + 1;
     }
 
-    return itr * (RESTART+1) + j + 1;
+    return itr * (RESTART + 1) + j + 1;
 }
 
 
-int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, 
-        sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system )
+/* Preconditioned Conjugate Gradient */
+int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
+         sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout )
 {
     int  i, N;
     real tmp, alpha, beta, b_norm, r_norm;
@@ -537,12 +1753,12 @@ int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
     //Print_Soln( workspace, x, q, b, N );
     //fprintf( stderr, "res: %.15e\n", r_norm );
 
-    Forward_Subs( L, workspace->r, workspace->d );
-    Backward_Subs( U, workspace->d, workspace->p );
+    tri_solve( L, workspace->r, workspace->d, LOWER );
+    tri_solve( U, workspace->d, workspace->p, UPPER );
     sig_new = Dot( workspace->r, workspace->p, N );
     sig0 = sig_new;
 
-    for( i = 0; i < 200 && r_norm/b_norm > tol; ++i )
+    for ( i = 0; i < 200 && r_norm / b_norm > tol; ++i )
     {
         //for( i = 0; i < 200 && sig_new > SQR(tol) * sig0; ++i ) {
         Sparse_MatVec( A, workspace->p, workspace->q );
@@ -556,8 +1772,8 @@ int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
         r_norm = Norm(workspace->r, N);
         //fprintf( stderr, "res: %.15e\n", r_norm );
 
-        Forward_Subs( L, workspace->r, workspace->d );
-        Backward_Subs( U, workspace->d, workspace->d );
+        tri_solve( L, workspace->r, workspace->d, LOWER );
+        tri_solve( U, workspace->d, workspace->d, UPPER );
         sig_old = sig_new;
         sig_new = Dot( workspace->r, workspace->d, N );
         beta = sig_new / sig_old;
@@ -565,7 +1781,8 @@ int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
     }
 
     //fprintf( fout, "CG took %d iterations\n", i );
-    if( i >= 200 ) {
+    if ( i >= 200 )
+    {
         fprintf( stderr, "CG convergence failed!\n" );
         return i;
     }
@@ -574,8 +1791,9 @@ int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
 }
 
 
-int CG( static_storage *workspace, sparse_matrix *H, 
-        real *b, real tol, real *x, FILE *fout, reax_system *system)
+/* Conjugate Gradient */
+int CG( static_storage *workspace, sparse_matrix *H,
+        real *b, real tol, real *x, FILE *fout )
 {
     int  i, j, N;
     real tmp, alpha, beta, b_norm;
@@ -587,29 +1805,34 @@ int CG( static_storage *workspace, sparse_matrix *H,
 
     Sparse_MatVec( H, x, workspace->q );
     Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-    for( j = 0; j < N; ++j )
+    for ( j = 0; j < N; ++j )
+    {
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+    }
 
     sig_new = Dot( workspace->r, workspace->d, N );
     sig0 = sig_new;
     //Print_Soln( workspace, x, q, b, N );
-    //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n", 
+    //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n",
     // sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) );
     //fprintf( stderr, "sig_new: %f\n", sig_new );
 
-    for( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) {
+    for ( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
+    {
         //for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) {
         Sparse_MatVec( H, workspace->d, workspace->q );
         tmp = Dot( workspace->d, workspace->q, N );
         //fprintf( stderr, "tmp: %f\n", tmp );
-        alpha = sig_new / tmp;    
+        alpha = sig_new / tmp;
         Vector_Add( x, alpha, workspace->d, N );
         //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
         //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
 
-        Vector_Add( workspace->r, -alpha, workspace->q, N );    
-        for( j = 0; j < N; ++j )
+        Vector_Add( workspace->r, -alpha, workspace->q, N );
+        for ( j = 0; j < N; ++j )
+        {
             workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
 
         sig_old = sig_new;
         sig_new = Dot( workspace->r, workspace->p, N );
@@ -620,7 +1843,8 @@ int CG( static_storage *workspace, sparse_matrix *H,
 
     fprintf( stderr, "CG took %d iterations\n", i );
 
-    if( i >= 300 ) {
+    if ( i >= 300 )
+    {
         fprintf( stderr, "CG convergence failed!\n" );
         return i;
     }
@@ -630,8 +1854,8 @@ int CG( static_storage *workspace, sparse_matrix *H,
 
 
 /* Steepest Descent */
-int SDM( static_storage *workspace, sparse_matrix *H, 
-        real *b, real tol, real *x, FILE *fout )
+int SDM( static_storage *workspace, sparse_matrix *H,
+         real *b, real tol, real *x, FILE *fout )
 {
     int  i, j, N;
     real tmp, alpha, beta, b_norm;
@@ -643,23 +1867,28 @@ int SDM( static_storage *workspace, sparse_matrix *H,
 
     Sparse_MatVec( H, x, workspace->q );
     Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-    for( j = 0; j < N; ++j )
+    for ( j = 0; j < N; ++j )
+    {
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+    }
 
     sig = Dot( workspace->r, workspace->d, N );
     sig0 = sig;
 
-    for( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i ) {
+    for ( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i )
+    {
         Sparse_MatVec( H, workspace->d, workspace->q );
 
         sig = Dot( workspace->r, workspace->d, N );
         tmp = Dot( workspace->d, workspace->q, N );
-        alpha = sig / tmp;    
+        alpha = sig / tmp;
 
         Vector_Add( x, alpha, workspace->d, N );
         Vector_Add( workspace->r, -alpha, workspace->q, N );
-        for( j = 0; j < N; ++j )
+        for ( j = 0; j < N; ++j )
+        {
             workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
 
         //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
         //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
@@ -667,10 +1896,55 @@ int SDM( static_storage *workspace, sparse_matrix *H,
 
     fprintf( stderr, "SDM took %d iterations\n", i );
 
-    if( i >= 300 ) {
+    if ( i >= 300 )
+    {
         fprintf( stderr, "SDM convergence failed!\n" );
         return i;
     }
 
     return i;
 }
+
+
+/* Estimate the stability of a 2-side preconditioning scheme
+ * using the factorization A \approx LU. Specifically, estimate the norm of A^{-1}
+ * using the infinity-norm of (LU)^{-1}e, with e = [1 1 ... 1]^T, through 2 triangular solves:
+ *   1) Ly = e
+ *   2) Ux = y
+ * That is, we solve e = LUx for unknown x and return the largest |x_i|
+ *
+ * Reference: Incomplete LU Preconditioning with the Multilevel Fast Multipole Algorithm
+ *   for Electromagnetic Scattering, SIAM J. Sci. Computing, 2007 */
+real condest( const sparse_matrix * const L, const sparse_matrix * const U )
+{
+    unsigned int i, N;
+    real *e, c;
+
+    N = L->n;
+
+    if ( (e = (real*) malloc(sizeof(real) * N)) == NULL )
+    {
+        fprintf( stderr, "Not enough memory for condest. Terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* memset() writes single bytes and cannot produce the value 1.0; assign each entry */
+    for ( i = 0; i < N; ++i )
+    {
+        e[i] = 1.0;
+    }
+
+    tri_solve( L, e, e, LOWER );
+    tri_solve( U, e, e, UPPER );
+
+    /* infinity-norm of the solution; do not read e[0] when the system is empty */
+    c = ( N > 0 ) ? FABS(e[0]) : 0.0;
+    for ( i = 1; i < N; ++i )
+    {
+        c = ( FABS(e[i]) > c ) ? FABS(e[i]) : c;
+    }
+
+    free( e );
+
+    return c;
+}
diff --git a/PuReMD-GPU/src/lin_alg.h b/PuReMD-GPU/src/lin_alg.h
index a515a959494a6eca40fe9f338d2a08118ff3e39a..317afbf94cf2a26a4f48be4a0ad9c66bcef42085 100644
--- a/PuReMD-GPU/src/lin_alg.h
+++ b/PuReMD-GPU/src/lin_alg.h
@@ -21,28 +21,31 @@
 #ifndef __LIN_ALG_H_
 #define __LIN_ALG_H_
 
-#define SIGN(x) (x < 0.0 ? -1 : 1);
-
 #include "mytypes.h"
 
 
-int GMRES( static_storage*, sparse_matrix*,
-           real*, real, real*, FILE* , reax_system* );
+void Transpose( const sparse_matrix * const, sparse_matrix const * );
+void Transpose_I( sparse_matrix * const );
 
-int GMRES_HouseHolder( static_storage*, sparse_matrix*,
-                       real*, real, real*, FILE* , reax_system*  );
+sparse_matrix * setup_graph_coloring( sparse_matrix * const );
 
-int PGMRES( static_storage*, sparse_matrix*, real*, real,
-            sparse_matrix*, sparse_matrix*, real*, FILE*, reax_system* );
+int GMRES( const static_storage * const, const control_params * const,
+        simulation_data * const, const sparse_matrix * const,
+        const real * const, const real, real * const,
+        const FILE * const, const int );
 
-int PCG( static_storage*, sparse_matrix*, real*, real,
-         sparse_matrix*, sparse_matrix*, real*, FILE*, reax_system* );
+int GMRES_HouseHolder( const static_storage * const, const control_params * const,
+        simulation_data * const, const sparse_matrix * const,
+        const real * const, const real, real * const,
+        const FILE * const, const int );
 
 int CG( static_storage*, sparse_matrix*,
-        real*, real, real*, FILE*, reax_system* );
+        real*, real, real*, FILE* );
+
+int SDM( static_storage*, sparse_matrix*,
+         real*, real, real*, FILE* );
 
-int uyduruk_GMRES( static_storage*, sparse_matrix*,
-                   real*, real, real*, int, FILE*, reax_system* );
+real condest( const sparse_matrix * const, const sparse_matrix * const );
 
 
 #endif
diff --git a/PuReMD-GPU/src/list.c b/PuReMD-GPU/src/list.c
index c6f0e55ebad4fc59c07f253a1d216d3242115aff..c52a4cc1cf2b2a8c1d32fdda71c8b0aa7808992a 100644
--- a/PuReMD-GPU/src/list.c
+++ b/PuReMD-GPU/src/list.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -21,9 +22,9 @@
 #include "list.h"
 
 
-char Make_List(int n, int num_intrs, int type, list* l)
+int Make_List( int n, int num_intrs, int type, list* l )
 {
-    char success=1;
+    int ret = SUCCESS;
 
     l->n = n;
     l->num_intrs = num_intrs;
@@ -31,116 +32,170 @@ char Make_List(int n, int num_intrs, int type, list* l)
     l->index = (int*) malloc( n * sizeof(int) );
     l->end_index = (int*) malloc( n * sizeof(int) );
 
-    if (l->index == NULL) success = 0;
-    if (l->end_index == NULL) success = 0;
+    if (l->index == NULL)
+    {
+        ret = FAILURE;
+    }
+    if (l->end_index == NULL)
+    {
+        ret = FAILURE;
+    }
 
     l->type = type;
 
-    switch(type)
+    switch (type)
     {
-        case TYP_VOID:
-            l->select.v = (void *) malloc(l->num_intrs*sizeof(void));
-            if (l->select.v == NULL) success = 0;
-            break;
-
-        case TYP_THREE_BODY:
-            l->select.three_body_list = (three_body_interaction_data*) 
-                malloc(l->num_intrs*sizeof(three_body_interaction_data));
-            if (l->select.three_body_list == NULL) success = 0;
-            break;
-
-        case TYP_BOND:
-            l->select.bond_list = (bond_data*) 
+    case TYP_VOID:
+        l->select.v = (void *) malloc(l->num_intrs * sizeof(char)); /* untyped storage, 1 byte per entry; sizeof(void) is a GNU-only extension */
+        if (l->select.v == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_THREE_BODY:
+        l->select.three_body_list = (three_body_interaction_data*)
+                malloc(l->num_intrs * sizeof(three_body_interaction_data));
+        if (l->select.three_body_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_BOND:
+        l->select.bond_list = (bond_data*)
                 malloc(l->num_intrs * sizeof(bond_data));
-            if (l->select.bond_list == NULL) success = 0;
-            break;
-
-        case TYP_DBO:
-            l->select.dbo_list = (dbond_data*) 
+        if (l->select.bond_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_DBO:
+        l->select.dbo_list = (dbond_data*)
                 malloc(l->num_intrs * sizeof(dbond_data));
-            if (l->select.dbo_list == NULL) success = 0;
-            break;
-
-        case TYP_DDELTA:
-            l->select.dDelta_list = (dDelta_data*) 
-                malloc(l->num_intrs*sizeof(dDelta_data));
-            if (l->select.dDelta_list == NULL) success = 0;
-            break;
-
-        case TYP_FAR_NEIGHBOR:
-            l->select.far_nbr_list = (far_neighbor_data*) 
-                malloc(l->num_intrs*sizeof(far_neighbor_data));
-            if (l->select.far_nbr_list == NULL) success = 0;
-            break;
-
-        case TYP_NEAR_NEIGHBOR:
-            l->select.near_nbr_list = (near_neighbor_data*) 
-                malloc(l->num_intrs*sizeof(near_neighbor_data));
-            if (l->select.near_nbr_list == NULL) success = 0;
-            break;
-
-        case TYP_HBOND:
-            l->select.hbond_list = (hbond_data*)
+        if (l->select.dbo_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_DDELTA:
+        l->select.dDelta_list = (dDelta_data*)
+                malloc(l->num_intrs * sizeof(dDelta_data));
+        if (l->select.dDelta_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_FAR_NEIGHBOR:
+        l->select.far_nbr_list = (far_neighbor_data*)
+                malloc(l->num_intrs * sizeof(far_neighbor_data));
+        if (l->select.far_nbr_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_NEAR_NEIGHBOR:
+        l->select.near_nbr_list = (near_neighbor_data*)
+                malloc(l->num_intrs * sizeof(near_neighbor_data));
+        if (l->select.near_nbr_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    case TYP_HBOND:
+        l->select.hbond_list = (hbond_data*)
                 malloc( l->num_intrs * sizeof(hbond_data) );
-            if (l->select.hbond_list == NULL) success = 0;
-            break;            
-
-        default:
-            l->select.v = (void *) malloc(l->num_intrs*sizeof(void));
-            if (l->select.v == NULL) success = 0;
-            l->type = TYP_VOID;
-            break;      
+        if (l->select.hbond_list == NULL)
+        {
+            ret = FAILURE;
+        }
+        break;
+
+    default:
+        l->select.v = (void *) malloc(l->num_intrs * sizeof(char)); /* untyped storage, 1 byte per entry; sizeof(void) is a GNU-only extension */
+        if (l->select.v == NULL)
+        {
+            ret = FAILURE;
+        }
+        l->type = TYP_VOID;
+        break;
     }
 
-    return success;
+    return ret;
 }
 
 
-void Delete_List(list* l)
+void Delete_List( list* l )
 {
-    if( l->index != NULL )
+    if ( l->index != NULL )
+    {
         free(l->index);
-    if( l->end_index != NULL )
+    }
+    if ( l->end_index != NULL )
+    {
         free(l->end_index);
+    }
 
-    switch(l->type)
+    switch (l->type)
     {
-        case TYP_VOID:
-            if( l->select.v != NULL )
-                free(l->select.v);
-            break;
-        case TYP_THREE_BODY:
-            if( l->select.three_body_list != NULL )
-                free(l->select.three_body_list);
-            break;
-        case TYP_BOND:
-            if( l->select.bond_list != NULL )
-                free(l->select.bond_list);
-            break;
-        case TYP_DBO:
-            if( l->select.dbo_list != NULL )
-                free(l->select.dbo_list);
-            break;
-        case TYP_DDELTA:
-            if( l->select.dDelta_list != NULL )
-                free(l->select.dDelta_list);
-            break;
-        case TYP_FAR_NEIGHBOR:
-            if( l->select.far_nbr_list != NULL )
-                free(l->select.far_nbr_list);
-            break;
-        case TYP_NEAR_NEIGHBOR:
-            if( l->select.near_nbr_list != NULL )
-                free(l->select.near_nbr_list);
-            break;
-        case TYP_HBOND:
-            if( l->select.hbond_list != NULL )
-                free(l->select.hbond_list);
-            break;
-
-        default:
-            // Report fatal error
-            break;
+    case TYP_VOID:
+        if ( l->select.v != NULL )
+        {
+            free(l->select.v);
+        }
+        break;
+    case TYP_THREE_BODY:
+        if ( l->select.three_body_list != NULL )
+        {
+            free(l->select.three_body_list);
+        }
+        break;
+    case TYP_BOND:
+        if ( l->select.bond_list != NULL )
+        {
+            free(l->select.bond_list);
+        }
+        break;
+    case TYP_DBO:
+        if ( l->select.dbo_list != NULL )
+        {
+            free(l->select.dbo_list);
+        }
+        break;
+    case TYP_DDELTA:
+        if ( l->select.dDelta_list != NULL )
+        {
+            free(l->select.dDelta_list);
+        }
+        break;
+    case TYP_FAR_NEIGHBOR:
+        if ( l->select.far_nbr_list != NULL )
+        {
+            free(l->select.far_nbr_list);
+        }
+        break;
+    case TYP_NEAR_NEIGHBOR:
+        if ( l->select.near_nbr_list != NULL )
+        {
+            free(l->select.near_nbr_list);
+        }
+        break;
+    case TYP_HBOND:
+        if ( l->select.hbond_list != NULL )
+        {
+            free(l->select.hbond_list);
+        }
+        break;
+
+    default:
+        fprintf( stderr, "Unrecognized list type. Terminating...\n" );
+        exit( UNKNOWN_OPTION );
+        break;
     }
-}
 
+}
diff --git a/PuReMD-GPU/src/list.h b/PuReMD-GPU/src/list.h
index b90c41419271ca6b859be08ea4005fbe9107c029..5ee4544212218488e6fa84477f8a446f66e73544 100644
--- a/PuReMD-GPU/src/list.h
+++ b/PuReMD-GPU/src/list.h
@@ -24,7 +24,7 @@
 #include "mytypes.h"
 
 
-char Make_List( int, int, int, list* );
+int Make_List( int, int, int, list* );
 void Delete_List( list* );
 
 
diff --git a/PuReMD-GPU/src/lookup.c b/PuReMD-GPU/src/lookup.c
index c439709dc09c77775ed716a39db797fa8c831585..b67bf5b7b96e91562a34ab2af3bbe421a1f5c19c 100644
--- a/PuReMD-GPU/src/lookup.c
+++ b/PuReMD-GPU/src/lookup.c
@@ -1,28 +1,28 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
 #include "lookup.h"
 
-#include "two_body_interactions.h"
-
 #include "index_utils.h"
+#include "two_body_interactions.h"
 
 
 void Make_Lookup_Table(real xmin, real xmax, int n,
@@ -33,44 +33,48 @@ void Make_Lookup_Table(real xmin, real xmax, int n,
     t->xmin = xmin;
     t->xmax = xmax;
     t->n = n;
-    t->dx = (xmax - xmin)/(n-1);
+    t->dx = (xmax - xmin) / (n - 1);
     t->inv_dx = 1.0 / t->dx;
-    t->a = (n-1)/(xmax-xmin);
-    t->y = (real*) malloc(n*sizeof(real));
+    t->a = (n - 1) / (xmax - xmin);
+    t->y = (real*) malloc(n * sizeof(real));
 
-    for(i=0; i < n; i++)
-        t->y[i] = f(i*t->dx + t->xmin);
+    for (i = 0; i < n; i++)
+        t->y[i] = f(i * t->dx + t->xmin);
 
-    // //fprintf(stdout,"dx = %lf\n",t->dx);
+    // fprintf(stdout,"dx = %lf\n",t->dx);
     // for(i=0; i < n; i++)
-    //   //fprintf( stdout,"%d %lf %lf %lf\n", 
+    //   fprintf( stdout,"%d %lf %lf %lf\n",
     //            i, i/t->a+t->xmin, t->y[i], exp(i/t->a+t->xmin) );
 }
 
 
 /* Fills solution into x. Warning: will modify c and d! */
 void Tridiagonal_Solve( const real *a, const real *b,
-        real *c, real *d, real *x, unsigned int n){
+        real *c, real *d, real *x, unsigned int n)
+{
     int i;
     real id;
 
     /* Modify the coefficients. */
-    c[0] /= b[0];    /* Division by zero risk. */
-    d[0] /= b[0];    /* Division by zero would imply a singular matrix. */
-    for(i = 1; i < n; i++){
-        id = (b[i] - c[i-1] * a[i]);  /* Division by zero risk. */
-        c[i] /= id;            /* Last value calculated is redundant. */
-        d[i] = (d[i] - d[i-1] * a[i])/id;
+    c[0] /= b[0]; /* Division by zero risk. */
+    d[0] /= b[0]; /* Division by zero would imply a singular matrix. */
+    for (i = 1; i < n; i++)
+    {
+        id = (b[i] - c[i - 1] * a[i]); /* Division by zero risk. */
+        c[i] /= id;         /* Last value calculated is redundant. */
+        d[i] = (d[i] - d[i - 1] * a[i]) / id;
     }
 
     /* Now back substitute. */
     x[n - 1] = d[n - 1];
-    for(i = n - 2; i >= 0; i--)
+    for (i = n - 2; i >= 0; i--)
+    {
         x[i] = d[i] - c[i] * x[i + 1];
+    }
 }
 
 
-void Natural_Cubic_Spline( const real *h, const real *f, 
+void Natural_Cubic_Spline( const real *h, const real *f,
         cubic_spline_coef *coef, unsigned int n )
 {
     int i;
@@ -84,43 +88,53 @@ void Natural_Cubic_Spline( const real *h, const real *f,
     v = (real*) malloc( n * sizeof(real) );
 
     /* build the linear system */
-    a[0] = a[1] = a[n-1] = 0;
-    for( i = 2; i < n-1; ++i )
-        a[i] = h[i-1];
+    a[0] = a[1] = a[n - 1] = 0;
+    for ( i = 2; i < n - 1; ++i )
+    {
+        a[i] = h[i - 1];
+    }
 
-    b[0] = b[n-1] = 0;
-    for( i = 1; i < n-1; ++i )
-        b[i] = 2 * (h[i-1] + h[i]); 
+    b[0] = b[n - 1] = 0;
+    for ( i = 1; i < n - 1; ++i )
+    {
+        b[i] = 2 * (h[i - 1] + h[i]);
+    }
 
-    c[0] = c[n-2] = c[n-1] = 0;
-    for( i = 1; i < n-2; ++i )
+    c[0] = c[n - 2] = c[n - 1] = 0;
+    for ( i = 1; i < n - 2; ++i )
+    {
         c[i] = h[i];
+    }
 
-    d[0] = d[n-1] = 0;
-    for( i = 1; i < n-1; ++i )
-        d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
+    d[0] = d[n - 1] = 0;
+    for ( i = 1; i < n - 1; ++i )
+    {
+        d[i] = 6 * ((f[i + 1] - f[i]) / h[i] - (f[i] - f[i - 1]) / h[i - 1]);
+    }
 
-    /*//fprintf( stderr, "i  a        b        c        d\n" );
+    /*fprintf( stderr, "i  a        b        c        d\n" );
       for( i = 0; i < n; ++i )
-    //fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
+      fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
     v[0] = 0;
-    v[n-1] = 0;
-    Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 );
-
-    for( i = 1; i < n; ++i ){
-        coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
-        coef[i-1].c = v[i]/2;
-        coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
-        coef[i-1].a = f[i];
+    v[n - 1] = 0;
+    Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n - 2 );
+
+    for ( i = 1; i < n; ++i )
+    {
+        coef[i - 1].d = (v[i] - v[i - 1]) / (6 * h[i - 1]);
+        coef[i - 1].c = v[i] / 2;
+        coef[i - 1].b = (f[i] - f[i - 1]) / h[i - 1] + h[i - 1] * (2 * v[i] + v[i - 1]) / 6;
+        coef[i - 1].a = f[i];
     }
 
-    /*//fprintf( stderr, "i  v  coef\n" );
+    /*fprintf( stderr, "i  v  coef\n" );
       for( i = 0; i < n; ++i )
-    //fprintf( stderr, "%d  %f  %f  %f  %f  %f\n", 
-    i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
+      fprintf( stderr, "%d  %f  %f  %f  %f  %f\n",
+      i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
 }
 
 
+
 void Complete_Cubic_Spline( const real *h, const real *f, real v0, real vlast,
         cubic_spline_coef *coef, unsigned int n )
 {
@@ -136,39 +150,48 @@ void Complete_Cubic_Spline( const real *h, const real *f, real v0, real vlast,
 
     /* build the linear system */
     a[0] = 0;
-    for( i = 1; i < n; ++i )
-        a[i] = h[i-1];
+    for ( i = 1; i < n; ++i )
+    {
+        a[i] = h[i - 1];
+    }
 
-    b[0] = 2*h[0];
-    for( i = 1; i < n; ++i )
-        b[i] = 2 * (h[i-1] + h[i]); 
+    b[0] = 2 * h[0];
+    for ( i = 1; i < n; ++i )
+    {
+        b[i] = 2 * (h[i - 1] + h[i]);
+    }
 
-    c[n-1] = 0;
-    for( i = 0; i < n-1; ++i )
+    c[n - 1] = 0;
+    for ( i = 0; i < n - 1; ++i )
+    {
         c[i] = h[i];
+    }
 
-    d[0] = 6 * (f[1]-f[0])/h[0] - 6 * v0;   
-    d[n-1] = 6 * vlast - 6 * (f[n-1]-f[n-2]/h[n-2]);
-    for( i = 1; i < n-1; ++i )
-        d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
+    d[0] = 6 * (f[1] - f[0]) / h[0] - 6 * v0; /* first row: first-interval slope minus v0 */
+    d[n - 1] = 6 * vlast - 6 * (f[n - 1] - f[n - 2]) / h[n - 2]; /* last row: vlast minus last-interval slope */
+    for ( i = 1; i < n - 1; ++i )
+    {
+        d[i] = 6 * ((f[i + 1] - f[i]) / h[i] - (f[i] - f[i - 1]) / h[i - 1]);
+    }
 
-    /*//fprintf( stderr, "i  a        b        c        d\n" );
+    /*fprintf( stderr, "i  a        b        c        d\n" );
       for( i = 0; i < n; ++i )
-    //fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
+      fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
     Tridiagonal_Solve( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n );
     // Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 );
 
-    for( i = 1; i < n; ++i ){
-        coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
-        coef[i-1].c = v[i]/2;
-        coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
-        coef[i-1].a = f[i];
+    for ( i = 1; i < n; ++i )
+    {
+        coef[i - 1].d = (v[i] - v[i - 1]) / (6 * h[i - 1]);
+        coef[i - 1].c = v[i] / 2;
+        coef[i - 1].b = (f[i] - f[i - 1]) / h[i - 1] + h[i - 1] * (2 * v[i] + v[i - 1]) / 6;
+        coef[i - 1].a = f[i];
     }
 
-    /*//fprintf( stderr, "i  v  coef\n" );
+    /*fprintf( stderr, "i  v  coef\n" );
       for( i = 0; i < n; ++i )
-    //fprintf( stderr, "%d  %f  %f  %f  %f  %f\n", 
-    i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
+      fprintf( stderr, "%d  %f  %f  %f  %f  %f\n",
+      i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
 }
 
 
@@ -178,21 +201,24 @@ void LR_Lookup( LR_lookup_table *t, real r, LR_data *y )
     real base, dif;
 
     i = (int)(r * t->inv_dx);
-    if( i == 0 )  ++i;
-    base = (real)(i+1) * t->dx;
+    if ( i == 0 )
+    {
+        ++i;
+    }
+    base = (real)(i + 1) * t->dx;
     dif = r - base;
-    ////fprintf( stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif );
+    //fprintf( stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif );
 
-    y->e_vdW = ((t->vdW[i].d*dif + t->vdW[i].c)*dif + t->vdW[i].b)*dif + 
-        t->vdW[i].a;
-    y->CEvd = ((t->CEvd[i].d*dif + t->CEvd[i].c)*dif + 
-            t->CEvd[i].b)*dif + t->CEvd[i].a;
+    y->e_vdW = ((t->vdW[i].d * dif + t->vdW[i].c) * dif + t->vdW[i].b) * dif +
+               t->vdW[i].a;
+    y->CEvd = ((t->CEvd[i].d * dif + t->CEvd[i].c) * dif +
+               t->CEvd[i].b) * dif + t->CEvd[i].a;
     //y->CEvd = (3*t->vdW[i].d*dif + 2*t->vdW[i].c)*dif + t->vdW[i].b;
 
-    y->e_ele = ((t->ele[i].d*dif + t->ele[i].c)*dif + t->ele[i].b)*dif + 
-        t->ele[i].a;
-    y->CEclmb = ((t->CEclmb[i].d*dif + t->CEclmb[i].c)*dif + t->CEclmb[i].b)*dif +
-        t->CEclmb[i].a;
+    y->e_ele = ((t->ele[i].d * dif + t->ele[i].c) * dif + t->ele[i].b) * dif +
+               t->ele[i].a;
+    y->CEclmb = ((t->CEclmb[i].d * dif + t->CEclmb[i].c) * dif + t->CEclmb[i].b) * dif +
+                t->CEclmb[i].a;
 
     y->H = y->e_ele * EV_to_KCALpMOL / C_ele;
     //y->H = ((t->H[i].d*dif + t->H[i].c)*dif + t->H[i].b)*dif + t->H[i].a;
@@ -221,147 +247,158 @@ void Make_LR_Lookup_Table( reax_system *system, control_params *control )
 
     num_atom_types = system->reaxprm.num_atom_types;
     dr = control->r_cut / control->tabulate;
-    h = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fh = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fvdw = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fCEvd = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fele = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fCEclmb = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-
-    /* allocate Long-Range LookUp Table space based on 
+    h = (real*) malloc( (control->tabulate + 1) * sizeof(real) );
+    fh = (real*) malloc( (control->tabulate + 1) * sizeof(real) );
+    fvdw = (real*) malloc( (control->tabulate + 1) * sizeof(real) );
+    fCEvd = (real*) malloc( (control->tabulate + 1) * sizeof(real) );
+    fele = (real*) malloc( (control->tabulate + 1) * sizeof(real) );
+    fCEclmb = (real*) malloc( (control->tabulate + 1) * sizeof(real) );
+
+    /* allocate Long-Range LookUp Table space based on
        number of atom types in the ffield file */
-    //LR = (LR_lookup_table**) malloc( num_atom_types * sizeof(LR_lookup_table*) );
-    //for( i = 0; i < num_atom_types; ++i )
-    // LR[i] = (LR_lookup_table*) malloc(num_atom_types * sizeof(LR_lookup_table));
-
-    LR = (LR_lookup_table*) malloc(num_atom_types * num_atom_types * sizeof(LR_lookup_table));
+    LR = (LR_lookup_table*) malloc( num_atom_types * num_atom_types * sizeof(LR_lookup_table) );
 
     /* most atom types in ffield file will not exist in the current
        simulation. to avoid unnecessary lookup table space, determine
        the atom types that exist in the current simulation */
-    for( i = 0; i < MAX_ATOM_TYPES; ++i )
+    for ( i = 0; i < MAX_ATOM_TYPES; ++i )
+    {
         existing_types[i] = 0;
-    for( i = 0; i < system->N; ++i )
+    }
+    for ( i = 0; i < system->N; ++i )
+    {
         existing_types[ system->atoms[i].type ] = 1;
+    }
 
     /* fill in the lookup table entries for existing atom types.
        only lower half should be enough. */
-    for( i = 0; i < num_atom_types; ++i )
-        if( existing_types[i] )
-            for( j = i; j < num_atom_types; ++j )
-                if( existing_types[j] ) {
-                    LR[ index_lr (i,j,num_atom_types) ].xmin = 0;
-                    LR[ index_lr (i,j,num_atom_types) ].xmax = control->r_cut;
-                    LR[ index_lr (i,j,num_atom_types) ].n = control->tabulate + 1;
-                    LR[ index_lr (i,j,num_atom_types) ].dx = dr;
-                    LR[ index_lr (i,j,num_atom_types) ].inv_dx = control->tabulate / control->r_cut;
-                    LR[ index_lr (i,j,num_atom_types) ].y = (LR_data*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(LR_data));
-                    LR[ index_lr (i,j,num_atom_types) ].H = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-                    LR[ index_lr (i,j,num_atom_types) ].vdW = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-                    LR[ index_lr (i,j,num_atom_types) ].CEvd = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-                    LR[ index_lr (i,j,num_atom_types) ].ele = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-                    LR[ index_lr (i,j,num_atom_types) ].CEclmb = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-
-                    for( r = 1; r <= control->tabulate; ++r ) {
-                        LR_vdW_Coulomb( system, control, i, j, r * dr, &(LR[ index_lr (i,j,num_atom_types) ].y[r]) );
-                        h[r] = LR[ index_lr (i,j,num_atom_types) ].dx;
-                        fh[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].H;
-                        fvdw[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_vdW;
-                        fCEvd[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
-                        fele[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_ele;
-                        fCEclmb[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
-
-                        if( r == 1 ){
-                            v0_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
-                            v0_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
+    for ( i = 0; i < num_atom_types; ++i )
+    {
+        if ( existing_types[i] )
+        {
+            for ( j = i; j < num_atom_types; ++j )
+            {
+                if ( existing_types[j] )
+                {
+                    LR[ index_lr(i,j,num_atom_types) ].xmin = 0;
+                    LR[ index_lr(i,j,num_atom_types) ].xmax = control->r_cut;
+                    LR[ index_lr(i,j,num_atom_types) ].n = control->tabulate + 1;
+                    LR[ index_lr(i,j,num_atom_types) ].dx = dr;
+                    LR[ index_lr(i,j,num_atom_types) ].inv_dx = control->tabulate / control->r_cut;
+                    LR[ index_lr(i,j,num_atom_types) ].y = (LR_data*)
+                            malloc( LR[index_lr(i,j,num_atom_types)].n * sizeof(LR_data) );
+                    LR[ index_lr(i,j,num_atom_types) ].H = (cubic_spline_coef*)
+                            malloc( LR[index_lr(i,j,num_atom_types)].n * sizeof(cubic_spline_coef) );
+                    LR[ index_lr(i,j,num_atom_types) ].vdW = (cubic_spline_coef*)
+                            malloc( LR[index_lr(i,j,num_atom_types)].n * sizeof(cubic_spline_coef) );
+                    LR[ index_lr(i,j,num_atom_types) ].CEvd = (cubic_spline_coef*)
+                            malloc( LR[index_lr(i,j,num_atom_types)].n * sizeof(cubic_spline_coef) );
+                    LR[ index_lr(i,j,num_atom_types) ].ele = (cubic_spline_coef*)
+                            malloc( LR[index_lr(i,j,num_atom_types)].n * sizeof(cubic_spline_coef) );
+                    LR[ index_lr(i,j,num_atom_types) ].CEclmb = (cubic_spline_coef*)
+                            malloc( LR[index_lr(i,j,num_atom_types)].n * sizeof(cubic_spline_coef) );
+
+                    for ( r = 1; r <= control->tabulate; ++r )
+                    {
+                        LR_vdW_Coulomb( system, control, i, j, r * dr,
+                                &(LR[ index_lr(i,j,num_atom_types) ].y[r]) );
+                        h[r] = LR[ index_lr(i,j,num_atom_types) ].dx;
+                        fh[r] = LR[ index_lr(i,j,num_atom_types) ].y[r].H;
+                        fvdw[r] = LR[ index_lr(i,j,num_atom_types) ].y[r].e_vdW;
+                        fCEvd[r] = LR[ index_lr(i,j,num_atom_types) ].y[r].CEvd;
+                        fele[r] = LR[ index_lr(i,j,num_atom_types) ].y[r].e_ele;
+                        fCEclmb[r] = LR[ index_lr(i,j,num_atom_types) ].y[r].CEclmb;
+
+                        if ( r == 1 )
+                        {
+                            v0_vdw = LR[ index_lr(i,j,num_atom_types) ].y[r].CEvd;
+                            v0_ele = LR[ index_lr(i,j,num_atom_types) ].y[r].CEclmb;
                         }
-                        else if( r == control->tabulate ){
-                            vlast_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
-                            vlast_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
+                        else if ( r == control->tabulate )
+                        {
+                            vlast_vdw = LR[ index_lr(i,j,num_atom_types) ].y[r].CEvd;
+                            vlast_ele = LR[ index_lr(i,j,num_atom_types) ].y[r].CEclmb;
                         }
                     }
 
-                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fh" );
+                    /*fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fh" );
                       for( r = 1; r <= control->tabulate; ++r )
-                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fh[r] ); */
-                    Natural_Cubic_Spline( &h[1], &fh[1], 
-                            &(LR[ index_lr (i,j,num_atom_types) ].H[1]), control->tabulate+1 );
+                      fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fh[r] ); */
+                    Natural_Cubic_Spline( &h[1], &fh[1],
+                            &(LR[ index_lr(i,j,num_atom_types) ].H[1]), control->tabulate + 1 );
 
-                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fvdw" );
+                    /*fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fvdw" );
                       for( r = 1; r <= control->tabulate; ++r )
-                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fvdw[r] );
-                    //fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw );
-                     */
-                    Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw, 
-                            &(LR[ index_lr (i,j,num_atom_types) ].vdW[1]), control->tabulate+1 );
-                    Natural_Cubic_Spline( &h[1], &fCEvd[1], 
-                            &(LR[ index_lr (i,j,num_atom_types) ].CEvd[1]), control->tabulate+1 );
-
-                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fele" );
+                      fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fvdw[r] );
+                      fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw );
+                    */
+                    Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw,
+                            &(LR[ index_lr(i,j,num_atom_types) ].vdW[1]), control->tabulate + 1 );
+                    Natural_Cubic_Spline( &h[1], &fCEvd[1],
+                            &(LR[ index_lr(i,j,num_atom_types) ].CEvd[1]), control->tabulate + 1 );
+
+                    /*fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fele" );
                       for( r = 1; r <= control->tabulate; ++r )
-                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fele[r] );
-                    //fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele );
-                     */
-                    Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele, 
-                            &(LR[ index_lr (i,j,num_atom_types) ].ele[1]), control->tabulate+1 );
-                    Natural_Cubic_Spline( &h[1], &fCEclmb[1], 
-                            &(LR[ index_lr (i,j,num_atom_types) ].CEclmb[1]), control->tabulate+1 );
+                      fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fele[r] );
+                      fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele );
+                    */
+                    Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele,
+                            &(LR[ index_lr(i,j,num_atom_types) ].ele[1]), control->tabulate + 1 );
+                    Natural_Cubic_Spline( &h[1], &fCEclmb[1],
+                            &(LR[ index_lr(i,j,num_atom_types) ].CEclmb[1]), control->tabulate + 1 );
                 }
+            }
+        }
+    }
 
     /***** //test LR-Lookup table
-      evdw_maxerr = 0;
-      eele_maxerr = 0;
-      for( i = 0; i < num_atom_types; ++i )
-      if( existing_types[i] )
-      for( j = i; j < num_atom_types; ++j )
-      if( existing_types[j] ) {
-      for( r = 1; r <= 100; ++r ) {
-      rand_dist = (real)rand()/RAND_MAX * control->r_cut;
-      LR_vdW_Coulomb( system, control, i, j, rand_dist, &y );
-      LR_Lookup( &(LR[i][j]), rand_dist, &y_spline );
-
-      evdw_abserr = fabs(y.e_vdW - y_spline.e_vdW);
-      evdw_relerr = fabs(evdw_abserr / y.e_vdW);
-      fvdw_abserr = fabs(y.CEvd - y_spline.CEvd);
-      fvdw_relerr = fabs(fvdw_abserr / y.CEvd);
-      eele_abserr = fabs(y.e_ele - y_spline.e_ele);
-      eele_relerr = fabs(eele_abserr / y.e_ele);
-      fele_abserr = fabs(y.CEclmb - y_spline.CEclmb);
-      fele_relerr = fabs(fele_abserr / y.CEclmb);
-
-      if( evdw_relerr > 1e-10 || eele_relerr > 1e-10 ){
-    //fprintf( stderr, "rand_dist = %24.15e\n", rand_dist );
-    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-    y.H, y_spline.H, 
-    fabs(y.H-y_spline.H), fabs((y.H-y_spline.H)/y.H) );  
-    
-    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-    y.e_vdW, y_spline.e_vdW, evdw_abserr, evdw_relerr ); 
-    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-    y.CEvd, y_spline.CEvd, fvdw_abserr, fvdw_relerr ); 
-    
-    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-    y.e_ele, y_spline.e_ele, eele_abserr, eele_relerr ); 
-    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-    y.CEclmb, y_spline.CEclmb, fele_abserr, fele_relerr ); 
-    }
-    
-    if( evdw_relerr > evdw_maxerr )
-    evdw_maxerr = evdw_relerr;
-    if( eele_relerr > eele_maxerr )
-    eele_maxerr = eele_relerr;
-    }
-    }
-    //fprintf( stderr, "evdw_maxerr: %24.15e\n", evdw_maxerr );
-    //fprintf( stderr, "eele_maxerr: %24.15e\n", eele_maxerr );
-         *******/
-    
+     evdw_maxerr = 0;
+     eele_maxerr = 0;
+     for( i = 0; i < num_atom_types; ++i )
+     if( existing_types[i] )
+     for( j = i; j < num_atom_types; ++j )
+     if( existing_types[j] ) {
+     for( r = 1; r <= 100; ++r ) {
+     rand_dist = (real)rand()/RAND_MAX * control->r_cut;
+     LR_vdW_Coulomb( system, control, i, j, rand_dist, &y );
+     LR_Lookup( &(LR[ index_lr(i,j,num_atom_types) ]), rand_dist, &y_spline );
+
+     evdw_abserr = fabs(y.e_vdW - y_spline.e_vdW);
+     evdw_relerr = fabs(evdw_abserr / y.e_vdW);
+     fvdw_abserr = fabs(y.CEvd - y_spline.CEvd);
+     fvdw_relerr = fabs(fvdw_abserr / y.CEvd);
+     eele_abserr = fabs(y.e_ele - y_spline.e_ele);
+     eele_relerr = fabs(eele_abserr / y.e_ele);
+     fele_abserr = fabs(y.CEclmb - y_spline.CEclmb);
+     fele_relerr = fabs(fele_abserr / y.CEclmb);
+
+     if( evdw_relerr > 1e-10 || eele_relerr > 1e-10 ){
+     fprintf( stderr, "rand_dist = %24.15e\n", rand_dist );
+     fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+     y.H, y_spline.H,
+     fabs(y.H-y_spline.H), fabs((y.H-y_spline.H)/y.H) );
+
+     fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+     y.e_vdW, y_spline.e_vdW, evdw_abserr, evdw_relerr );
+     fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+     y.CEvd, y_spline.CEvd, fvdw_abserr, fvdw_relerr );
+
+     fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+     y.e_ele, y_spline.e_ele, eele_abserr, eele_relerr );
+     fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+             y.CEclmb, y_spline.CEclmb, fele_abserr, fele_relerr );
+             }
+
+             if( evdw_relerr > evdw_maxerr )
+             evdw_maxerr = evdw_relerr;
+             if( eele_relerr > eele_maxerr )
+             eele_maxerr = eele_relerr;
+             }
+             }
+             fprintf( stderr, "evdw_maxerr: %24.15e\n", evdw_maxerr );
+             fprintf( stderr, "eele_maxerr: %24.15e\n", eele_maxerr );
+    *******/
+
     free(h);
     free(fh);
     free(fvdw);
@@ -383,24 +420,26 @@ real Lookup( real x, lookup_table* t )
     real b;
     int i;
 
-    /* if ( x < t->xmin) 
-       {
-    //fprintf(stderr,"Domain check %lf > %lf\n",t->xmin,x);
-    exit(0);
+    /*
+    if ( x < t->xmin)
+    {
+       fprintf(stderr,"Domain check %lf > %lf\n",t->xmin,x);
+       exit(0);
     }
-    if ( x > t->xmax) 
+    if ( x > t->xmax)
     {
-    //fprintf(stderr,"Domain check %lf < %lf\n",t->xmax,x);
-    exit(0);
-    } */
+       fprintf(stderr,"Domain check %lf < %lf\n",t->xmax,x);
+       exit(0);
+    }
+    */
 
     i = Lookup_Index_Of( x, t );
     x1 = i * t->dx + t->xmin;
-    x2 = (i+1) * t->dx + t->xmin;
+    x2 = (i + 1) * t->dx + t->xmin;
 
-    b = ( x2 * t->y[i] - x1 * t->y[i+1] ) * t->inv_dx;
-    // //fprintf( stdout,"SLookup_Entry: %d, %lf, %lf, %lf, %lf: %lf, %lf\n",
+    b = ( x2 * t->y[i] - x1 * t->y[i + 1] ) * t->inv_dx;
+    // fprintf( stdout,"SLookup_Entry: %d, %lf, %lf, %lf, %lf: %lf, %lf\n",
     //          i,x1,x2,x,b,t->one_over_dx*(t->y[i+1]-t->y[i])*x+b,exp(x));
 
-    return t->inv_dx * ( t->y[i+1] - t->y[i] ) * x + b;
+    return t->inv_dx * ( t->y[i + 1] - t->y[i] ) * x + b;
 }
diff --git a/PuReMD-GPU/src/mytypes.h b/PuReMD-GPU/src/mytypes.h
index 0eb1856a578df369aba28d8dbfb55dcf57348e9d..b04a9de39c8dbf487cac6d02a5e098a220862d5a 100644
--- a/PuReMD-GPU/src/mytypes.h
+++ b/PuReMD-GPU/src/mytypes.h
@@ -28,12 +28,6 @@
     #define GLOBAL __global__
     #define HOST_DEVICE __host__ __device__
 
-    #include <cuda_runtime.h>
-    #include <cuda.h>
-    #include <cuda_runtime_api.h>
-
-    #include <cublas_v2.h>
-    #include <cusparse_v2.h>
     #if __CUDA_ARCH__ < 600
       #define MYATOMICADD myAtomicAdd
     #else
@@ -55,14 +49,25 @@
   #include "config.h"
 #endif
 
-#include "math.h"
-//#include "random.h"
-#include "stdio.h"
-#include "stdlib.h"
-#include "string.h"
-#include "sys/time.h"
-#include "time.h"
-#include "zlib.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <time.h>
+
+#ifdef _OPENMP
+  #include <omp.h>
+#endif
+
+#ifdef HAVE_CUDA
+  #include <cuda_runtime.h>
+  #include <cuda.h>
+  #include <cuda_runtime_api.h>
+
+  #include <cublas_v2.h>
+  #include <cusparse_v2.h>
+#endif
 
 //#define DEBUG_FOCUS
 //#define TEST_FORCES
@@ -75,6 +80,7 @@
 #define TRUE  1
 #define FALSE 0
 
+#define LOG    log
 #define EXP    exp
 #define SQRT   sqrt
 #define POW    pow
@@ -82,6 +88,8 @@
 #define COS    cos
 #define SIN    sin
 #define TAN    tan
+#define FABS   fabs
+#define FMOD   fmod
 
 #define SQR(x)        ((x)*(x))
 #define CUBE(x)       ((x)*(x)*(x))
@@ -90,6 +98,15 @@
 #define MAX( x, y )   (((x) > (y)) ? (x) : (y))
 #define MIN( x, y )   (((x) < (y)) ? (x) : (y))
 
+/* IS_NAN_REAL(a): NaN test via C99 isnan() when math.h defines NAN, else constant 0.
+ * Note: function choice must match REAL typedef below */
+#ifdef NAN
+  #define IS_NAN_REAL(a) (isnan(a))
+#else
+  #warning "No support for NaN"
+  #define IS_NAN_REAL(a) (0)
+#endif
+
 #define PI            3.14159265
 #define C_ele          332.06371
 //#define K_B         503.398008   // kcal/mol/K
@@ -106,7 +123,11 @@
 #define AVOGNR          6.0221367e23
 #define P_CONV          1.0e-24 * AVOGNR * JOULES_to_CAL
 
-#define MAX_STR             100      // MAX STRing length (used for naming)
+#define MAX_STR             1024
+#define MAX_LINE            1024
+#define MAX_TOKENS          1024
+#define MAX_TOKEN_LEN       1024
+
 #define MAX_ATOM_ID         100000
 #define MAX_RESTRICT        15
 #define MAX_MOLECULE_SIZE   20
@@ -124,23 +145,7 @@
 #define MAX_ITR             10
 #define RESTART             50
 
-#define FILE_NOT_FOUND_ERR    10
-#define UNKNOWN_ATOM_TYPE_ERR 11
-#define CANNOT_OPEN_OUTFILE   12
-#define INIT_ERR              13
-#define INSUFFICIENT_SPACE    14
-#define UNKNOWN_OPTION        15
-#define INVALID_INPUT         16
-
-#define C_ATOM  0
-#define H_ATOM  1
-#define O_ATOM  2
-#define N_ATOM  3
-#define S_ATOM  4
-#define SI_ATOM 5
-#define GE_ATOM 6
-#define X_ATOM  7
-
+/* tolerance used for validating GPU results against host */
 #define GPU_TOLERANCE   1e-5
 
 #define ZERO           0.000000000000000e+00
@@ -157,6 +162,7 @@
 #define DANGER_ZONE 0.95
 #define LOOSE_ZONE  0.75
 
+//TODO: make enum
 #define RES_GRID_ATOMS      0x01
 #define RES_GRID_TOP        0x02
 #define RES_GRID_MARK       0x03
@@ -165,17 +171,21 @@
 #define RES_GRID_NBRS       0x06
 #define RES_GRID_NBRS_CP    0x07
 
+//TODO: make enum
 #define RES_SYSTEM_ATOMS            0x10
 #define RES_SYSTEM_SIMULATION_BOX   0x11
 
+//TODO: make enum
 #define RES_REAX_INT_SBP    0x20
 #define RES_REAX_INT_TBP    0x21
 #define RES_REAX_INT_THBP   0x22
 #define RES_REAX_INT_HBP    0x23
 #define RES_REAX_INT_FBP    0x24
 
+//TODO: make enum
 #define RES_SIMULATION_DATA 0x30
 
+//TODO: make enum
 #define RES_STORAGE                    0x401
 #define RES_STORAGE_HBOND_INDEX        0x402
 #define RES_STORAGE_TOTAL_BOND_ORDER   0x403
@@ -229,13 +239,17 @@
 #define RES_STORAGE_RESTRICTED_LIST    0x432
 #define RES_STORAGE_ORIG_ID                0x433
 
+//TODO: make enum
 #define RES_CONTROL_PARAMS  0x50
 
+//TODO: make enum
 #define RES_GLOBAL_PARAMS       0x60
 
+//TODO: make enum
 #define RES_SPARSE_MATRIX_INDEX     0x70
 #define RES_SPARSE_MATRIX_ENTRY     0x71
 
+//TODO: make enum
 #define RES_LR_LOOKUP_Y             0x80
 #define RES_LR_LOOKUP_H             0x81
 #define RES_LR_LOOKUP_VDW               0x82
@@ -244,6 +258,7 @@
 #define RES_LR_LOOKUP_CECLMB            0x85
 #define RES_LR_LOOKUP_TABLE         0x86
 
+//TODO: make enum
 #define RES_SCRATCH                     0x90
 
 #define LIST_INDEX                      0x00
@@ -314,17 +329,78 @@ typedef real rvec[3];
 typedef int  ivec[3];
 typedef real rtensor[3][3];
 
-enum {NVE, NVT, NPT, sNPT, iNPT, ensNR, bNVT};
-enum {FAR_NBRS, NEAR_NBRS, THREE_BODIES, BONDS, OLD_BONDS,
-      HBONDS, DBO, DDELTA, LIST_N
-     };
-enum {TYP_VOID, TYP_THREE_BODY, TYP_BOND, TYP_HBOND, TYP_DBO,
-      TYP_DDELTA, TYP_FAR_NEIGHBOR, TYP_NEAR_NEIGHBOR, TYP_N
-     };
-enum {UNKNOWN, WATER};
-enum {NO_ANALYSIS, FRAGMENTS, REACTIONS, NUM_ANALYSIS};
-enum {WRITE_ASCII, WRITE_BINARY, RF_N};
-enum {XYZ, PDB, BGF, ASCII_RESTART, BINARY_RESTART, GF_N};
+/* config params */
+enum ensemble
+{
+    NVE = 0, NVT = 1, NPT = 2, sNPT = 3, iNPT = 4, ensNR = 5, bNVT = 6,
+};
+
+enum interaction_list_offets
+{
+    FAR_NBRS = 0, NEAR_NBRS = 1, THREE_BODIES = 2, BONDS = 3, OLD_BONDS = 4,
+    HBONDS = 5, DBO = 6, DDELTA = 7, LIST_N = 8,
+};
+
+enum interaction_type
+{
+    TYP_VOID = 0, TYP_THREE_BODY = 1, TYP_BOND = 2, TYP_HBOND = 3, TYP_DBO = 4,
+    TYP_DDELTA = 5, TYP_FAR_NEIGHBOR = 6, TYP_NEAR_NEIGHBOR = 7, TYP_N = 8,
+};
+
+enum errors
+{
+    FILE_NOT_FOUND = -10,
+    UNKNOWN_ATOM_TYPE = -11,
+    CANNOT_OPEN_FILE = -12,
+    CANNOT_INITIALIZE = -13,
+    INSUFFICIENT_MEMORY = -14,
+    UNKNOWN_OPTION = -15,
+    INVALID_INPUT = -16,
+    INVALID_GEO = -17,
+    NUMERIC_BREAKDOWN = -18,
+    RUNTIME_ERROR = -19,
+};
+
+enum atoms
+{
+    C_ATOM = 0, H_ATOM = 1, O_ATOM = 2, N_ATOM = 3,
+    S_ATOM = 4, SI_ATOM = 5, GE_ATOM = 6, X_ATOM = 7,
+};
+
+enum molecule_type
+{
+    UNKNOWN = 0, WATER = 1,
+};
+
+enum molecular_analysis_type
+{
+    NO_ANALYSIS = 0, FRAGMENTS = 1, REACTIONS = 2, NUM_ANALYSIS = 3,
+};
+
+enum restart_format
+{
+    WRITE_ASCII = 0, WRITE_BINARY = 1, RF_N = 2,
+};
+
+enum geo_formats
+{
+    CUSTOM = 0, PDB = 1, BGF = 2, ASCII_RESTART = 3, BINARY_RESTART = 4, GF_N = 5,
+};
+
+enum solver
+{
+    GMRES_S = 0, GMRES_H_S = 1, CG_S = 2, SDM_S = 3,
+};
+
+enum pre_comp
+{
+    DIAG_PC = 0, ICHOLT_PC = 1, ILU_PAR_PC = 2, ILUT_PAR_PC = 3, ILU_SUPERLU_MT_PC = 4,
+};
+
+enum pre_app
+{
+    NONE_PA = 0, TRI_SOLVE_PA = 1, TRI_SOLVE_LEVEL_SCHED_PA = 2, TRI_SOLVE_GC_PA = 3, JACOBI_ITER_PA = 4,
+};
 
 
 /* Global params mapping */
@@ -502,33 +578,36 @@ typedef struct
 {
     int num_atom_types;
     global_parameters gp;
-    global_parameters d_gp;
-
     single_body_parameters *sbp;
-    single_body_parameters *d_sbp;
-
     two_body_parameters *tbp;
-    two_body_parameters *d_tbp;
-
     three_body_header *thbp;
-    three_body_header *d_thbp;
-
     hbond_parameters *hbp;
-    hbond_parameters *d_hbp;
-
     four_body_header *fbp;
-    four_body_header *d_fbp;
 
+#ifdef HAVE_CUDA
+    global_parameters d_gp;
+    single_body_parameters *d_sbp;
+    two_body_parameters *d_tbp;
+    three_body_header *d_thbp;
+    hbond_parameters *d_hbp;
+    four_body_header *d_fbp;
+#endif
 } reax_interaction;
 
 
 typedef struct
 {
-    rvec x;        /* Position, velocity, force on atom */
+    /* Position, velocity, force on atom */
+    rvec x;
     rvec v;
     rvec f;
-    real q;              /* Charge on the atom */
-    int  type;           /* Type of this atom */
+
+    /* Charge on the atom */
+    real q;
+
+    /* Type of this atom */
+    int type;
+
     char name[5];
     char spare[7];
 } reax_atom;
@@ -561,9 +640,6 @@ typedef struct
     rvec len;
     rvec inv_len;
 
-    //CUDA
-    int    max_cuda_nbrs; //TODO remove this not used anymore
-
     int   *atoms;
     int   *top;
     int   *mark;
@@ -578,7 +654,16 @@ typedef struct
 {
     int N;
 
-    //CUDA
+    reax_atom *atoms;
+    reax_interaction reaxprm;
+    simulation_box box;
+    grid g;
+
+#ifdef HAVE_CUDA
+    reax_atom *d_atoms;
+    simulation_box *d_box;
+    grid d_g;
+
     //int max_thb_intrs;
     int max_sparse_matrix_entries;
     int num_nbrs;
@@ -586,17 +671,7 @@ typedef struct
     int num_hbonds;
     int num_thbodies;
     int init_thblist;
-
-    reax_atom *atoms;
-    reax_atom *d_atoms;
-
-    reax_interaction reaxprm;
-
-    simulation_box box;
-    simulation_box *d_box;
-
-    grid g;
-    grid d_g;
+#endif
 } reax_system;
 
 
@@ -616,23 +691,22 @@ typedef struct
        2 : NPT  (Parrinello-Rehman-Nose-Hoover) Anisotropic
        3 : sNPT (Parrinello-Rehman-Nose-Hoover) semiisotropic
        4 : iNPT (Parrinello-Rehman-Nose-Hoover) isotropic */
-    int  ensemble;
-    int  nsteps;
-    int  periodic_boundaries;
-    int  restrict_bonds;
-    int  tabulate;
+    int ensemble;
+    int nsteps;
+    int periodic_boundaries;
+    int restrict_bonds;
+    int tabulate;
     ivec periodic_images;
     real dt;
 
     int reneighbor;
     real vlist_cut;
     real nbr_cut;
-    real r_cut, r_low; // upper and lower taper
+    real r_cut, r_sp_cut, r_low; // upper, reduced upper, and lower taper
     real bo_cut;
     real thb_cut;
     real hb_cut;
     real Tap7, Tap6, Tap5, Tap4, Tap3, Tap2, Tap1, Tap0;
-    real q_err;
     int  max_far_nbrs;
 
     real T_init, T_final, T;
@@ -656,16 +730,26 @@ typedef struct
     int freq_diffusion_coef;
     int restrict_type;
 
-    int refactor;
-    real droptol;
+    unsigned int qeq_solver_type;
+    real qeq_solver_q_err;
+    real qeq_domain_sparsity;
+    unsigned int qeq_domain_sparsify_enabled;
+    unsigned int pre_comp_type;
+    unsigned int pre_comp_refactor;
+    real pre_comp_droptol;
+    unsigned int pre_comp_sweeps;
+    unsigned int pre_app_type;
+    unsigned int pre_app_jacobi_iters;
 
     int molec_anal;
     int freq_molec_anal;
     real bg_cut;
     int num_ignored;
-    int  ignore[MAX_ATOM_TYPES];
+    int ignore[MAX_ATOM_TYPES];
 
+#ifdef HAVE_CUDA
     void *d_control;
+#endif
 } control_params;
 
 
@@ -720,7 +804,14 @@ typedef struct
     real bonded;
     real nonb;
     real QEq;
-    int  matvecs;
+    real QEq_sort_mat_rows;
+    real pre_comp;
+    real pre_app;
+    int solver_iters;
+    real solver_spmv;
+    real solver_vector_ops;
+    real solver_orthog;
+    real solver_tri_solve;
 } reax_timing;
 
 
@@ -776,9 +867,11 @@ typedef struct
     rvec tot_press;
 
     reax_timing timing;
-    //CUDA
+
+#ifdef HAVE_CUDA
     reax_timing d_timing;
     void *d_simulation_data;
+#endif
 } simulation_data;
 
 
@@ -789,8 +882,9 @@ typedef struct
     real theta, cos_theta;
     rvec dcos_di, dcos_dj, dcos_dk;
 
-    //CUDA
+#ifdef HAVE_CUDA
     int i, j, k;
+#endif
 } three_body_interaction_data;
 
 
@@ -813,9 +907,11 @@ typedef struct
     rvec dvec;
     // real H; //, Tap, inv_dr3gamij_1, inv_dr3gamij_3;
 
-    //CUDA
+#ifdef HAVE_CUDA
     //int sym_index;
     //rvec h_f;
+#endif
+
     char spare[16];
 } far_neighbor_data;
 
@@ -868,6 +964,7 @@ typedef struct
     rvec dvec;
     bond_order_data bo_data;
 
+#ifdef HAVE_CUDA
     //single body -- lone pair
     real scratch;
 
@@ -887,42 +984,47 @@ typedef struct
 
     //compute_total_forces
     rvec t_f;
+#endif
 } bond_data;
 
 
+/* compressed row storage (crs) format
+ * See, e.g.,
+ *   http://netlib.org/linalg/html_templates/node91.html#SECTION00931100000000000000
+ *
+ *   m: number of nonzeros (NNZ) ALLOCATED
+ *   n: number of rows
+ *   start: row pointer (last element contains ACTUAL NNZ)
+ *   j: column index for corresponding matrix entry
+ *   val: matrix entry
+ * */
 typedef struct
 {
-    int j;
-    real val;
-} sparse_matrix_entry;
-
-
-typedef struct
-{
-    int n, m;
-    int *start;
-    //CUDA
-    int *end;
-    sparse_matrix_entry *entries;
-
-    int *j;
+    unsigned int n, m;
+    unsigned int *start;
+#ifdef HAVE_CUDA
+    unsigned int *end;
+#endif
+    unsigned int *j;
     real *val;
-
 } sparse_matrix;
 
 
 typedef struct
 {
-    int estimate_nbrs;
     int num_far;
     int Htop;
     int hbonds;
     int num_hbonds;
     int bonds;
     int num_bonds;
-    int thbody;
     int num_3body;
     int gcell_atoms;
+
+#ifdef HAVE_CUDA
+    int estimate_nbrs;
+    int thbody;
+#endif
 } reallocate_data;
 
 
@@ -937,7 +1039,7 @@ typedef struct
     rvec *dDeltap_self;
 
     /* QEq storage */
-    sparse_matrix H, L, U;
+    sparse_matrix *H, *H_sp, *L, *U;
     real *droptol;
     real *w;
     real *Hdia_inv;
@@ -990,6 +1092,7 @@ typedef struct
 } static_storage;
 
 
+/* interaction lists */
 typedef struct
 {
     int n;
@@ -1127,25 +1230,25 @@ typedef void (*evolve_function)(reax_system*, control_params*,
         list**, output_controls*);
 
 typedef real (*lookup_function)(real);
-extern lookup_table Exp, Sqrt, Cube_Root, Four_Third_Root, Cos, Sin, ACos;
 
+extern lookup_table Exp, Sqrt, Cube_Root, Four_Third_Root, Cos, Sin, ACos;
 extern LR_lookup_table *LR;
 
-
 typedef void (*get_far_neighbors_function)(rvec, rvec, simulation_box*,
-        control_params*, far_neighbor_data*,
-        int*);
+        control_params*, far_neighbor_data*, int*);
+
+extern reax_timing d_timing;
 
-/* CUDA structures */
+#ifdef HAVE_CUDA
 extern list *dev_lists;
 extern static_storage *dev_workspace;
 extern LR_lookup_table *d_LR;
-extern reax_timing d_timing;
 
-//Scratch Pad usage.
+/* scratch pad usage */
 extern void *scratch;
 extern int BLOCKS, BLOCKS_POW_2, BLOCK_SIZE;
 extern int MATVEC_BLOCKS;
+#endif
 
 
 #endif
diff --git a/PuReMD-GPU/src/neighbors.c b/PuReMD-GPU/src/neighbors.c
index 5f425e672080d2d4a272f7aca1859c45d8dde17d..7a005f081d57e26cc86bde6501e72bd64d6bc2cc 100644
--- a/PuReMD-GPU/src/neighbors.c
+++ b/PuReMD-GPU/src/neighbors.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -26,51 +27,11 @@
 #include "list.h"
 #include "reset_utils.h"
 #include "system_props.h"
+#include "tool_box.h"
 #include "vector.h"
 
 
-int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, 
-        real cutoff, far_neighbor_data *data )
-{
-    real norm_sqr, d, tmp;
-    int i;
-
-    norm_sqr = 0;
-
-    for( i = 0; i < 3; i++ ) { 
-        d = x2[i] - x1[i];
-        tmp = SQR(d);
-
-        if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) {    
-            if( x2[i] > x1[i] ) { 
-                d -= box->box_norms[i];
-                data->rel_box[i] = -1; 
-            }   
-            else {
-                d += box->box_norms[i];
-                data->rel_box[i] = +1; 
-            }   
-
-            data->dvec[i] = d;
-            norm_sqr += SQR(d);
-        }   
-        else {
-            data->dvec[i] = d;
-            norm_sqr += tmp;
-            data->rel_box[i] = 0;
-        }   
-    }
-
-    if( norm_sqr <= SQR(cutoff) ){
-        data->d = sqrt(norm_sqr);
-        return 1;
-    }
-
-    return 0;
-}
-
-
-void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
+void Generate_Neighbor_Lists( reax_system *system, control_params *control,
         simulation_data *data, static_storage *workspace,
         list **lists, output_controls *out_control )
 {
@@ -86,54 +47,61 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     far_neighbor_data *nbr_data;
     real t_start, t_elapsed;
 
+    t_start = Get_Time( );
     // fprintf( stderr, "\n\tentered nbrs - " );
     g = &( system->g );
     far_nbrs = (*lists) + FAR_NBRS;
     Bin_Atoms( system, workspace );
-
-    t_start = Get_Time( );
-
     // fprintf( stderr, "atoms sorted - " );
     num_far = 0;
 
     /* first pick up a cell in the grid */
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
-                nbrs = &g->nbrs[ index_grid_nbrs (i,j,k,0,g) ];
-                nbrs_cp = &g->nbrs_cp[ index_grid_nbrs (i,j,k,0,g) ];
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
+                nbrs = &g->nbrs[ index_grid_nbrs(i,j,k,0,g) ];
+                nbrs_cp = &g->nbrs_cp[ index_grid_nbrs(i,j,k,0,g) ];
                 //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
 
                 /* pick up an atom from the current cell */
-                for(l = 0; l < g->top[ index_grid_3d (i,j,k,g) ]; ++l ){
-                    atom1 = g->atoms[ index_grid_atoms (i,j,k,l,g) ];
+                for(l = 0; l < g->top[ index_grid_3d(i,j,k,g) ]; ++l )
+                {
+                    atom1 = g->atoms[ index_grid_atoms(i,j,k,l,g) ];
                     Set_Start_Index( atom1, num_far, far_nbrs );
                     //fprintf( stderr, "\tatom %d\n", atom1 );
 
                     itr = 0;
-                    while( nbrs[itr][0] >= 0 ){
+                    while ( nbrs[itr][0] >= 0 )
+                    {
                         x = nbrs[itr][0];
                         y = nbrs[itr][1];
                         z = nbrs[itr][2];
                         //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
 
-                        if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-                                SQR(control->vlist_cut) ) {     
-                            nbr_atoms = &g->atoms[ index_grid_atoms (x,y,z,0,g) ];
-                            max = g->top[ index_grid_3d (x,y,z,g) ];
+                        if ( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <=
+                                SQR(control->vlist_cut) )
+                        {
+                            nbr_atoms = &g->atoms[ index_grid_atoms(x,y,z,0,g) ];
+                            max = g->top[ index_grid_3d(x,y,z,g) ];
                             //fprintf( stderr, "\t\tmax: %d\n", max );
 
                             /* pick up another atom from the neighbor cell */
-                            for( m = 0; m < max; ++m ) {
+                            for ( m = 0; m < max; ++m )
+                            {
                                 atom2 = nbr_atoms[m];
-                                if( atom1 > atom2 ) {
+                                if ( atom1 > atom2 )
+                                {
                                     nbr_data = &(far_nbrs->select.far_nbr_list[num_far]);
-                                    if(Are_Far_Neighbors(system->atoms[atom1].x,
-                                                system->atoms[atom2].x, 
-                                                &(system->box), control->vlist_cut, 
-                                                nbr_data)) {
+                                    //fprintf (stderr, " %f %f %f \n", nbr_data->dvec[0], nbr_data->dvec[1], nbr_data->dvec[2]);
+                                    if (Are_Far_Neighbors(system->atoms[atom1].x,
+                                                          system->atoms[atom2].x,
+                                                          &(system->box), control->vlist_cut,
+                                                          nbr_data))
+                                    {
                                         nbr_data->nbr = atom2;
-
                                         ++num_far;
                                     }
                                 }
@@ -144,20 +112,22 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
                     }
 
                     Set_End_Index( atom1, num_far, far_nbrs );
-                    //fprintf(stderr, "i:%d, start: %d, end: %d - itr: %d\n", 
+                    //fprintf(stderr, "i:%d, start: %d, end: %d - itr: %d\n",
                     //  atom1,Start_Index(atom1,far_nbrs),End_Index(atom1,far_nbrs),
-                    //  itr); 
+                    //  itr);
                 }
             }
+        }
+    }
 
-    fprintf (stderr, " TOTAL HOST NEIGHBORS : %d \n", num_far);
-
-    if( num_far > far_nbrs->num_intrs * DANGER_ZONE ) {
+    if ( num_far > far_nbrs->num_intrs * DANGER_ZONE )
+    {
         workspace->realloc.num_far = num_far;
-        if( num_far > far_nbrs->num_intrs ){
+        if ( num_far > far_nbrs->num_intrs )
+        {
             fprintf( stderr, "step%d-ran out of space on far_nbrs: top=%d, max=%d",
-                    data->step, num_far, far_nbrs->num_intrs );
-            exit( INSUFFICIENT_SPACE );
+                     data->step, num_far, far_nbrs->num_intrs );
+            exit( INSUFFICIENT_MEMORY );
         }
     }
 
@@ -165,25 +135,24 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     data->timing.nbrs += t_elapsed;
 
 #if defined(DEBUG)
-    for( i = 0; i < system->N; ++i ) {
-        qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
-                Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
-                compare_far_nbrs ); 
+    for ( i = 0; i < system->N; ++i )
+    {
+        qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]),
+               Num_Entries(i, far_nbrs), sizeof(far_neighbor_data),
+               compare_far_nbrs );
     }
 #endif
-
-#if defined(DEBUG_FOCUS)  
-    //fprintf( stderr, "nbrs - ");
-    //fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nbrs - ");
+    fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
 #endif
-
 #if defined(TEST_ENERGY)
     //Print_Far_Neighbors( system, control, workspace, lists );
 #endif
 }
 
 
-int Estimate_NumNeighbors( reax_system *system, control_params *control, 
+int Estimate_NumNeighbors( reax_system *system, control_params *control,
         static_storage *workspace, list **lists )
 {
     int  i, j, k, l, m, itr;
@@ -195,53 +164,63 @@ int Estimate_NumNeighbors( reax_system *system, control_params *control,
     rvec *nbrs_cp;
     grid *g;
     far_neighbor_data nbr_data;
-
+#ifdef HAVE_CUDA
     int start = 0, finish = 0;
+#endif
 
     // fprintf( stderr, "\n\tentered nbrs - " );
     g = &( system->g );
     Bin_Atoms( system, workspace );
     // fprintf( stderr, "atoms sorted - " );
     num_far = 0;
+#ifdef HAVE_CUDA
     g->max_cuda_nbrs = 0;
+#endif
 
     /* first pick up a cell in the grid */
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
-                nbrs = &g->nbrs[index_grid_nbrs (i,j,k,0,g) ];
-                nbrs_cp = &g->nbrs_cp[index_grid_nbrs (i,j,k,0,g) ];
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
+                nbrs = &g->nbrs[ index_grid_nbrs(i,j,k,0,g) ];
+                nbrs_cp = &g->nbrs_cp[ index_grid_nbrs(i,j,k,0,g) ];
                 //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
 
                 /* pick up an atom from the current cell */
-                for(l = 0; l < g->top[index_grid_3d (i,j,k,g) ]; ++l ){
-                    atom1 = g->atoms[index_grid_atoms (i,j,k,l,g) ];
-                    start = num_far;
+                for(l = 0; l < g->top[ index_grid_3d(i,j,k,g) ]; ++l )
+                {
+                    atom1 = g->atoms[ index_grid_atoms(i,j,k,l,g) ];
 
                     itr = 0;
-                    while( nbrs[itr][0] >= 0 ){
+                    while ( nbrs[itr][0] >= 0 )
+                    {
                         x = nbrs[itr][0];
                         y = nbrs[itr][1];
                         z = nbrs[itr][2];
                         //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
 
-                        if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-                                SQR(control->vlist_cut) ) {     
-                            nbr_atoms = &g->atoms[index_grid_atoms (x,y,z,0,g) ];
-                            max = g->top[index_grid_3d (x,y,z,g) ];
+                        if ( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <=
+                                SQR(control->vlist_cut) )
+                        {
+                            nbr_atoms = &g->atoms[ index_grid_atoms(x,y,z,0,g) ];
+                            max = g->top[ index_grid_3d(x,y,z,g) ];
                             //fprintf( stderr, "\t\tmax: %d\n", max );
 
                             /* pick up another atom from the neighbor cell -
-                               we have to compare atom1 with its own periodic images as well, 
-                               that's why there is also equality in the if stmt below */
-                            for( m = 0; m < max; ++m ) {
+                               comparing atom1 with its own periodic images as well would need '>=';
+                               NOTE(review): the test below is strict (atom1 > atom2) - confirm intent */
+                            for ( m = 0; m < max; ++m )
+                            {
                                 atom2 = nbr_atoms[m];
                                 //if( nbrs[itr+1][0] >= 0 || atom1 > atom2 ) {
-                                if( atom1 > atom2 ) {
-                                    if(Are_Far_Neighbors(system->atoms[atom1].x,
-                                                system->atoms[atom2].x, 
-                                                &(system->box), control->vlist_cut, 
-                                                &nbr_data))
+                                if ( atom1 > atom2 )
+                                {
+                                    if (Are_Far_Neighbors(system->atoms[atom1].x,
+                                                          system->atoms[atom2].x,
+                                                          &(system->box), control->vlist_cut,
+                                                          &nbr_data))
                                         ++num_far;
                                 }
                             }
@@ -250,38 +229,46 @@ int Estimate_NumNeighbors( reax_system *system, control_params *control,
                         ++itr;
                     }
 
-                    // finish note
+#ifdef HAVE_CUDA
                     finish = num_far;
-                    if (g->max_cuda_nbrs <= (finish - start)){
+                    if (g->max_cuda_nbrs <= (finish - start))
+                    {
                         g->max_cuda_nbrs    = finish - start;
                     }
+#endif
                 }
             }
+        }
+    }
 
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "estimate nbrs done, num_far: %d\n", num_far );
 #endif
-
     return num_far * SAFE_ZONE;
 }
 
 
-//Code not used anymore
 #if defined DONE
-
-void Choose_Neighbor_Finder( reax_system *system, control_params *control, 
+void Choose_Neighbor_Finder( reax_system *system, control_params *control,
         get_far_neighbors_function *Get_Far_Neighbors )
 {
-    if( control->periodic_boundaries )
+    if ( control->periodic_boundaries )
     {
-        if( system->box.box_norms[0] > 2.0 * control->vlist_cut &&
+        if ( system->box.box_norms[0] > 2.0 * control->vlist_cut &&
                 system->box.box_norms[1] > 2.0 * control->vlist_cut &&
                 system->box.box_norms[2] > 2.0 * control->vlist_cut )
+        {
             (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Big_Box;
-        else  (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Small_Box;
+        }
+        else
+        {
+            (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Small_Box;
+        }
     }
     else
+    {
         (*Get_Far_Neighbors) = Get_NonPeriodic_Far_Neighbors;
+    }
 }
 
 
@@ -327,18 +314,28 @@ inline int can_Bond( static_storage *workspace, int atom1, int atom2 )
 
     // fprintf( stderr, "can bond %6d %6d?\n", atom1, atom2 );
 
-    if( !workspace->restricted[ atom1 ] && !workspace->restricted[ atom2 ] )
-        return 1;
+    if ( !workspace->restricted[ atom1 ] && !workspace->restricted[ atom2 ] )
+    {
+        return TRUE; /* neither atom is restricted (pre-patch: return 1) */
+    }
 
-    for( i = 0; i < workspace->restricted[ atom1 ]; ++i )
-        if( workspace->restricted_list[ atom1 ][i] == atom2 )
-            return 1;
+    for ( i = 0; i < workspace->restricted[ atom1 ]; ++i )
+    {
+        if ( workspace->restricted_list[ atom1 ][i] == atom2 )
+        {
+            return TRUE; /* atom2 appears in atom1's restricted_list */
+        }
+    }
 
-    for( i = 0; i < workspace->restricted[ atom2 ]; ++i )
-        if( workspace->restricted_list[ atom2 ][i] == atom1 )
-            return 1;
+    for ( i = 0; i < workspace->restricted[ atom2 ]; ++i )
+    {
+        if ( workspace->restricted_list[ atom2 ][i] == atom1 )
+        {
+            return TRUE; /* atom1 appears in atom2's restricted_list */
+        }
+    }
 
-    return 0;
+    return FALSE; /* restricted and the pair is in neither list (pre-patch: return 0) */
 }
 
 
@@ -347,17 +344,20 @@ inline int is_Near_Neighbor( list *near_nbrs, int atom1, int atom2 )
 {
     int i;
 
-    for( i=Start_Index(atom1,near_nbrs); i<End_Index(atom1,near_nbrs); ++i )
-        if( near_nbrs->select.near_nbr_list[i].nbr == atom2 )
+    for ( i = Start_Index(atom1, near_nbrs); i < End_Index(atom1, near_nbrs); ++i )
+    {
+        if ( near_nbrs->select.near_nbr_list[i].nbr == atom2 )
         {
             // fprintf( stderr, "near neighbors %6d %6d\n", atom1, atom2 );
-            return 1;
+            return TRUE; /* atom2 found in atom1's near-neighbor range (pre-patch: return 1) */
         }
+    }
 
-    return 0;
+    return FALSE; /* atom2 not found in atom1's range (pre-patch: return 0) */
 }
 
-void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
+
+void Generate_Neighbor_Lists( reax_system *system, control_params *control,
         simulation_data *data, static_storage *workspace,
         list **lists, output_controls *out_control )
 {
@@ -368,21 +368,20 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     int   num_far;
     int   c, count;
     int   grid_top;
-    grid *g = &( system->g );  
+    grid *g = &( system->g );
     list *far_nbrs = (*lists) + FAR_NBRS;
     //int   hb_type1, hb_type2;
     //list *hbonds = (*lists) + HBOND;
     //int   top_hbond1, top_hbond2;
     get_far_neighbors_function Get_Far_Neighbors;
     far_neighbor_data new_nbrs[125];
-#ifndef REORDER_ATOMS
-    int   l, m;
-#endif
 
     // fprintf( stderr, "\n\tentered nbrs - " );
-    if( control->ensemble == iNPT || control->ensemble == sNPT || 
+    if ( control->ensemble == iNPT || control->ensemble == sNPT ||
             control->ensemble == NPT )
+    {
         Update_Grid( system );
+    }
     // fprintf( stderr, "grid updated - " );
 
     Bin_Atoms( system, out_control );
@@ -394,9 +393,9 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
 #endif
 
     Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors );
-    // fprintf( stderr, "function chosen - " );  
+    // fprintf( stderr, "function chosen - " );
 
-    Reset_Neighbor_Lists( system, workspace, lists );  
+    Reset_Neighbor_Lists( system, workspace, lists );
     // fprintf( stderr, "lists cleared - " );
 
     num_far = 0;
@@ -404,9 +403,12 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     c = 0;
 
     /* first pick up a cell in the grid */
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
                 nbrs = g->nbrs[i][j][k];
                 nbrs_cp = g->nbrs_cp[i][j][k];
 
@@ -414,119 +416,137 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
                 //#ifdef REORDER_ATOMS
                 //  for(atom1 = g->start[i][j][k]; atom1 < g->end[i][j][k]; atom1++)
                 //#else
-                for(l = 0; l < g->top[i][j][k]; ++l ){
+                for (l = 0; l < g->top[i][j][k]; ++l )
+                {
                     atom1 = g->atoms[i][j][k][l];
                     Set_End_Index( atom1, num_far, far_nbrs );
                     // fprintf( stderr, "atom %d:\n", atom1 );
 
                     itr = 0;
-                    while( nbrs[itr][0] > 0 ){
+                    while ( nbrs[itr][0] > 0 )
+                    {
                         x = nbrs[itr][0];
                         y = nbrs[itr][1];
                         z = nbrs[itr][2];
 
-                        // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-                        //     SQR(control->r_cut))     
+                        // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <=
+                        //     SQR(control->r_cut))
                         nbr_atoms = g->atoms[x][y][z];
                         max_atoms = g->top[x][y][z];
 
                         /* pick up another atom from the neighbor cell -
-                           we have to compare atom1 with its own periodic images as well, 
+                           we have to compare atom1 with its own periodic images as well,
                            that's why there is also equality in the if stmt below */
                         //#ifdef REORDER_ATOMS
                         //for(atom2=g->start[x][y][z]; atom2<g->end[x][y][z]; atom2++)
                         //#else
-                        for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] )
-                            if( atom1 >= atom2 ) {
+                        for ( m = 0, atom2 = nbr_atoms[m]; m < max; ++m, atom2 = nbr_atoms[m] )
+                        {
+                            if ( atom1 >= atom2 )
+                            {
                                 //fprintf( stderr, "\tatom2 %d", atom2 );
                                 //top_near1 = End_Index( atom1, near_nbrs );
                                 //Set_Start_Index( atom1, num_far, far_nbrs );
                                 //hb_type1=system->reaxprm.sbp[system->atoms[atom1].type].p_hbond;
                                 Get_Far_Neighbors( system->atoms[atom1].x,
-                                        system->atoms[atom2].x, 
-                                        &(system->box), control, new_nbrs, &count );
+                                                   system->atoms[atom2].x,
+                                                   &(system->box), control, new_nbrs, &count );
                                 fprintf( stderr, "\t%d count:%d\n", atom2, count );
 
-                                for( c = 0; c < count; ++c )
-                                    if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){
+                                for ( c = 0; c < count; ++c )
+                                {
+                                    if (atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d >= 0.1))
+                                    {
                                         Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]),
-                                                atom2, new_nbrs[c].d, 1.0, 
-                                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                                                         atom2, new_nbrs[c].d, 1.0,
+                                                         new_nbrs[c].dvec, new_nbrs[c].rel_box );
                                         ++num_far;
 
                                         /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n",
-                                          atom1, atom2, new_nbrs[c].d, 
-                                          new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], 
+                                          atom1, atom2, new_nbrs[c].d,
+                                          new_nbrs[c].dvec[0], new_nbrs[c].dvec[1],
                                           new_nbrs[c].dvec[2] ); */
 
 
-                                        /* hydrogen bond lists */ 
-                                        /*if( control->hb_cut > 0.1 && 
+                                        /* hydrogen bond lists */
+                                        /*if( control->hb_cut > 0.1 &&
                                           new_nbrs[c].d <= control->hb_cut ) {
-                                        // fprintf( stderr, "%d %d\n", atom1, atom2 );
-                                        hb_type2=system->reaxprm.sbp[system->atoms[atom2].type].p_hbond;
-                                        if( hb_type1 == 1 && hb_type2 == 2 ) {
-                                        top_hbond1=End_Index(workspace->hbond_index[atom1],hbonds);
-                                        Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond1]),
-                                        atom2, new_nbrs[c].d, 1.0, new_nbrs[c].dvec,
-                                        new_nbrs[c].rel_box );
-                                        Set_End_Index( workspace->hbond_index[atom1], 
-                                        top_hbond1 + 1, hbonds );
-                                        }
-                                        else if( hb_type1 == 2 && hb_type2 == 1 ) {
-                                        top_hbond2 = End_Index( workspace->hbond_index[atom2], hbonds );
-                                        Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond2]),
-                                        atom1, new_nbrs[c].d, -1.0, new_nbrs[c].dvec, 
-                                        new_nbrs[c].rel_box );
-                                        Set_End_Index( workspace->hbond_index[atom2], 
-                                        top_hbond2 + 1, hbonds );
-                                        }*/
+                                          // fprintf( stderr, "%d %d\n", atom1, atom2 );
+                                          hb_type2=system->reaxprm.sbp[system->atoms[atom2].type].p_hbond;
+                                          if( hb_type1 == 1 && hb_type2 == 2 ) {
+                                          top_hbond1=End_Index(workspace->hbond_index[atom1],hbonds);
+                                          Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond1]),
+                                          atom2, new_nbrs[c].d, 1.0, new_nbrs[c].dvec,
+                                          new_nbrs[c].rel_box );
+                                          Set_End_Index( workspace->hbond_index[atom1],
+                                          top_hbond1 + 1, hbonds );
+                                          }
+                                          else if( hb_type1 == 2 && hb_type2 == 1 ) {
+                                          top_hbond2 = End_Index( workspace->hbond_index[atom2], hbonds );
+                                          Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond2]),
+                                          atom1, new_nbrs[c].d, -1.0, new_nbrs[c].dvec,
+                                          new_nbrs[c].rel_box );
+                                          Set_End_Index( workspace->hbond_index[atom2],
+                                          top_hbond2 + 1, hbonds );
+                                          }*/
                                     }
                                 }
+                            }
                         }
+                    }
 
                     Set_End_Index( atom1, top_far1, far_nbrs );
                 }
             }
+        }
+    }
 
     fprintf( stderr, "nbrs done-" );
 
+
     /* apply restrictions on near neighbors only */
-    if( (data->step - data->prev_steps) < control->restrict_bonds ) {
-        for( atom1 = 0; atom1 < system->N; ++atom1 )
-            if( workspace->restricted[ atom1 ] ) {
+    if ( (data->step - data->prev_steps) < control->restrict_bonds )
+    {
+        for ( atom1 = 0; atom1 < system->N; ++atom1 )
+        {
+            if ( workspace->restricted[ atom1 ] )
+            {
                 // fprintf( stderr, "atom1: %d\n", atom1 );
 
                 top_near1 = End_Index( atom1, near_nbrs );
 
-                for( j = 0; j < workspace->restricted[ atom1 ]; ++j )
-                    if(!is_Near_Neighbor(near_nbrs, atom1, 
-                                atom2 = workspace->restricted_list[atom1][j])) {
+                for ( j = 0; j < workspace->restricted[ atom1 ]; ++j )
+                {
+                    if (is_Near_Neighbor(near_nbrs, atom1,
+                          atom2 = workspace->restricted_list[atom1][j]) == FALSE)
+                    {
                         fprintf( stderr, "%3d-%3d: added bond by applying restrictions!\n",
-                                atom1, atom2 );
+                                 atom1, atom2 );
 
-                        top_near2 = End_Index( atom2, near_nbrs );          
+                        top_near2 = End_Index( atom2, near_nbrs );
 
-                        /* we just would like to get the nearest image, so a call to 
+                        /* we just would like to get the nearest image, so a call to
                            Get_Periodic_Far_Neighbors_Big_Box is good enough. */
-                        Get_Periodic_Far_Neighbors_Big_Box( system->atoms[ atom1 ].x, 
-                                system->atoms[ atom2 ].x, 
-                                &(system->box), control, 
-                                new_nbrs, &count );
+                        Get_Periodic_Far_Neighbors_Big_Box( system->atoms[ atom1 ].x,
+                                                            system->atoms[ atom2 ].x,
+                                                            &(system->box), control,
+                                                            new_nbrs, &count );
 
                         Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near1 ]),
-                                atom2, new_nbrs[c].d, 1.0, 
-                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                                           atom2, new_nbrs[c].d, 1.0,
+                                           new_nbrs[c].dvec, new_nbrs[c].rel_box );
                         ++top_near1;
 
                         Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near2 ]),
-                                atom1, new_nbrs[c].d, -1.0, 
-                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
-                        Set_End_Index( atom2, top_near2+1, near_nbrs );
+                                           atom1, new_nbrs[c].d, -1.0,
+                                           new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                        Set_End_Index( atom2, top_near2 + 1, near_nbrs );
                     }
+                }
 
                 Set_End_Index( atom1, top_near1, near_nbrs );
             }
+        }
     }
     // fprintf( stderr, "restrictions applied-" );
 
@@ -534,56 +554,61 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     /* verify nbrlists, count num_intrs, sort nearnbrs */
     near_nbrs->num_intrs = 0;
     far_nbrs->num_intrs = 0;
-    for( i = 0; i < system->N-1; ++i ) {
-        if( End_Index(i, near_nbrs) > Start_Index(i+1, near_nbrs) ) {
-            fprintf( stderr, 
-                    "step%3d: nearnbr list of atom%d is overwritten by atom%d\n",
-                    data->step, i+1, i );
-            exit( 1 );
+    for ( i = 0; i < system->N - 1; ++i )
+    {
+        if ( End_Index(i, near_nbrs) > Start_Index(i + 1, near_nbrs) )
+        {
+            fprintf( stderr,
+                     "step%3d: nearnbr list of atom%d is overwritten by atom%d\n",
+                     data->step, i + 1, i );
+            exit( RUNTIME_ERROR );
         }
 
         near_nbrs->num_intrs += Num_Entries(i, near_nbrs);
 
-        if( End_Index(i, far_nbrs) > Start_Index(i+1, far_nbrs) ) {
-            fprintf( stderr, 
-                    "step%3d: farnbr list of atom%d is overwritten by atom%d\n", 
-                    data->step, i+1, i );
-            exit( 1 );
+        if ( End_Index(i, far_nbrs) > Start_Index(i + 1, far_nbrs) )
+        {
+            fprintf( stderr,
+                     "step%3d: farnbr list of atom%d is overwritten by atom%d\n",
+                     data->step, i + 1, i );
+            exit( RUNTIME_ERROR );
         }
 
         far_nbrs->num_intrs += Num_Entries(i, far_nbrs);
     }
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         qsort( &(near_nbrs->select.near_nbr_list[ Start_Index(i, near_nbrs) ]),
-                Num_Entries(i, near_nbrs), sizeof(near_neighbor_data), 
-                compare_near_nbrs );
+               Num_Entries(i, near_nbrs), sizeof(near_neighbor_data),
+               compare_near_nbrs );
     }
     // fprintf( stderr, "near nbrs sorted\n" );
 
+
 #ifdef TEST_ENERGY
     /* for( i = 0; i < system->N; ++i ) {
-       qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
-       Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
-       compare_far_nbrs ); 
+       qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]),
+       Num_Entries(i, far_nbrs), sizeof(far_neighbor_data),
+       compare_far_nbrs );
        } */
 
-    fprintf( stderr, "Near neighbors/atom: %d (compare to 150)\n", 
-            num_near / system->N );
-    fprintf( stderr, "Far neighbors per atom: %d (compare to %d)\n", 
-            num_far / system->N, control->max_far_nbrs );
+    fprintf( stderr, "Near neighbors/atom: %d (compare to 150)\n",
+             num_near / system->N );
+    fprintf( stderr, "Far neighbors per atom: %d (compare to %d)\n",
+             num_far / system->N, control->max_far_nbrs );
 #endif
 
     //fprintf( stderr, "step%d: num of nearnbrs = %6d   num of farnbrs: %6d\n",
     //       data->step, num_near, num_far );
 
-    //fprintf( stderr, "\talloc nearnbrs = %6d   alloc farnbrs: %6d\n", 
-    //   system->N * near_nbrs->intrs_per_unit, 
+    //fprintf( stderr, "\talloc nearnbrs = %6d   alloc farnbrs: %6d\n",
+    //   system->N * near_nbrs->intrs_per_unit,
     //   system->N * far_nbrs->intrs_per_unit );
 }
 
 
-void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
+void Generate_Neighbor_Lists( reax_system *system, control_params *control,
         simulation_data *data, static_storage *workspace,
         list **lists, output_controls *out_control )
 {
@@ -603,73 +628,84 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     far_nbrs = (*lists) + FAR_NBRS;
 
     // fprintf( stderr, "\n\tentered nbrs - " );
-    if( control->ensemble == iNPT || 
-            control->ensemble == sNPT || 
+    if ( control->ensemble == iNPT ||
+            control->ensemble == sNPT ||
             control->ensemble == NPT )
+    {
         Update_Grid( system );
+    }
     // fprintf( stderr, "grid updated - " );
 
     Bin_Atoms( system, out_control );
     // fprintf( stderr, "atoms sorted - " );
     Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors );
-    // fprintf( stderr, "function chosen - " );  
-    Reset_Neighbor_Lists( system, workspace, lists );  
+    // fprintf( stderr, "function chosen - " );
+    Reset_Neighbor_Lists( system, workspace, lists );
     // fprintf( stderr, "lists cleared - " );
 
     num_far = 0;
     c = 0;
 
     /* first pick up a cell in the grid */
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
+    for ( i = 0; i < g->ncell[0]; i++ )
+    {
+        for ( j = 0; j < g->ncell[1]; j++ )
+        {
+            for ( k = 0; k < g->ncell[2]; k++ )
+            {
                 nbrs = g->nbrs[i][j][k];
                 nbrs_cp = g->nbrs_cp[i][j][k];
                 fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
 
                 /* pick up an atom from the current cell */
-                for(l = 0; l < g->top[i][j][k]; ++l ){
+                for (l = 0; l < g->top[i][j][k]; ++l )
+                {
                     atom1 = g->atoms[i][j][k][l];
                     Set_Start_Index( atom1, num_far, far_nbrs );
                     fprintf( stderr, "\tatom %d\n", atom1 );
 
                     itr = 0;
-                    while( nbrs[itr][0] > 0 ){
+                    while ( nbrs[itr][0] > 0 )
+                    {
                         x = nbrs[itr][0];
                         y = nbrs[itr][1];
                         z = nbrs[itr][2];
                         fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
 
-                        // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-                        //     SQR(control->r_cut))     
+                        // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <=
+                        //     SQR(control->r_cut))
                         nbr_atoms = g->atoms[x][y][z];
                         max = g->top[x][y][z];
                         fprintf( stderr, "\t\tmax: %d\n", max );
 
 
                         /* pick up another atom from the neighbor cell -
-                           we have to compare atom1 with its own periodic images as well, 
+                           we have to compare atom1 with its own periodic images as well,
                            that's why there is also equality in the if stmt below */
-                        for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] )
-                            if( atom1 >= atom2 ) {
+                        for ( m = 0, atom2 = nbr_atoms[m]; m < max; ++m, atom2 = nbr_atoms[m] )
+                        {
+                            if ( atom1 >= atom2 )
+                            {
                                 Get_Far_Neighbors( system->atoms[atom1].x,
-                                        system->atoms[atom2].x, 
-                                        &(system->box), control, new_nbrs, &count );
+                                                   system->atoms[atom2].x,
+                                                   &(system->box), control, new_nbrs, &count );
                                 fprintf( stderr, "\t\t\t%d count:%d\n", atom2, count );
 
-                                for( c = 0; c < count; ++c )
-                                    if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){
+                                for ( c = 0; c < count; ++c )
+                                    if (atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d >= 0.1))
+                                    {
                                         Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]),
-                                                atom2, new_nbrs[c].d, 1.0, 
-                                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                                                         atom2, new_nbrs[c].d, 1.0,
+                                                         new_nbrs[c].dvec, new_nbrs[c].rel_box );
                                         ++num_far;
 
                                         /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n",
-                                          atom1, atom2, new_nbrs[c].d, 
-                                          new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], 
+                                          atom1, atom2, new_nbrs[c].d,
+                                          new_nbrs[c].dvec[0], new_nbrs[c].dvec[1],
                                           new_nbrs[c].dvec[2] ); */
                                     }
                             }
+                        }
 
                         ++itr;
                     }
@@ -677,22 +713,26 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
                     Set_End_Index( atom1, num_far, far_nbrs );
                 }
             }
+        }
+    }
 
-    far_nbrs->num_intrs = num_far;  
+    far_nbrs->num_intrs = num_far;
     fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
 
 #if defined(DEBUG)
-    for( i = 0; i < system->N; ++i ) {
-        qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
-                Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
-                compare_far_nbrs ); 
+    for ( i = 0; i < system->N; ++i )
+    {
+        qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]),
+               Num_Entries(i, far_nbrs), sizeof(far_neighbor_data),
+               compare_far_nbrs );
     }
 
     fprintf( stderr, "step%d: num of farnbrs=%6d\n", data->step, num_far );
-    fprintf( stderr, "\tallocated farnbrs: %6d\n", 
-            system->N * far_nbrs->intrs_per_unit );
+    fprintf( stderr, "\tallocated farnbrs: %6d\n",
+             system->N * far_nbrs->intrs_per_unit );
 #endif
 }
 
 
+
 #endif
diff --git a/PuReMD-GPU/src/neighbors.h b/PuReMD-GPU/src/neighbors.h
index 64c14ad29d5194006aacb057a7d80ef54aeee8e4..8eb5cfc2696f4d354edcf3751dedfd315c6762a3 100644
--- a/PuReMD-GPU/src/neighbors.h
+++ b/PuReMD-GPU/src/neighbors.h
@@ -30,10 +30,8 @@ void Generate_Neighbor_Lists( reax_system*, control_params*, simulation_data*,
 int Estimate_NumNeighbors( reax_system*, control_params*,
    static_storage*, list** );
 
-int Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* );
 
-
-static inline HOST_DEVICE int index_grid_debug (int x, int y, int z, int blocksize)
+static inline HOST_DEVICE int index_grid_debug( int x, int y, int z, int blocksize )
 {
     return x * 8 * 8 * blocksize +  
         y * 8 * blocksize +  
diff --git a/PuReMD-GPU/src/param.h b/PuReMD-GPU/src/param.h
deleted file mode 100644
index 2b24b056983233840966a8de29ce902ca6beb981..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/param.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
-
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#ifndef __PARAM_H_
-#define __PARAM_H_
-
-#include "mytypes.h"
-
-#define MAX_LINE 1024
-#define MAX_TOKENS 20
-#define MAX_TOKEN_LEN 1024
-
-
-int Get_Atom_Type( reax_interaction*, char* );
-
-int Tokenize( char*, char*** );
-
-char Read_Force_Field( FILE*, reax_interaction* );
-
-char Read_Control_File( FILE*, reax_system*, control_params*,
-        output_controls* );
-
-
-#endif
diff --git a/PuReMD-GPU/src/pdb_tools.c b/PuReMD-GPU/src/pdb_tools.c
deleted file mode 100644
index a7102da2cf8d3023956539960f93f4e61c116a81..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/pdb_tools.c
+++ /dev/null
@@ -1,628 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
-
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "pdb_tools.h"
-#include "box.h"
-#include "list.h"
-#include "param.h"
-#include "restart.h"
-#include "ctype.h"
-
-
-int is_Valid_Serial( static_storage *workspace, int serial )
-{
-    if ( workspace->map_serials[ serial ] < 0 )
-    {
-        fprintf( stderr, "CONECT line includes invalid pdb serial number %d.\n",
-                 serial );
-        fprintf( stderr, "Please correct the input file.Terminating...\n" );
-        exit( INVALID_INPUT );
-    }
-
-    return 1;
-}
-
-
-int Check_Input_Range( int val, int lo, int hi, char *message )
-{
-    if ( val < lo || val > hi )
-    {
-        fprintf( stderr, "%s\nInput %d - Out of range %d-%d. Terminating...\n",
-                 message, val, lo, hi );
-        exit( INVALID_INPUT );
-    }
-
-    return 1;
-}
-
-
-void Trim_Spaces( char *element )
-{
-    int i, j;
-
-    for ( i = 0; element[i] == ' '; ++i ); // skip initial space chars
-
-    for ( j = i; j < strlen(element) && element[j] != ' '; ++j )
-        element[j - i] = toupper( element[j] ); // make uppercase, move to beginning
-    element[j - i] = 0; // finalize the string
-}
-
-
-char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
-               simulation_data *data, static_storage *workspace )
-{
-
-    FILE *pdb;
-    char **tmp;
-    char *s, *s1;
-    char descriptor[9], serial[9];
-    char atom_name[9], res_name[9], res_seq[9];
-    char s_x[9], s_y[9], s_z[9];
-    char occupancy[9], temp_factor[9];
-    char seg_id[9], element[9], charge[9];
-    char alt_loc, chain_id, icode;
-    char s_a[10], s_b[10], s_c[10], s_alpha[9], s_beta[9], s_gamma[9];
-    char s_group[12], s_zValue[9];
-    char *endptr = NULL;
-    int  i, c, c1, pdb_serial, ratom = 0;
-    /* open pdb file */
-    if ( (pdb = fopen(pdb_file, "r")) == NULL )
-    {
-        fprintf( stderr, "Error opening the pdb file!\n" );
-        exit( FILE_NOT_FOUND_ERR );
-    }
-
-
-    /* allocate memory for tokenizing pdb lines */
-    s =   (char*)  malloc( sizeof(char)  * MAX_LINE );
-    s1 =  (char*)  malloc( sizeof(char)  * MAX_LINE );
-    tmp = (char**) malloc( sizeof(char*) * MAX_TOKENS );
-    for ( i = 0; i < MAX_TOKENS; i++ )
-        tmp[i] = (char*) malloc( sizeof(char) * MAX_TOKEN_LEN );
-
-
-    /* count number of atoms in the pdb file */
-    system->N = 0;
-    while (!feof(pdb))
-    {
-        s[0] = 0;
-        fgets( s, MAX_LINE, pdb );
-
-        tmp[0][0] = 0;
-        c = Tokenize( s, &tmp );
-
-        if ( strncmp( tmp[0], "ATOM", 4 ) == 0 ||
-                strncmp( tmp[0], "HETATM", 6 ) == 0 )
-            (system->N)++;
-    }
-    fclose(pdb);
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "system->N: %d\n", system->N );
-#endif
-
-    /* memory allocations for atoms, atom maps, bond restrictions */
-    system->atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
-
-    workspace->map_serials = (int*) calloc( MAX_ATOM_ID, sizeof(int) );
-    for ( i = 0; i < MAX_ATOM_ID; ++i )
-        workspace->map_serials[i] = -1;
-
-    workspace->orig_id = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted  = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted_list = (int*) calloc( system->N * MAX_RESTRICT, sizeof(int) );
-
-    //for( i = 0; i < system->N; ++i )
-    // workspace->restricted_list[i] = (int*) calloc( MAX_RESTRICT, sizeof(int) );
-
-
-    /* start reading and processing pdb file */
-    pdb = fopen(pdb_file, "r");
-    c = 0;
-    c1 = 0;
-
-    while (!feof(pdb))
-    {
-        /* clear previous input line */
-        s[0] = 0;
-        for ( i = 0; i < c1; ++i )
-            tmp[i][0] = 0;
-
-        /* read new line and tokenize it */
-        fgets( s, MAX_LINE, pdb );
-        strncpy( s1, s, MAX_LINE - 1 );
-        c1 = Tokenize( s, &tmp );
-
-        /* process new line */
-        if ( strncmp(tmp[0], "ATOM", 4) == 0 || strncmp(tmp[0], "HETATM", 6) == 0 )
-        {
-            if ( strncmp(tmp[0], "ATOM", 4) == 0 )
-            {
-                strncpy( &descriptor[0], s1, 6 );
-                descriptor[6] = 0;
-                strncpy( &serial[0], s1 + 6, 5 );
-                serial[5] = 0;
-                strncpy( &atom_name[0], s1 + 12, 4 );
-                atom_name[4] = 0;
-                alt_loc = s1[16];
-                strncpy( &res_name[0], s1 + 17, 3 );
-                res_name[3] = 0;
-                chain_id = s1[21];
-                strncpy( &res_seq[0], s1 + 22, 4 );
-                res_seq[4] = 0;
-                icode = s1[26];
-                strncpy( &s_x[0], s1 + 30, 8 );
-                s_x[8] = 0;
-                strncpy( &s_y[0], s1 + 38, 8 );
-                s_y[8] = 0;
-                strncpy( &s_z[0], s1 + 46, 8 );
-                s_z[8] = 0;
-                strncpy( &occupancy[0], s1 + 54, 6 );
-                occupancy[6] = 0;
-                strncpy( &temp_factor[0], s1 + 60, 6 );
-                temp_factor[6] = 0;
-                strncpy( &seg_id[0], s1 + 72, 4 );
-                seg_id[4] = 0;
-                strncpy( &element[0], s1 + 76, 2 );
-                element[2] = 0;
-                strncpy( &charge[0], s1 + 78, 2 );
-                charge[2] = 0;
-            }
-            else if (strncmp(tmp[0], "HETATM", 6) == 0)
-            {
-                strncpy( &descriptor[0], s1, 6 );
-                descriptor[6] = 0;
-                strncpy( &serial[0], s1 + 6, 5 );
-                serial[5] = 0;
-                strncpy( &atom_name[0], s1 + 12, 4 );
-                atom_name[4] = 0;
-                alt_loc = s1[16];
-                strncpy( &res_name[0], s1 + 17, 3 );
-                res_name[3] = 0;
-                chain_id = s1[21];
-                strncpy( &res_seq[0], s1 + 22, 4 );
-                res_seq[4] = 0;
-                icode = s1[26];
-                strncpy( &s_x[0], s1 + 30, 8 );
-                s_x[8] = 0;
-                strncpy( &s_y[0], s1 + 38, 8 );
-                s_y[8] = 0;
-                strncpy( &s_z[0], s1 + 46, 8 );
-                s_z[8] = 0;
-                strncpy( &occupancy[0], s1 + 54, 6 );
-                occupancy[6] = 0;
-                strncpy( &temp_factor[0], s1 + 60, 6 );
-                temp_factor[6] = 0;
-                //strncpy( &seg_id[0], s1+72, 4 );      seg_id[4] = 0;
-                strncpy( &element[0], s1 + 76, 2 );
-                element[2] = 0;
-                strncpy( &charge[0], s1 + 78, 2 );
-                charge[2] = 0;
-            }
-
-
-            /* add to mapping */
-            pdb_serial = strtod( &serial[0], &endptr );
-            Check_Input_Range( pdb_serial, 0, MAX_ATOM_ID, "Invalid pdb_serial" );
-            workspace->map_serials[ pdb_serial ] = c;
-            workspace->orig_id[ c ] = pdb_serial;
-            // fprintf( stderr, "map %d --> %d\n", pdb_serial, c );
-
-
-            /* copy atomic positions */
-            system->atoms[c].x[0] = strtod( &s_x[0], &endptr );
-            system->atoms[c].x[1] = strtod( &s_y[0], &endptr );
-            system->atoms[c].x[2] = strtod( &s_z[0], &endptr );
-
-            /* atom name and type */
-            strcpy( system->atoms[c].name, atom_name );
-            Trim_Spaces( element );
-            system->atoms[c].type = Get_Atom_Type( &(system->reaxprm), element );
-
-            /* fprintf( stderr,
-            "%d%8.3f%8.3f%8.3fq:%8.3f occ:%s temp:%s seg_id:%s element:%s\n",
-             system->atoms[c].type,
-             system->atoms[c].x[0], system->atoms[c].x[1], system->atoms[c].x[2],
-             system->atoms[c].q, occupancy, temp_factor, seg_id, element ); */
-            c++;
-        }
-        else if (!strncmp( tmp[0], "CRYST1", 6 ))
-        {
-            sscanf( s1, PDB_CRYST1_FORMAT,
-                    &descriptor[0],
-                    &s_a[0],
-                    &s_b[0],
-                    &s_c[0],
-                    &s_alpha[0],
-                    &s_beta[0],
-                    &s_gamma[0],
-                    &s_group[0],
-                    &s_zValue[0] );
-
-            /* Compute full volume tensor from the angles */
-            Init_Box_From_CRYST( atof(s_a),  atof(s_b), atof(s_c),
-                                 atof(s_alpha), atof(s_beta), atof(s_gamma),
-                                 &(system->box) );
-        }
-
-        /* IMPORTANT: We do not check for the soundness of restrictions here.
-           When atom2 is on atom1's restricted list, and there is a restriction on
-           atom2, then atom1 has to be on atom2's restricted list, too. However,
-           we do not check if this is the case in the input file,
-           this is upto the user. */
-        else if (!strncmp( tmp[0], "CONECT", 6 ))
-        {
-            /* error check */
-            //fprintf(stderr, "CONECT: %d\n", c1 );
-            Check_Input_Range( c1 - 2, 0, MAX_RESTRICT,
-                               "CONECT line exceeds max restrictions allowed.\n" );
-
-            /* read bond restrictions */
-            if ( is_Valid_Serial( workspace, pdb_serial = atoi(tmp[1]) ) )
-                ratom = workspace->map_serials[ pdb_serial ];
-
-            workspace->restricted[ ratom ] = c1 - 2;
-            for ( i = 2; i < c1; ++i )
-            {
-                if ( is_Valid_Serial( workspace, pdb_serial = atoi(tmp[i]) ) )
-                    workspace->restricted_list[ (ratom * MAX_RESTRICT) +  (i - 2) ] =
-                        workspace->map_serials[ pdb_serial ];
-            }
-
-            /* fprintf( stderr, "restriction on %d:", ratom );
-            for( i = 0; i < workspace->restricted[ ratom ]; ++i )
-             fprintf( stderr, "  %d", workspace->restricted_list[ratom][i] );
-             fprintf( stderr, "\n" ); */
-        }
-    }
-
-    fclose(pdb);
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "pdb file read\n" );
-#endif
-
-    return 1;
-}
-
-
-char Write_PDB( reax_system* system, control_params *control,
-                simulation_data *data, static_storage *workspace,
-                list* bonds, output_controls *out_control )
-{
-    int  i, j, k, count;
-    int  connect[4];
-    char temp[MAX_STR], name[10];
-    real bo;
-    real alpha, beta, gamma;
-
-
-    /* open output pdb file */
-    sprintf( temp, "%s%d.pdb", control->sim_name, data->step );
-    out_control->pdb = fopen( temp, "w" );
-
-
-    /* Writing Box information */
-    /* Write full volume tensor from the angles (as soon as possible) TODO_SOON */
-    gamma = acos( (system->box.box[0][0] * system->box.box[1][0] +
-                   system->box.box[0][1] * system->box.box[1][1] +
-                   system->box.box[0][2] * system->box.box[1][2]) /
-                  (system->box.box_norms[0] * system->box.box_norms[1]));
-    beta  = acos( (system->box.box[0][0] * system->box.box[2][0] +
-                   system->box.box[0][1] * system->box.box[2][1] +
-                   system->box.box[0][2] * system->box.box[2][2]) /
-                  (system->box.box_norms[0] * system->box.box_norms[2]));
-    alpha = acos( (system->box.box[2][0] * system->box.box[1][0] +
-                   system->box.box[2][1] * system->box.box[1][1] +
-                   system->box.box[2][2] * system->box.box[1][2]) /
-                  (system->box.box_norms[2] * system->box.box_norms[1]));
-
-    fprintf(out_control->pdb, PDB_CRYST1_FORMAT_O,
-            "CRYST1",
-            system->box.box_norms[0],
-            system->box.box_norms[1],
-            system->box.box_norms[2],
-            RAD2DEG(alpha),
-            RAD2DEG(beta),
-            RAD2DEG(gamma),
-            " ",
-            0);
-    fprintf( out_control->log, "Box written\n" );
-    fflush( out_control->log );
-
-    /* Writing atom information */
-    for (i = 0; i < system->N; i++)
-    {
-        strncpy( name, system->reaxprm.sbp[system->atoms[i].type].name, 2 );
-        name[2] = '\0';
-        fprintf( out_control->pdb, PDB_ATOM_FORMAT_O,
-                 "ATOM  ",
-                 workspace->orig_id[i],
-                 name,
-                 ' ',
-                 "REX",
-                 ' ',
-                 1,
-                 ' ',
-                 system->atoms[i].x[0],
-                 system->atoms[i].x[1],
-                 system->atoms[i].x[2],
-                 1.0,
-                 0.0,
-                 "0",
-                 name,
-                 "  " );
-    }
-
-    fprintf( out_control->log, "ATOM written\n" );
-    fflush( out_control->log );
-
-    /* Writing connect information */
-    for (i = 0; i < system->N; i++)
-    {
-        count = 0;
-
-        for (j = Start_Index(i, bonds); j < End_Index(i, bonds); ++j)
-        {
-            bo = bonds->select.bond_list[j].bo_data.BO;
-            if (bo > 0.3)
-            {
-                connect[count] = workspace->orig_id[bonds->select.bond_list[j].nbr];
-                count++;
-            }
-        }
-
-        fprintf( out_control->pdb, "%6s%6d", "CONECT", workspace->orig_id[i] );
-        for ( k = 0; k < count; k++ )
-            fprintf( out_control->pdb, "%6d", connect[k] );
-        fprintf( out_control->pdb, "\n" );
-    }
-
-    fprintf( out_control->pdb, "END\n" );
-
-    fclose( out_control->pdb );
-
-    return 1;
-}
-
-
-char Read_BGF( char* bgf_file, reax_system* system, control_params *control,
-               simulation_data *data, static_storage *workspace )
-{
-    FILE *bgf;
-    char **tokens;
-    char *line, *backup;
-    char descriptor[10], serial[10];
-    char atom_name[10], res_name[10], res_seq[10];
-    char s_x[12], s_y[12], s_z[12];
-    char occupancy[10], temp_factor[10];
-    char element[10], charge[10];
-    char chain_id;
-    char s_a[12], s_b[12], s_c[12], s_alpha[12], s_beta[12], s_gamma[12];
-    char *endptr = NULL;
-    int  i, atom_cnt, token_cnt, bgf_serial, ratom = 0;
-
-    /* open biograf file */
-    if ( (bgf = fopen( bgf_file, "r" )) == NULL )
-    {
-        fprintf( stderr, "Error opening the bgf file!\n" );
-        exit( FILE_NOT_FOUND_ERR );
-    }
-
-
-    /* allocate memory for tokenizing biograf file lines */
-    line   = (char*)  malloc( sizeof(char)  * MAX_LINE );
-    backup = (char*)  malloc( sizeof(char)  * MAX_LINE );
-    tokens = (char**) malloc( sizeof(char*) * MAX_TOKENS );
-    for ( i = 0; i < MAX_TOKENS; i++ )
-        tokens[i] = (char*) malloc( sizeof(char) * MAX_TOKEN_LEN );
-
-
-    /* count number of atoms in the pdb file */
-    system->N = 0;
-    while ( !feof( bgf ) )
-    {
-        line[0] = 0;
-        fgets( line, MAX_LINE, bgf );
-
-        tokens[0][0] = 0;
-        token_cnt = Tokenize( line, &tokens );
-
-        if ( !strcmp( tokens[0], "ATOM" ) || !strcmp( tokens[0], "HETATM" ) )
-            (system->N)++;
-    }
-    //fprintf( stderr, "system->N: %d\n", system->N );
-    fclose( bgf );
-
-
-    /* memory allocations for atoms, atom maps, bond restrictions */
-    system->atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
-
-    workspace->map_serials = (int*) calloc( MAX_ATOM_ID, sizeof(int) );
-    for ( i = 0; i < MAX_ATOM_ID; ++i )
-        workspace->map_serials[i] = -1;
-
-    workspace->orig_id = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted  = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted_list = (int*) calloc( system->N * MAX_RESTRICT, sizeof(int) );
-    //for( i = 0; i < system->N; ++i )
-    // workspace->restricted_list[i] = (int*) calloc( MAX_RESTRICT, sizeof(int) );
-
-
-    /* start reading and processing pdb file */
-    bgf = fopen( bgf_file, "r" );
-    atom_cnt = 0;
-    token_cnt = 0;
-
-    while ( !feof( bgf ) )
-    {
-        /* clear previous input line */
-        line[0] = 0;
-        for ( i = 0; i < token_cnt; ++i )
-            tokens[i][0] = 0;
-
-        /* read new line and tokenize it */
-        fgets( line, MAX_LINE, bgf );
-        strncpy( backup, line, MAX_LINE - 1 );
-        token_cnt = Tokenize( line, &tokens );
-
-        /* process new line */
-        if ( !strncmp(tokens[0], "ATOM", 4) || !strncmp(tokens[0], "HETATM", 6) )
-        {
-            if ( !strncmp(tokens[0], "ATOM", 4) )
-            {
-                strncpy( &descriptor[0], backup, 6 );
-                descriptor[6] = 0;
-                strncpy( &serial[0], backup + 7, 5 );
-                serial[5] = 0;
-                strncpy( &atom_name[0], backup + 13, 5 );
-                atom_name[5] = 0;
-                strncpy( &res_name[0], backup + 19, 3 );
-                res_name[3] = 0;
-                chain_id = backup[23];
-                strncpy( &res_seq[0], backup + 25, 5 );
-                res_seq[5] = 0;
-                strncpy( &s_x[0], backup + 30, 10 );
-                s_x[10] = 0;
-                strncpy( &s_y[0], backup + 40, 10 );
-                s_y[10] = 0;
-                strncpy( &s_z[0], backup + 50, 10 );
-                s_z[10] = 0;
-                strncpy( &element[0], backup + 61, 5 );
-                element[5] = 0;
-                strncpy( &occupancy[0], backup + 66, 3 );
-                occupancy[3] = 0;
-                strncpy( &temp_factor[0], backup + 69, 2 );
-                temp_factor[2] = 0;
-                strncpy( &charge[0], backup + 72, 8 );
-                charge[8] = 0;
-            }
-            else if ( !strncmp(tokens[0], "HETATM", 6) )
-            {
-                /* bgf hetatm:
-                   (7x,i5,1x,a5,1x,a3,1x,a1,1x,a5,3f10.5,1x,a5,i3,i2,1x,f8.5) */
-                strncpy( &descriptor[0], backup, 6 );
-                descriptor[6] = 0;
-                strncpy( &serial[0], backup + 7, 5 );
-                serial[5] = 0;
-                strncpy( &atom_name[0], backup + 13, 5 );
-                atom_name[5] = 0;
-                strncpy( &res_name[0], backup + 19, 3 );
-                res_name[3] = 0;
-                chain_id = backup[23];
-                strncpy( &res_seq[0], backup + 25, 5 );
-                res_seq[5] = 0;
-                strncpy( &s_x[0], backup + 30, 10 );
-                s_x[10] = 0;
-                strncpy( &s_y[0], backup + 40, 10 );
-                s_y[10] = 0;
-                strncpy( &s_z[0], backup + 50, 10 );
-                s_z[10] = 0;
-                strncpy( &element[0], backup + 61, 5 );
-                element[5] = 0;
-                strncpy( &occupancy[0], backup + 66, 3 );
-                occupancy[3] = 0;
-                strncpy( &temp_factor[0], backup + 69, 2 );
-                temp_factor[2] = 0;
-                strncpy( &charge[0], backup + 72, 8 );
-                charge[8] = 0;
-            }
-
-
-            /* add to mapping */
-            bgf_serial = strtod( &serial[0], &endptr );
-            Check_Input_Range( bgf_serial, 0, MAX_ATOM_ID, "Invalid bgf serial" );
-            workspace->map_serials[ bgf_serial ] = atom_cnt;
-            workspace->orig_id[ atom_cnt ] = bgf_serial;
-            // fprintf( stderr, "map %d --> %d\n", bgf_serial, atom_cnt );
-
-
-            /* copy atomic positions */
-            system->atoms[atom_cnt].x[0] = strtod( &s_x[0], &endptr );
-            system->atoms[atom_cnt].x[1] = strtod( &s_y[0], &endptr );
-            system->atoms[atom_cnt].x[2] = strtod( &s_z[0], &endptr );
-
-
-            /* atom name and type */
-            //BGF_FIX
-            atom_name[4] = 0;
-            //BGF_FIX
-
-            strcpy( system->atoms[atom_cnt].name, atom_name );
-            Trim_Spaces( element );
-            system->atoms[atom_cnt].type =
-                Get_Atom_Type( &(system->reaxprm), element );
-
-            /* fprintf( stderr,
-            "a:%3d(%1d) c:%10.5f%10.5f%10.5f q:%10.5f occ:%s temp:%s seg_id:%s element:%s\n",
-             atom_cnt, system->atoms[ atom_cnt ].type,
-             system->atoms[ atom_cnt ].x[0],
-             system->atoms[ atom_cnt ].x[1], system->atoms[ atom_cnt ].x[2],
-             system->atoms[ atom_cnt ].q, occupancy, temp_factor,
-             seg_id, element ); */
-
-            atom_cnt++;
-        }
-        else if (!strncmp( tokens[0], "CRYSTX", 6 ))
-        {
-            sscanf( backup, BGF_CRYSTX_FORMAT,
-                    &descriptor[0],
-                    &s_a[0],
-                    &s_b[0],
-                    &s_c[0],
-                    &s_alpha[0],
-                    &s_beta[0],
-                    &s_gamma[0] );
-
-            /* Compute full volume tensor from the angles */
-            Init_Box_From_CRYST( atof(s_a),  atof(s_b), atof(s_c),
-                                 atof(s_alpha), atof(s_beta), atof(s_gamma),
-                                 &(system->box) );
-        }
-        else if (!strncmp( tokens[0], "CONECT", 6 ))
-        {
-            /* check number of restrictions */
-            Check_Input_Range( token_cnt - 2, 0, MAX_RESTRICT,
-                               "CONECT line exceeds max restrictions allowed.\n" );
-
-            /* read bond restrictions */
-            if ( is_Valid_Serial( workspace, bgf_serial = atoi(tokens[1]) ) )
-                ratom = workspace->map_serials[ bgf_serial ];
-
-            workspace->restricted[ ratom ] = token_cnt - 2;
-            for ( i = 2; i < token_cnt; ++i )
-                if ( is_Valid_Serial( workspace, bgf_serial = atoi(tokens[i]) ) )
-                    workspace->restricted_list[ (ratom * MAX_RESTRICT) + (i - 2) ] =
-                        workspace->map_serials[ bgf_serial ];
-
-            /* fprintf( stderr, "restriction on %d:", ratom );
-            for( i = 0; i < workspace->restricted[ ratom ]; ++i )
-             fprintf( stderr, "  %d", workspace->restricted_list[ratom][i] );
-             fprintf( stderr, "\n" ); */
-        }
-    }
-
-    fclose( bgf );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "bgf file read\n" );
-#endif
-
-    return 1;
-}
diff --git a/PuReMD-GPU/src/print_utils.c b/PuReMD-GPU/src/print_utils.c
index 913ff617a23f9f395a300a0985dec7ad36c33fab..d0f0e1bad12720a793074fbbc1e4d194ebb8fd8e 100644
--- a/PuReMD-GPU/src/print_utils.c
+++ b/PuReMD-GPU/src/print_utils.c
@@ -19,9 +19,11 @@
   ----------------------------------------------------------------------*/
 
 #include "print_utils.h"
+
+#include "geo_tools.h"
 #include "list.h"
-#include "pdb_tools.h"
 #include "system_props.h"
+#include "tool_box.h"
 #include "vector.h"
 
 
@@ -374,18 +376,6 @@ void Init_Force_Test_Functions( )
 #endif
 
 
-char *Get_Element( reax_system *system, int i )
-{
-    return &( system->reaxprm.sbp[system->atoms[i].type].name[0] );
-}
-
-
-char *Get_Atom_Name( reax_system *system, int i )
-{
-    return &(system->atoms[i].name[0]);
-}
-
-
 /* near nbrs contain both i-j, j-i nbrhood info */
 void Print_Near_Neighbors( reax_system *system, control_params *control,
                            static_storage *workspace, list **lists )
@@ -625,35 +615,48 @@ void Output_Results( reax_system *system, control_params *control,
                  data->E_vdW, data->E_Ele, data->E_Pol );
 #endif
 
-#ifdef __PRINT_CPU_RESULTS__
+#ifndef HAVE_CUDA
         t_elapsed = Get_Timing_Info( data->timing.total );
         if ( data->step == data->prev_steps )
             f_update = 1;
         else f_update = out_control->energy_update_freq;
 
-        fprintf( out_control->log, "%6d%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f\n",
+        fprintf( out_control->log, "%6d %10.2f %10.2f %10.2f %10.2f %10.2f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n",
                  data->step, t_elapsed / f_update,
                  data->timing.nbrs / f_update,
                  data->timing.init_forces / f_update,
                  data->timing.bonded / f_update,
                  data->timing.nonb / f_update,
                  data->timing.QEq / f_update,
-                 (double)data->timing.matvecs / f_update );
+                 data->timing.QEq_sort_mat_rows / f_update,
+                 (double)data->timing.solver_iters / f_update,
+                 data->timing.pre_comp / f_update,
+                 data->timing.pre_app / f_update,
+                 data->timing.solver_spmv / f_update,
+                 data->timing.solver_vector_ops / f_update,
+                 data->timing.solver_orthog / f_update,
+                 data->timing.solver_tri_solve / f_update );
 #else
         t_elapsed = Get_Timing_Info( d_timing.total );
         if ( data->step == data->prev_steps )
             f_update = 1;
         else f_update = out_control->energy_update_freq;
 
-        fprintf( out_control->log, "%6d%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f\n",
+        fprintf( out_control->log, "%6d %10.2f %10.2f %10.2f %10.2f %10.2f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n",
                  data->step, t_elapsed / f_update,
-                 d_timing.nbrs / f_update,
-                 d_timing.init_forces / f_update,
-                 d_timing.bonded / f_update,
-                 d_timing.nonb / f_update,
-                 d_timing.QEq / f_update,
-                 (double)d_timing.matvecs / f_update );
-
+                 d_timing.nbrs / f_update, /* d_timing is an object (cf. d_timing.total), not a pointer */
+                 d_timing.init_forces / f_update,
+                 d_timing.bonded / f_update,
+                 d_timing.nonb / f_update,
+                 d_timing.QEq / f_update,
+                 d_timing.QEq_sort_mat_rows / f_update,
+                 (double)d_timing.solver_iters / f_update,
+                 d_timing.pre_comp / f_update,
+                 d_timing.pre_app / f_update,
+                 d_timing.solver_spmv / f_update,
+                 d_timing.solver_vector_ops / f_update,
+                 d_timing.solver_orthog / f_update,
+                 d_timing.solver_tri_solve / f_update );
 #endif
 
         //fprintf (stderr, " total %10.5f \n", t_elapsed);
@@ -673,16 +676,32 @@ void Output_Results( reax_system *system, control_params *control,
         data->timing.init_forces = 0;
         data->timing.bonded = 0;
         data->timing.nonb = 0;
-        data->timing.QEq = 0;
-        data->timing.matvecs = 0;
-
+        data->timing.QEq = ZERO;
+        data->timing.QEq_sort_mat_rows = ZERO;
+        data->timing.pre_comp = ZERO;
+        data->timing.pre_app = ZERO;
+        data->timing.solver_iters = 0;
+        data->timing.solver_spmv = ZERO;
+        data->timing.solver_vector_ops = ZERO;
+        data->timing.solver_orthog = ZERO;
+        data->timing.solver_tri_solve = ZERO;
+
+#ifdef HAVE_CUDA
         d_timing.total = Get_Time( );
         d_timing.nbrs = 0;
         d_timing.init_forces = 0;
         d_timing.bonded = 0;
         d_timing.nonb = 0;
-        d_timing.QEq = 0;
-        d_timing.matvecs = 0;
+        d_timing.QEq = ZERO; /* same member access as d_timing.total/nbrs: d_timing is not a pointer */
+        d_timing.QEq_sort_mat_rows = ZERO;
+        d_timing.pre_comp = ZERO;
+        d_timing.pre_app = ZERO;
+        d_timing.solver_iters = 0;
+        d_timing.solver_spmv = ZERO;
+        d_timing.solver_vector_ops = ZERO;
+        d_timing.solver_orthog = ZERO;
+        d_timing.solver_tri_solve = ZERO;
+#endif
 
         fflush( out_control->out );
         fflush( out_control->pot );
@@ -716,16 +735,16 @@ void Output_Results( reax_system *system, control_params *control,
     if ( out_control->write_steps > 0 &&
             data->step % out_control->write_steps == 0 )
     {
-        // t_start = Get_Time( );
+        //t_start = Get_Time( );
         out_control->append_traj_frame( system, control, data,
                                         workspace, lists, out_control );
 
-        //Write_PDB( system, control, data, workspace, *lists+BONDS, out_control );
-        // t_elapsed = Get_Timing_Info( t_start );
-        // fprintf(stdout, "append_frame took %.6f seconds\n", t_elapsed );
+        //Write_PDB( system, *lists+BONDS, data, control, workspace, out_control );
+        //t_elapsed = Get_Timing_Info( t_start );
+        //fprintf(stdout, "append_frame took %.6f seconds\n", t_elapsed );
     }
 
-    // fprintf( stderr, "output_results... done\n" );
+    //fprintf( stderr, "output_results... done\n" );
 }
 
 
@@ -759,23 +778,46 @@ void Print_Linear_System( reax_system *system, control_params *control,
 
     sprintf( fname, "%s.H%d.out", control->sim_name, step );
     out = fopen( fname, "w" );
-    H = &workspace->H;
+    H = workspace->H;
 
     for ( i = 0; i < system->N; ++i )
     {
         for ( j = H->start[i]; j < H->start[i + 1] - 1; ++j )
         {
             fprintf( out, "%6d%6d %24.15e\n",
-                     workspace->orig_id[i], workspace->orig_id[H->entries[j].j],
-                     H->entries[j].val );
+                     workspace->orig_id[i], workspace->orig_id[H->j[j]],
+                     H->val[j] );
 
             fprintf( out, "%6d%6d %24.15e\n",
-                     workspace->orig_id[H->entries[j].j], workspace->orig_id[i],
-                     H->entries[j].val );
+                     workspace->orig_id[H->j[j]], workspace->orig_id[i],
+                     H->val[j] );
         }
         // the diagonal entry
         fprintf( out, "%6d%6d %24.15e\n",
-                 workspace->orig_id[i], workspace->orig_id[i], H->entries[j].val );
+                 workspace->orig_id[i], workspace->orig_id[i], H->val[j] );
+    }
+
+    fclose( out );
+
+    sprintf( fname, "%s.H_sp%d.out", control->sim_name, step );
+    out = fopen( fname, "w" );
+    H = workspace->H_sp;
+
+    for ( i = 0; i < system->N; ++i )
+    {
+        for ( j = H->start[i]; j < H->start[i + 1] - 1; ++j )
+        {
+            fprintf( out, "%6d%6d %24.15e\n",
+                     workspace->orig_id[i], workspace->orig_id[H->j[j]],
+                     H->val[j] );
+
+            fprintf( out, "%6d%6d %24.15e\n",
+                     workspace->orig_id[H->j[j]], workspace->orig_id[i],
+                     H->val[j] );
+        }
+        // the diagonal entry
+        fprintf( out, "%6d%6d %24.15e\n",
+                 workspace->orig_id[i], workspace->orig_id[i], H->val[j] );
     }
 
     fclose( out );
@@ -834,11 +876,11 @@ void Print_Sparse_Matrix( sparse_matrix *A )
 {
     int i, j;
 
-    for ( i = 0; i < 10; ++i )
+    for ( i = 0; i < A->n; ++i )
     {
         fprintf( stderr, "i:%d  j(val):", i );
         for ( j = A->start[i]; j < A->start[i + 1]; ++j )
-            fprintf( stderr, "%d(%.4f) ", A->entries[j].j, A->entries[j].val );
+            fprintf( stderr, "%d(%.4f) ", A->j[j], A->val[j] );
         fprintf( stderr, "\n" );
     }
 }
@@ -850,8 +892,14 @@ void Print_Sparse_Matrix2( sparse_matrix *A, char *fname )
     FILE *f = fopen( fname, "w" );
 
     for ( i = 0; i < A->n; ++i )
+    {
         for ( j = A->start[i]; j < A->start[i + 1]; ++j )
-            fprintf( f, "%d%d %.15e\n", A->entries[j].j, i, A->entries[j].val );
+        {
+            //fprintf( f, "%d%d %.15e\n", A->entries[j].j, i, A->entries[j].val );
+            //Convert 0-based to 1-based (for Matlab)
+            fprintf( f, "%6d%6d %24.15e\n", i+1, A->j[j]+1, A->val[j] );
+        }
+    }
 
     fclose(f);
 }
diff --git a/PuReMD-GPU/src/print_utils.h b/PuReMD-GPU/src/print_utils.h
index 5f479bdc99fa30c518f69b5a23fa88b19af1a306..46d08516e00b002792d507b5effd7bd1ee5d551d 100644
--- a/PuReMD-GPU/src/print_utils.h
+++ b/PuReMD-GPU/src/print_utils.h
@@ -23,28 +23,25 @@
 
 #include "mytypes.h"
 
+
 typedef void (*print_interaction)(reax_system*, control_params*, simulation_data*,
-                                  static_storage*, list**, output_controls*);
-extern print_interaction Print_Interactions[NO_OF_INTERACTIONS];
+        static_storage*, list**, output_controls*);
 
-char *Get_Element( reax_system*, int );
+extern print_interaction Print_Interactions[NO_OF_INTERACTIONS];
 
-char *Get_Atom_Name( reax_system*, int );
 
-void Print_Near_Neighbors( reax_system*, control_params*, static_storage*,
-                           list** );
+void Print_Near_Neighbors( reax_system*, control_params*, static_storage*, list** );
 
-void Print_Far_Neighbors( reax_system*, control_params*, static_storage*,
-                          list** );
+void Print_Far_Neighbors( reax_system*, control_params*, static_storage*, list** );
 
 void Print_Total_Force( reax_system*, control_params*, simulation_data*,
-                        static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 
 void Output_Results( reax_system*, control_params*, simulation_data*,
-                     static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 
 void Print_Bond_Orders( reax_system*, control_params*, simulation_data*,
-                        static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 
 void Print_Linear_System( reax_system*, control_params*, static_storage*, int );
 
@@ -61,23 +58,23 @@ void Print_Bond_List2( reax_system*, list*, char* );
 
 #ifdef TEST_FORCES
 void Dummy_Printer( reax_system*, control_params*, simulation_data*,
-                    static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 void Print_Bond_Forces( reax_system*, control_params*, simulation_data*,
-                        static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 void Print_LonePair_Forces( reax_system*, control_params*, simulation_data*,
-                            static_storage*, list**, output_controls* );
-void Print_OverUnderCoor_Forces(reax_system*, control_params*, simulation_data*,
-                                static_storage*, list**, output_controls*);
+        static_storage*, list**, output_controls* );
+void Print_OverUnderCoor_Forces(reax_system*, control_params*,
+        simulation_data*, static_storage*, list**, output_controls*);
 void Print_Three_Body_Forces( reax_system*, control_params*, simulation_data*,
-                              static_storage*, list**, output_controls* );
-void Print_Hydrogen_Bond_Forces(reax_system*, control_params*, simulation_data*,
-                                static_storage*, list**, output_controls*);
+        static_storage*, list**, output_controls* );
+void Print_Hydrogen_Bond_Forces(reax_system*, control_params*,
+        simulation_data*, static_storage*, list**, output_controls*);
 void Print_Four_Body_Forces( reax_system*, control_params*, simulation_data*,
-                             static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 void Print_vdW_Coulomb_Forces( reax_system*, control_params*, simulation_data*,
-                               static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 void Compare_Total_Forces( reax_system*, control_params*, simulation_data*,
-                           static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 void Init_Force_Test_Functions( );
 #endif
 
diff --git a/sPuReMD/src/QEq.c b/PuReMD-GPU/src/qeq.c
similarity index 92%
rename from sPuReMD/src/QEq.c
rename to PuReMD-GPU/src/qeq.c
index 026a3ae1a84e4912958737161818aea77080da44..c1e646bf6ba99040253195d3188b8b74d61fabff 100644
--- a/sPuReMD/src/QEq.c
+++ b/PuReMD-GPU/src/qeq.c
@@ -19,9 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "QEq.h"
+#include "qeq.h"
 
 #include "allocate.h"
+#include "index_utils.h"
 #include "list.h"
 #include "lin_alg.h"
 #include "print_utils.h"
@@ -1539,40 +1540,48 @@ static void Init_MatVec( const reax_system * const system, const control_params
     for ( i = 0; i < system->N; ++i )
     {
         // no extrapolation
-        //s_tmp = workspace->s[0][i];
-        //t_tmp = workspace->t[0][i];
+        //s_tmp = workspace->s[index_wkspace_sys(0,i,system->N)];
+        //t_tmp = workspace->t[index_wkspace_sys(0,i,system->N)];
 
         // linear
-        //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i];
-        //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i];
+        //s_tmp = 2 * workspace->s[index_wkspace_sys(0,i,system->N)] - workspace->s[index_wkspace_sys(1,i,system->N)];
+        //t_tmp = 2 * workspace->t[index_wkspace_sys(0,i,system->N)] - workspace->t[index_wkspace_sys(1,i,system->N)];
 
         // quadratic
-        //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]);
-        t_tmp = workspace->t[2][i] + 3 * (workspace->t[0][i] - workspace->t[1][i]);
+//        s_tmp = workspace->s[index_wkspace_sys(2,i,system->N)] +
+//            3 * (workspace->s[index_wkspace_sys(0,i,system->N)]-workspace->s[index_wkspace_sys(1,i,system->N)]);
+        t_tmp = workspace->t[index_wkspace_sys(2,i,system->N)] +
+            3 * (workspace->t[index_wkspace_sys(0,i,system->N)] -workspace->t[index_wkspace_sys(1,i,system->N)]);
 
         // cubic
-        s_tmp = 4 * (workspace->s[0][i] + workspace->s[2][i]) -
-                (6 * workspace->s[1][i] + workspace->s[3][i] );
-        //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) -
-        //  (6 * workspace->t[1][i] + workspace->t[3][i] );
+        s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,system->N)] + workspace->s[index_wkspace_sys(2,i,system->N)]) -
+            (6 * workspace->s[index_wkspace_sys(1,i,system->N)] + workspace->s[index_wkspace_sys(3,i,system->N)]);
+        //t_tmp = 4 * (workspace->t[index_wkspace_sys(0,i,system->N)] + workspace->t[index_wkspace_sys(2,i,system->N)]) -
+        //  (6 * workspace->t[index_wkspace_sys(1,i,system->N)] + workspace->t[index_wkspace_sys(3,i,system->N)] );
 
         // 4th order
-        //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) +
-        //  10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i];
-        //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) +
-        //  10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i];
-
-        workspace->s[4][i] = workspace->s[3][i];
-        workspace->s[3][i] = workspace->s[2][i];
-        workspace->s[2][i] = workspace->s[1][i];
-        workspace->s[1][i] = workspace->s[0][i];
-        workspace->s[0][i] = s_tmp;
-
-        workspace->t[4][i] = workspace->t[3][i];
-        workspace->t[3][i] = workspace->t[2][i];
-        workspace->t[2][i] = workspace->t[1][i];
-        workspace->t[1][i] = workspace->t[0][i];
-        workspace->t[0][i] = t_tmp;
+//        s_tmp = 5 * (workspace->s[index_wkspace_sys(0,i,system->N)] -
+//                workspace->s[index_wkspace_sys(3,i,system->N)]) + 10 *
+//            (-workspace->s[index_wkspace_sys(1,i,system->N)] +
+//             workspace->s[index_wkspace_sys(2,i,system->N)] ) +
+//            workspace->s[index_wkspace_sys(4,i,system->N)];
+//        t_tmp = 5 * (workspace->t[index_wkspace_sys(0,i,system->N)] -
+//                workspace->t[index_wkspace_sys(3,i,system->N)]) + 10 *
+//            (-workspace->t[index_wkspace_sys(1,i,system->N)] +
+//             workspace->t[index_wkspace_sys(2,i,system->N)] ) +
+//            workspace->t[index_wkspace_sys(4,i,system->N)];
+
+        workspace->s[index_wkspace_sys(4,i,system->N)] = workspace->s[index_wkspace_sys(3,i,system->N)];
+        workspace->s[index_wkspace_sys(3,i,system->N)] = workspace->s[index_wkspace_sys(2,i,system->N)]; 
+        workspace->s[index_wkspace_sys(2,i,system->N)] = workspace->s[index_wkspace_sys(1,i,system->N)];
+        workspace->s[index_wkspace_sys(1,i,system->N)] = workspace->s[index_wkspace_sys(0,i,system->N)];
+        workspace->s[index_wkspace_sys(0,i,system->N)] = s_tmp;
+
+        workspace->t[index_wkspace_sys(4,i,system->N)] = workspace->t[index_wkspace_sys(3,i,system->N)];
+        workspace->t[index_wkspace_sys(3,i,system->N)] = workspace->t[index_wkspace_sys(2,i,system->N)]; 
+        workspace->t[index_wkspace_sys(2,i,system->N)] = workspace->t[index_wkspace_sys(1,i,system->N)];
+        workspace->t[index_wkspace_sys(1,i,system->N)] = workspace->t[index_wkspace_sys(0,i,system->N)];
+        workspace->t[index_wkspace_sys(0,i,system->N)] = t_tmp;
     }
 }
 
@@ -1587,14 +1596,15 @@ static void Calculate_Charges( const reax_system * const system, static_storage
     s_sum = t_sum = 0.;
     for ( i = 0; i < system->N; ++i )
     {
-        s_sum += workspace->s[0][i];
-        t_sum += workspace->t[0][i];
+        s_sum += workspace->s[index_wkspace_sys(0,i,system->N)];
+        t_sum += workspace->t[index_wkspace_sys(0,i,system->N)];
     }
 
     u = s_sum / t_sum;
     for ( i = 0; i < system->N; ++i )
     {
-        system->atoms[i].q = workspace->s[0][i] - u * workspace->t[0][i];
+        system->atoms[i].q = workspace->s[index_wkspace_sys(0,i,system->N)]
+            - u * workspace->t[index_wkspace_sys(0,i,system->N)];
     }
 }
 
@@ -1614,49 +1624,32 @@ void QEq( reax_system * const system, control_params * const control, simulation
 
     Init_MatVec( system, control, data, workspace, far_nbrs );
 
-//    if( data->step == 0 || data->step == 100 )
-//    {
-//      Print_Linear_System( system, control, workspace, data->step );
-//    }
-
     switch ( control->qeq_solver_type )
     {
     case GMRES_S:
         iters = GMRES( workspace, control, data, workspace->H, workspace->b_s, control->qeq_solver_q_err,
-                       workspace->s[0], out_control->log,
-                       ((data->step - data->prev_steps) % control->pre_comp_refactor == 0) ? TRUE : FALSE );
+                &workspace->s[index_wkspace_sys(0,0,system->N)], out_control->log,
+                ((data->step - data->prev_steps) % control->pre_comp_refactor == 0) ? TRUE : FALSE );
         iters += GMRES( workspace, control, data, workspace->H, workspace->b_t, control->qeq_solver_q_err,
-                        workspace->t[0], out_control->log, FALSE );
+                &workspace->t[index_wkspace_sys(0,0,system->N)], out_control->log, FALSE );
         break;
     case GMRES_H_S:
         iters = GMRES_HouseHolder( workspace, control, data, workspace->H, workspace->b_s, control->qeq_solver_q_err,
-                                   workspace->s[0], out_control->log, (data->step - data->prev_steps) % control->pre_comp_refactor == 0 );
+                &workspace->s[index_wkspace_sys(0,0,system->N)], out_control->log, (data->step - data->prev_steps) % control->pre_comp_refactor == 0 );
         iters += GMRES_HouseHolder( workspace, control, data, workspace->H, workspace->b_t, control->qeq_solver_q_err,
-                                    workspace->t[0], out_control->log, 0 );
+                &workspace->t[index_wkspace_sys(0,0,system->N)], out_control->log, 0 );
         break;
     case CG_S:
         iters = CG( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
-                    workspace->s[0], out_control->log ) + 1;
+                &workspace->s[index_wkspace_sys(0,0,system->N)], out_control->log ) + 1;
         iters += CG( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
-                     workspace->t[0], out_control->log ) + 1;
-//            iters = CG( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
-//                    workspace->L, workspace->U, workspace->s[0], control->pre_app_type,
-//                    control->pre_app_jacobi_iters, out_control->log ) + 1;
-//            iters += CG( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
-//                    workspace->L, workspace->U, workspace->t[0], control->pre_app_type,
-//                    control->pre_app_jacobi_iters, out_control->log ) + 1;
+                &workspace->t[index_wkspace_sys(0,0,system->N)], out_control->log ) + 1;
         break;
     case SDM_S:
         iters = SDM( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
-                     workspace->s[0], out_control->log ) + 1;
+                &workspace->s[index_wkspace_sys(0,0,system->N)], out_control->log ) + 1;
         iters += SDM( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
-                      workspace->t[0], out_control->log ) + 1;
-//            iters = SDM( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
-//                    workspace->L, workspace->U, workspace->s[0], control->pre_app_type,
-//                    control->pre_app_jacobi_iters, out_control->log ) + 1;
-//            iters += SDM( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
-//                    workspace->L, workspace->U, workspace->t[0], control->pre_app_type,
-//                    control->pre_app_jacobi_iters, out_control->log ) + 1;
+                &workspace->t[index_wkspace_sys(0,0,system->N)], out_control->log ) + 1;
         break;
     default:
         fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" );
@@ -1671,12 +1664,4 @@ void QEq( reax_system * const system, control_params * const control, simulation
 #endif
 
     Calculate_Charges( system, workspace );
-
-    //fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n",
-    //   data->step,
-    //   workspace->s[0][0], workspace->t[0][0],
-    //   workspace->s[0][1], workspace->t[0][1],
-    //   workspace->s[0][2], workspace->t[0][2] );
-    // if( data->step == control->nsteps )
-    //Print_Charges( system, control, workspace, data->step );
 }
diff --git a/PuReMD-GPU/src/qeq.h b/PuReMD-GPU/src/qeq.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c6c7ea2ce396f5f3cd9a538b801f2658199b7d9
--- /dev/null
+++ b/PuReMD-GPU/src/qeq.h
@@ -0,0 +1,73 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __QEq_H_
+#define __QEq_H_
+
+#include "mytypes.h"
+
+
+void QEq( reax_system* const, control_params* const, simulation_data* const,
+          static_storage* const, const list* const,
+          const output_controls* const );
+
+
+//static inline HOST_DEVICE void swap( sparse_matrix_entry *array,
+//        int index1, int index2 ) 
+//{
+//    sparse_matrix_entry temp = array[index1];
+//    array[index1] = array[index2];
+//    array[index2] = temp;
+//}
+//
+//
+//static inline HOST_DEVICE void quick_sort( sparse_matrix_entry *array,
+//        int start, int end )
+//{
+//    int i = start;
+//    int k = end; 
+//
+//    if (end - start >= 1)  
+//    {  
+//        int pivot = array[start].j;
+//
+//        while (k > i) 
+//        {  
+//            while ((array[i].j <= pivot) && (i <= end) && (k > i))
+//            {
+//                i++;
+//            }
+//            while ((array[k].j > pivot) && (k >= start) && (k >= i))
+//            {
+//                k--;
+//            }
+//            if (k > i)
+//            {
+//                swap( array, i, k );
+//            }
+//        }  
+//        swap( array, start, k );
+//        quick_sort( array, start, k - 1 );
+//        quick_sort( array, k + 1, end );
+//    }  
+//}
+
+
+#endif
diff --git a/PuReMD-GPU/src/random.h b/PuReMD-GPU/src/random.h
index b19bc58e3dcef04a324b108be718bfbff3e5c06c..a936477278d06a989d50ab0faeafb8a737a4e5fd 100644
--- a/PuReMD-GPU/src/random.h
+++ b/PuReMD-GPU/src/random.h
@@ -58,7 +58,7 @@ static inline HOST_DEVICE double GRandom(double mean, double sigma)
         rsq = v1 * v1 + v2 * v2;
     }
 
-    return mean + v1 * sigma * sqrt(-2.0 * log(rsq) / rsq);
+    return mean + v1 * sigma * SQRT(-2.0 * LOG(rsq) / rsq);
 }
 
 
diff --git a/PuReMD-GPU/src/reset_utils.c b/PuReMD-GPU/src/reset_utils.c
index f79596aa9d29a65f673448d18a28c73c00444e43..ecb921bb00255081bec5470baaab070df8cb80ef 100644
--- a/PuReMD-GPU/src/reset_utils.c
+++ b/PuReMD-GPU/src/reset_utils.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -28,18 +29,20 @@ void Reset_Atoms( reax_system* system )
 {
     int i;
 
-    for( i = 0; i < system->N; ++i )
-        memset( system->atoms[i].f, 0.0, RVEC_SIZE );
+    for ( i = 0; i < system->N; ++i )
+    {
+        memset( system->atoms[i].f, 0, sizeof(rvec) ); /* fill value is an int byte, not a double */
+    }
 }
 
 
 void Reset_Pressures( simulation_data *data )
 {
-    rtensor_MakeZero( data->flex_bar.P );  
+    rtensor_MakeZero( data->flex_bar.P );
     data->iso_bar.P = 0;
     rvec_MakeZero( data->int_press );
     rvec_MakeZero( data->ext_press );
-    /* fprintf( stderr, "reset: ext_press (%12.6f %12.6f %12.6f)\n", 
+    /* fprintf( stderr, "reset: ext_press (%12.6f %12.6f %12.6f)\n",
        data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
 }
 
@@ -97,49 +100,57 @@ void Reset_Workspace( reax_system *system, static_storage *workspace )
 }
 
 
-void Reset_Neighbor_Lists( reax_system *system, control_params *control, 
+void Reset_Neighbor_Lists( reax_system *system, control_params *control,
         static_storage *workspace, list **lists )
 {
     int i, tmp;
     list *bonds = (*lists) + BONDS;
     list *hbonds = (*lists) + HBONDS;
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         tmp = Start_Index( i, bonds );
         Set_End_Index( i, tmp, bonds );
     }
 
-    //TODO check if this is needed
-    memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs );
+    //TODO: added for GPU, verify if correct
+    memset( bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs );
 
-    if( control->hb_cut > 0 )
-        for( i = 0; i < system->N; ++i )
-            if( system->reaxprm.sbp[system->atoms[i].type].p_hbond == 1) {
+    if ( control->hb_cut > 0 )
+    {
+        for ( i = 0; i < system->N; ++i )
+        {
+            if ( system->reaxprm.sbp[system->atoms[i].type].p_hbond == 1)
+            {
                 tmp = Start_Index( workspace->hbond_index[i], hbonds );
                 Set_End_Index( workspace->hbond_index[i], tmp, hbonds );
-                /* fprintf( stderr, "i:%d, hbond: %d-%d\n", 
-                   i, Start_Index( workspace->hbond_index[i], hbonds ), 
+                /* fprintf( stderr, "i:%d, hbond: %d-%d\n",
+                   i, Start_Index( workspace->hbond_index[i], hbonds ),
                    End_Index( workspace->hbond_index[i], hbonds ) );*/
             }
+        }
+    }
 }
 
 
-void Reset( reax_system *system, control_params *control,  
+void Reset( reax_system *system, control_params *control,
         simulation_data *data, static_storage *workspace, list **lists  )
 {
     Reset_Atoms( system );
 
     Reset_Simulation_Data( data );
 
-    if( control->ensemble == NPT || control->ensemble == sNPT || 
+    if ( control->ensemble == NPT || control->ensemble == sNPT ||
             control->ensemble == iNPT )
+    {
         Reset_Pressures( data );
+    }
 
-    Reset_Workspace( system, workspace );  
+    Reset_Workspace( system, workspace );
 
     Reset_Neighbor_Lists( system, control, workspace, lists );
 
-#if defined(DEBUG_FOCUS)  
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "reset - ");
 #endif
 }
@@ -147,16 +158,18 @@ void Reset( reax_system *system, control_params *control,
 
 void Reset_Grid( grid *g )
 {
-    memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2]);
+    memset( g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2] );
 }
 
 
+
 void Reset_Marks( grid *g, ivec *grid_stack, int grid_top )
 {
     int i;
 
-    for( i = 0; i < grid_top; ++i )
-        g->mark[grid_stack[i][0] * g->ncell[1]*g->ncell[2] + 
-            grid_stack[i][1] * g->ncell[2] + 
-            grid_stack[i][2]] = 0;
+    for ( i = 0; i < grid_top; ++i )
+    {
+        g->mark[grid_stack[i][0] * g->ncell[1]*g->ncell[2]
+            + grid_stack[i][1] * g->ncell[2] + grid_stack[i][2]] = 0;
+    }
 }
diff --git a/PuReMD-GPU/src/restart.c b/PuReMD-GPU/src/restart.c
index b6ccb014d91ad33cd337d7688345d06811b2c681..13abdecc8142c5f40b942b79c2e886246372576c 100644
--- a/PuReMD-GPU/src/restart.c
+++ b/PuReMD-GPU/src/restart.c
@@ -1,9 +1,10 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
@@ -19,11 +20,12 @@
   ----------------------------------------------------------------------*/
 
 #include "restart.h"
+
 #include "box.h"
 #include "vector.h"
 
 void Write_Binary_Restart( reax_system *system, control_params *control,
-                           simulation_data *data, static_storage *workspace )
+        simulation_data *data, static_storage *workspace )
 {
     int  i;
     char fname[MAX_STR];
@@ -65,8 +67,8 @@ void Write_Binary_Restart( reax_system *system, control_params *control,
 
 
 void Read_Binary_Restart( char *fname, reax_system *system,
-                          control_params *control, simulation_data *data,
-                          static_storage *workspace )
+        control_params *control, simulation_data *data,
+        static_storage *workspace )
 {
     int i;
     FILE *fres;
@@ -103,14 +105,13 @@ void Read_Binary_Restart( char *fname, reax_system *system,
 
     workspace->map_serials = (int*) calloc( MAX_ATOM_ID, sizeof(int) );
     for ( i = 0; i < MAX_ATOM_ID; ++i )
+    {
         workspace->map_serials[i] = -1;
+    }
 
     workspace->orig_id = (int*) calloc( system->N, sizeof(int) );
     workspace->restricted  = (int*) calloc( system->N, sizeof(int) );
     workspace->restricted_list = (int*) calloc( system->N * MAX_RESTRICT, sizeof(int) );
-    //CHANGE
-    //for( i = 0; i < system->N; ++i )
-    // workspace->restricted_list[i] = (int*) calloc( MAX_RESTRICT, sizeof(int) );
 
     for ( i = 0; i < system->N; ++i )
     {
@@ -175,8 +176,7 @@ void Write_ASCII_Restart( reax_system *system, control_params *control,
 
 
 void Read_ASCII_Restart( char *fname, reax_system *system,
-                         control_params *control, simulation_data *data,
-                         static_storage *workspace )
+        control_params *control, simulation_data *data, static_storage *workspace )
 {
     int i;
     FILE *fres;
@@ -185,8 +185,7 @@ void Read_ASCII_Restart( char *fname, reax_system *system,
     fres = fopen( fname, "r" );
 
     /* header */
-    //fscanf( fres, READ_RESTART_HEADER,
-    fscanf( fres, RESTART_HEADER,
+    fscanf( fres, READ_RESTART_HEADER,
             &data->prev_steps, &system->N, &data->therm.T, &data->therm.xi,
             &data->therm.v_xi, &data->therm.v_xi_old, &data->therm.G_xi,
             &system->box.box[0][0], &system->box.box[0][1], &system->box.box[0][2],
@@ -194,7 +193,7 @@ void Read_ASCII_Restart( char *fname, reax_system *system,
             &system->box.box[2][0], &system->box.box[2][1], &system->box.box[2][2]);
     Make_Consistent( &(system->box) );
 
-//#if defined(DEBUG_FOCUS)
+#if defined(DEBUG_FOCUS)
     fprintf( stderr, "restart step: %d\n", data->prev_steps );
     fprintf( stderr, "restart thermostat: %10.6f %10.6f %10.6f %10.6f %10.6f\n",
              data->therm.T, data->therm.xi,
@@ -204,22 +203,20 @@ void Read_ASCII_Restart( char *fname, reax_system *system,
              system->box.box[0][0], system->box.box[0][1], system->box.box[0][2],
              system->box.box[1][0], system->box.box[1][1], system->box.box[1][2],
              system->box.box[2][0], system->box.box[2][1], system->box.box[2][2] );
-    fprintf ( stderr, "Total Atoms read: %d \n", system->N);
-//#endif
+#endif
 
     /* memory allocations for atoms, atom maps, bond restrictions */
     system->atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
 
     workspace->map_serials = (int*) calloc( MAX_ATOM_ID, sizeof(int) );
     for ( i = 0; i < MAX_ATOM_ID; ++i )
+    {
         workspace->map_serials[i] = -1;
+    }
 
     workspace->orig_id = (int*) calloc( system->N, sizeof(int) );
     workspace->restricted  = (int*) calloc( system->N, sizeof(int) );
     workspace->restricted_list = (int*) calloc( system->N * MAX_RESTRICT, sizeof(int) );
-    //CHANGE
-    //for( i = 0; i < system->N; ++i )
-    // workspace->restricted_list[i] = (int*) calloc( MAX_RESTRICT, sizeof(int) );
 
     for ( i = 0; i < system->N; ++i )
     {
@@ -240,11 +237,15 @@ void Read_ASCII_Restart( char *fname, reax_system *system,
 
 
 void Write_Restart( reax_system *system, control_params *control,
-                    simulation_data *data, static_storage *workspace,
-                    output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        output_controls *out_control )
 {
     if ( out_control->restart_format == WRITE_ASCII )
+    {
         Write_ASCII_Restart( system, control, data, workspace );
+    }
     else if ( out_control->restart_format == WRITE_BINARY )
+    {
         Write_Binary_Restart( system, control, data, workspace );
+    }
 }
diff --git a/PuReMD-GPU/src/single_body_interactions.c b/PuReMD-GPU/src/single_body_interactions.c
index b26f493e703819f066389991a4845acab113b326..4c5824dd9862770863aa3e3299ca4f1f691c561e 100644
--- a/PuReMD-GPU/src/single_body_interactions.c
+++ b/PuReMD-GPU/src/single_body_interactions.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -27,11 +28,8 @@
 #include "vector.h"
 
 
-void LonePair_OverUnder_Coordination_Energy( reax_system *system, 
-        control_params *control, 
-        simulation_data *data,
-        static_storage *workspace, 
-        list **lists, 
+void LonePair_OverUnder_Coordination_Energy( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
         output_controls *out_control )
 {
     int i, j, pj, type_i, type_j;
@@ -49,7 +47,7 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
     bond_data *pbond;
-    bond_order_data *bo_ij; 
+    bond_order_data *bo_ij;
     list *bonds = (*lists) + BONDS;
 
     /* Initialize parameters */
@@ -61,64 +59,71 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
     p_ovun7 = system->reaxprm.gp.l[8];
     p_ovun8 = system->reaxprm.gp.l[9];
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         /* set the parameter pointer */
         type_i = system->atoms[i].type;
         sbp_i = &(system->reaxprm.sbp[ type_i ]);
 
         /* lone-pair Energy */
-        p_lp2 = sbp_i->p_lp2;      
+        p_lp2 = sbp_i->p_lp2;
         expvd2 = EXP( -75 * workspace->Delta_lp[i] );
         inv_expvd2 = 1. / (1. + expvd2 );
 
         /* calculate the energy */
-        data->E_Lp += e_lp = 
-            p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
+        data->E_Lp += e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
 
-        dElp = p_lp2 * inv_expvd2 + 
-            75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
+        dElp = p_lp2 * inv_expvd2 +
+               75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
         CElp = dElp * workspace->dDelta_lp[i];
 
         workspace->CdDelta[i] += CElp;      // lp - 1st term
 
 #ifdef TEST_ENERGY
-        fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", 
-                p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp );
+        fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n",
+                 p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp );
         fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n",
-                workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp );
+                 workspace->orig_id[i] + 1, workspace->nlp[i], e_lp, data->E_Lp );
 #endif
+
 #ifdef TEST_FORCES
         Add_dDelta( system, lists, i, CElp, workspace->f_lp );  // lp - 1st term
 #endif
 
         /* correction for C2 */
-        if( system->reaxprm.gp.l[5] > 0.001 && 
+        if ( system->reaxprm.gp.l[5] > 0.001 &&
                 !strcmp( system->reaxprm.sbp[type_i].name, "C" ) )
-            for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
-                if( i < bonds->select.bond_list[pj].nbr ) {
+        {
+            for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+            {
+                if ( i < bonds->select.bond_list[pj].nbr )
+                {
                     j = bonds->select.bond_list[pj].nbr;
                     type_j = system->atoms[j].type;
 
-                    if( !strcmp( system->reaxprm.sbp[type_j].name, "C" ) ) {
+                    if ( !strcmp( system->reaxprm.sbp[type_j].name, "C" ) )
+                    {
                         twbp = &( system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]);
                         bo_ij = &( bonds->select.bond_list[pj].bo_data );
                         Di = workspace->Delta[i];
-                        vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
+                        vov3 = bo_ij->BO - Di - 0.040 * POW(Di, 4.);
 
-                        if( vov3 > 3. ) {
-                            data->E_Lp += e_lph = p_lp3 * SQR(vov3-3.0);
+                        if ( vov3 > 3. )
+                        {
+                            data->E_Lp += e_lph = p_lp3 * SQR(vov3 - 3.0);
                             //estrain(i) += e_lph;
 
-                            deahu2dbo = 2.*p_lp3*(vov3 - 3.);
-                            deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
+                            deahu2dbo = 2.*p_lp3 * (vov3 - 3.);
+                            deahu2dsbo = 2.*p_lp3 * (vov3 - 3.) * (-1. - 0.16 * POW(Di, 3.));
 
                             bo_ij->Cdbo += deahu2dbo;
                             workspace->CdDelta[i] += deahu2dsbo;
 #ifdef TEST_ENERGY
-                            fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n",
+                            fprintf(out_control->elp, "C2cor%6d%6d%23.15e%23.15e%23.15e\n",
                                     // workspace->orig_id[i], workspace->orig_id[j],
-                                    i+1, j+1, e_lph, deahu2dbo, deahu2dsbo );
+                                    i + 1, j + 1, e_lph, deahu2dbo, deahu2dsbo );
 #endif
+
 #ifdef TEST_FORCES
                             Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp);
                             Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp);
@@ -127,44 +132,52 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
                     }
 
                 }
+            }
+        }
     }
 
-
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         type_i = system->atoms[i].type;
         sbp_i = &(system->reaxprm.sbp[ type_i ]);
 
         /* over-coordination energy */
-        if( sbp_i->mass > 21.0 ) 
+        if ( sbp_i->mass > 21.0 )
+        {
             dfvl = 0.0;
-        else dfvl = 1.0; // only for 1st-row elements
+        }
+        else
+        {
+            dfvl = 1.0; // only for 1st-row elements
+        }
 
         p_ovun2 = sbp_i->p_ovun2;
         sum_ovun1 = 0;
         sum_ovun2 = 0;
 
-        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) {
+        for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+        {
             j = bonds->select.bond_list[pj].nbr;
-            type_j = system->atoms[j].type;      
+            type_j = system->atoms[j].type;
             bo_ij = &(bonds->select.bond_list[pj].bo_data);
             sbp_j = &(system->reaxprm.sbp[ type_j ]);
             twbp = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]);
 
             sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
-            sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
-                ( bo_ij->BO_pi + bo_ij->BO_pi2 );
-
-            /*fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", 
-              i+1, j+1, 
-              dfvl * workspace->Delta_lp_temp[j],
-              sbp_j->nlp_opt,
-              workspace->nlp_temp[j] );*/
+            sum_ovun2 += (workspace->Delta[j] - dfvl * workspace->Delta_lp_temp[j]) *
+                         ( bo_ij->BO_pi + bo_ij->BO_pi2 );
+
+            /*fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n",
+            i+1, j+1,
+            dfvl * workspace->Delta_lp_temp[j],
+            sbp_j->nlp_opt,
+            workspace->nlp_temp[j] );*/
         }
 
         exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
         inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
-        Delta_lpcorr  = workspace->Delta[i] - 
-            (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
+        Delta_lpcorr  = workspace->Delta[i] -
+            (dfvl * workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
 
         exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr );
         inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
@@ -175,11 +188,11 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
         data->E_Ov += e_ov = sum_ovun1 * CEover1;
 
         CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
-            ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
+            ( 1.0 - Delta_lpcorr * ( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
 
-        CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 );
+        CEover3 = CEover2 * (1.0 - dfvl * workspace->dDelta_lp[i] * inv_exp_ovun1 );
 
-        CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * 
+        CEover4 = CEover2 * (dfvl * workspace->Delta_lp_temp[i]) *
             p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
 
 
@@ -193,14 +206,13 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
         inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
         inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
 
-        data->E_Un += e_un =
-            -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
+        data->E_Un += e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
 
-        CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
+        CEunder1 = inv_exp_ovun2n * ( p_ovun5 * p_ovun6 * exp_ovun6 * inv_exp_ovun8 +
                 p_ovun2 * e_un * exp_ovun2n);
         CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
-        CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1);
-        CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * 
+        CEunder3 = CEunder1 * (1.0 - dfvl * workspace->dDelta_lp[i] * inv_exp_ovun1);
+        CEunder4 = CEunder1 * (dfvl * workspace->Delta_lp_temp[i]) *
             p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
 
         //fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n",
@@ -215,100 +227,98 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
         Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st
 #endif
 
-
-        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
+        for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+        {
             pbond = &(bonds->select.bond_list[pj]);
             j = pbond->nbr;
             type_j = system->atoms[j].type;
             bo_ij = &(pbond->bo_data);
             twbp  = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]);
 
-
-            bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st  
-            workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])*
-                (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a
-            bo_ij->Cdbopi += CEover4 * 
-                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
-            bo_ij->Cdbopi2 += CEover4 * 
-                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
+            bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st
+            workspace->CdDelta[j] += CEover4 * (1.0 - dfvl * workspace->dDelta_lp[j]) *
+                                     (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a
+            bo_ij->Cdbopi += CEover4 *
+                (workspace->Delta[j] - dfvl * workspace->Delta_lp_temp[j]); //OvCoor-3b
+            bo_ij->Cdbopi2 += CEover4 *
+                (workspace->Delta[j] - dfvl * workspace->Delta_lp_temp[j]); //OvCoor-3b
 
 
-            workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) *
+            workspace->CdDelta[j] += CEunder4 * (1.0 - dfvl * workspace->dDelta_lp[j]) *
                 (bo_ij->BO_pi + bo_ij->BO_pi2);   // UnCoor - 2a
-            bo_ij->Cdbopi += CEunder4 * 
-                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
-            bo_ij->Cdbopi2 += CEunder4 * 
-                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
-
+            bo_ij->Cdbopi += CEunder4 *
+                (workspace->Delta[j] - dfvl * workspace->Delta_lp_temp[j]); //UnCoor-2b
+            bo_ij->Cdbopi2 += CEunder4 *
+                (workspace->Delta[j] - dfvl * workspace->Delta_lp_temp[j]); //UnCoor-2b
 
 #ifdef TEST_ENERGY
             /* fprintf( out_control->eov, "%6d%23.15e%23.15e"
-               workspace->orig_id[j]+1,
-            //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2,
-            CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); */
-
-            /*fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
-              workspace->orig_id[j]+1, 
-              CEover4,
-              CEover4*
-              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-              CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), 
-              (1.0 - dfvl*workspace->dDelta_lp[j]),
-              CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-              (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
+            workspace->orig_id[j]+1,
+             //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2,
+             CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); */
+
+            /*fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+            workspace->orig_id[j]+1,
+            CEover4,
+            CEover4*
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+            CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2),
+            (1.0 - dfvl*workspace->dDelta_lp[j]),
+            CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) *
+            (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
 
             /* fprintf( out_control->eun, "%6d%23.15e\n",
-               workspace->orig_id[j]+1, CEunder3 ); */
+            workspace->orig_id[j]+1, CEunder3 ); */
 
             /*fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n",
-              workspace->orig_id[j]+1,
-              CEunder4,
-              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-              CEunder4*
-              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-              CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])*
-              (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
+            workspace->orig_id[j]+1,
+            CEunder4,
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+            CEunder4*
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+            CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])*
+            (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
 #endif
 
 #ifdef TEST_FORCES
-            Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, 
-                    workspace->f_ov ); // OvCoor - 1st term
+            Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s,
+                     workspace->f_ov ); // OvCoor - 1st term
 
             Add_dDelta( system, lists, j,
-                    CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-                    (bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a
+                        CEover4 * (1.0 - dfvl * workspace->dDelta_lp[j]) *
+                        (bo_ij->BO_pi + bo_ij->BO_pi2), workspace->f_ov ); //OvCoor3a
 
-            Add_dBOpinpi2( system, lists, i, pj, 
-                    CEover4 * (workspace->Delta[j] - 
-                        dfvl * workspace->Delta_lp_temp[j]),
-                    CEover4 * (workspace->Delta[j] - 
-                        dfvl * workspace->Delta_lp_temp[j]),
-                    workspace->f_ov, workspace->f_ov ); // OvCoor - 3b
+            Add_dBOpinpi2( system, lists, i, pj,
+                           CEover4 * (workspace->Delta[j] -
+                                      dfvl * workspace->Delta_lp_temp[j]),
+                           CEover4 * (workspace->Delta[j] -
+                                      dfvl * workspace->Delta_lp_temp[j]),
+                           workspace->f_ov, workspace->f_ov ); // OvCoor - 3b
 
             Add_dDelta( system, lists, j,
-                    CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-                    (bo_ij->BO_pi + bo_ij->BO_pi2),
-                    workspace->f_un ); // UnCoor - 2a
-
-            Add_dBOpinpi2( system, lists, i, pj, 
-                    CEunder4 * (workspace->Delta[j] - 
-                        dfvl * workspace->Delta_lp_temp[j]),
-                    CEunder4 * (workspace->Delta[j] - 
-                        dfvl * workspace->Delta_lp_temp[j]),
-                    workspace->f_un, workspace->f_un ); // UnCoor - 2b
+                        CEunder4 * (1.0 - dfvl * workspace->dDelta_lp[j]) *
+                        (bo_ij->BO_pi + bo_ij->BO_pi2),
+                        workspace->f_un ); // UnCoor - 2a
+
+            Add_dBOpinpi2( system, lists, i, pj,
+                           CEunder4 * (workspace->Delta[j] -
+                                       dfvl * workspace->Delta_lp_temp[j]),
+                           CEunder4 * (workspace->Delta[j] -
+                                       dfvl * workspace->Delta_lp_temp[j]),
+                           workspace->f_un, workspace->f_un ); // UnCoor - 2b
 #endif
         }
 
-#ifdef TEST_ENERGY      
+#ifdef TEST_ENERGY
 
-        fprintf( out_control->eov, "%6d%15.8f%15.8f%15.8f\n", 
-                i+1, DlpVi, Delta_lpcorr, sbp_i->valency ); 
+        fprintf( out_control->eov, "%6d%15.8f%15.8f%15.8f\n",
+                 i + 1, DlpVi, Delta_lpcorr, sbp_i->valency );
 
-        fprintf( out_control->eov, "%6d%15.8f%15.8f\n", 
-                i+1/*workspace->orig_id[i]+1*/, e_ov, data->E_Ov + data->E_Un );
+        fprintf( out_control->eov, "%6d%15.8f%15.8f\n",
+                 i + 1/*workspace->orig_id[i]+1*/, e_ov, data->E_Ov + data->E_Un );
 
-        fprintf( out_control->eov, "%6d%15.8f%15.8f\n", 
-                i+1/*workspace->orig_id[i]+1*/, e_un, data->E_Ov + data->E_Un );
+        fprintf( out_control->eov, "%6d%15.8f%15.8f\n",
+                 i + 1/*workspace->orig_id[i]+1*/, e_un, data->E_Ov + data->E_Un );
 #endif
     }
 }
diff --git a/PuReMD-GPU/src/system_props.c b/PuReMD-GPU/src/system_props.c
index 0126b86b776dce8fd30aea0c228731b95104b216..fc93a474cf378f1a382d0ae017cf15a9b23eb17a 100644
--- a/PuReMD-GPU/src/system_props.c
+++ b/PuReMD-GPU/src/system_props.c
@@ -1,67 +1,50 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
 #include "system_props.h"
-
-#include "box.h"
+#include "tool_box.h"
 #include "vector.h"
 
 
-HOST real Get_Time( )
-{
-    struct timeval tim;
-
-    gettimeofday(&tim, NULL );
-    return( tim.tv_sec + (tim.tv_usec / 1000000.0) );
-}
-
-
-HOST real Get_Timing_Info( real t_start )
-{
-    struct timeval tim;
-    real t_end;
-
-    gettimeofday(&tim, NULL );
-    t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
-    return (t_end - t_start);
-}
-
-
-void Temperature_Control( control_params *control, simulation_data *data, 
-        output_controls *out_control )
+void Temperature_Control( control_params *control, simulation_data *data,
+                          output_controls *out_control )
 {
     real tmp;
 
-    if( control->T_mode == 1 ) { // step-wise temperature control
-        if( (data->step - data->prev_steps) % 
-                ((int)(control->T_freq / control->dt)) == 0 ) {
-            if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) )
+    if ( control->T_mode == 1 )  // step-wise temperature control
+    {
+        if ( (data->step - data->prev_steps) %
+                ((int)(control->T_freq / control->dt)) == 0 )
+        {
+            if ( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) )
                 control->T += control->T_rate;
-            else control->T = control->T_final;     
+            else control->T = control->T_final;
         }
     }
-    else if( control->T_mode == 2 ) { // constant slope control
+    else if ( control->T_mode == 2 )  // constant slope control
+    {
         tmp = control->T_rate * control->dt / control->T_freq;
 
-        if( fabs( control->T - control->T_final ) >= fabs( tmp ) )
-            control->T += tmp;       
+        if ( fabs( control->T - control->T_final ) >= fabs( tmp ) )
+            control->T += tmp;
     }
 }
 
@@ -69,39 +52,34 @@ void Temperature_Control( control_params *control, simulation_data *data,
 void Compute_Total_Mass( reax_system *system, simulation_data *data )
 {
     int i;
-    int blocks;
-    int block_size;
-    real    *partial_sums = 0;
 
     data->M = 0;
 
-    for( i = 0; i < system->N; i++ ) 
-        data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass;  
+    for ( i = 0; i < system->N; i++ )
+        data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass;
 
-    data->inv_M = 1. / data->M;    
+    /* NOTE: data->M remains 0 when system->N == 0, so this divides by zero */
+    data->inv_M = 1. / data->M;
 }
 
 
-void Compute_Center_of_Mass( reax_system *system, simulation_data *data, 
-        FILE *fout )
+void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
+                             FILE *fout )
 {
     int i;
     real m, xx, xy, xz, yy, yz, zz, det;
     rvec tvec, diff;
     rtensor mat, inv;
 
-    int blocks;
-    int block_size;
-    rvec *l_xcm, *l_vcm, *l_amcm;
-    real t_start, t_end;
-
     rvec_MakeZero( data->xcm );  // position of CoM
     rvec_MakeZero( data->vcm );  // velocity of CoM
     rvec_MakeZero( data->amcm ); // angular momentum of CoM
     rvec_MakeZero( data->avcm ); // angular velocity of CoM
 
+
     /* Compute the position, velocity and angular momentum about the CoM */
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
 
         rvec_ScaledAdd( data->xcm, m, system->atoms[i].x );
@@ -111,11 +89,13 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
         rvec_ScaledAdd( data->amcm, m, tvec );
 
         /*fprintf( fout,"%3d  %g %g %g\n",
-          i+1, 
+          i+1,
           system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2]  );
-          fprintf( fout, "vcm:  %g %g %g\n", 
-          data->vcm[0], data->vcm[1], data->vcm[2] );  
-         */
+          fprintf( fout, "vcm:  %g %g %g\n",
+          data->vcm[0], data->vcm[1], data->vcm[2] );
+        */
+        /* fprintf( stderr, "amcm: %12.6f %12.6f %12.6f\n",
+           data->amcm[0], data->amcm[1], data->amcm[2] ); */
     }
 
     rvec_Scale( data->xcm, data->inv_M, data->xcm );
@@ -129,7 +109,8 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
     /* Calculate and then invert the inertial tensor */
     xx = xy = xz = yy = yz = zz = 0;
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
 
         rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm );
@@ -138,19 +119,10 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
         xz += diff[0] * diff[2] * m;
         yy += diff[1] * diff[1] * m;
         yz += diff[1] * diff[2] * m;
-        zz += diff[2] * diff[2] * m;      
+        zz += diff[2] * diff[2] * m;
     }
 
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " xx: %f \n", xx);
-    fprintf (stderr, " xy: %f \n", xy);
-    fprintf (stderr, " xz: %f \n", xz);
-    fprintf (stderr, " yy: %f \n", yy);
-    fprintf (stderr, " yz: %f \n", yz);
-    fprintf (stderr, " zz: %f \n", zz);
-#endif
-
-    mat[0][0] = yy + zz;     
+    mat[0][0] = yy + zz;
     mat[0][1] = mat[1][0] = -xy;
     mat[0][2] = mat[2][0] = -xz;
     mat[1][1] = xx + zz;
@@ -158,12 +130,12 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
     mat[2][2] = xx + yy;
 
     /* invert the inertial tensor */
-    det = ( mat[0][0] * mat[1][1] * mat[2][2] + 
-            mat[0][1] * mat[1][2] * mat[2][0] + 
+    det = ( mat[0][0] * mat[1][1] * mat[2][2] +
+            mat[0][1] * mat[1][2] * mat[2][0] +
             mat[0][2] * mat[1][0] * mat[2][1] ) -
-        ( mat[0][0] * mat[1][2] * mat[2][1] + 
-          mat[0][1] * mat[1][0] * mat[2][2] + 
-          mat[0][2] * mat[1][1] * mat[2][0] );
+          ( mat[0][0] * mat[1][2] * mat[2][1] +
+            mat[0][1] * mat[1][0] * mat[2][2] +
+            mat[0][2] * mat[1][1] * mat[2][0] );
 
     inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
     inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
@@ -175,33 +147,33 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
     inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
     inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
 
-    if( fabs(det) > ALMOST_ZERO )
-        rtensor_Scale( inv, 1./det, inv );
-    else 
+    if ( fabs(det) > ALMOST_ZERO )
+        rtensor_Scale( inv, 1. / det, inv );
+    else
         rtensor_MakeZero( inv );
 
     /* Compute the angular velocity about the centre of mass */
-    rtensor_MatVec( data->avcm, inv, data->amcm );  
+    rtensor_MatVec( data->avcm, inv, data->amcm );
     data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
 
 #if defined(DEBUG)
-    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",  
-            data->xcm[0], data->xcm[1], data->xcm[2] );
-    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n", 
-            data->vcm[0], data->vcm[1], data->vcm[2] );
-    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", 
-            data->amcm[0], data->amcm[1], data->amcm[2] );
+    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",
+             data->xcm[0], data->xcm[1], data->xcm[2] );
+    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n",
+             data->vcm[0], data->vcm[1], data->vcm[2] );
+    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n",
+             data->amcm[0], data->amcm[1], data->amcm[2] );
     /* fprintf( fout, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
-       mat[0][0], mat[0][1], mat[0][2], 
-       mat[1][0], mat[1][1], mat[1][2], 
+       mat[0][0], mat[0][1], mat[0][2],
+       mat[1][0], mat[1][1], mat[1][2],
        mat[2][0], mat[2][1], mat[2][2] );
        fprintf( fout, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
-       inv[0][0], inv[0][1], inv[0][2], 
-       inv[1][0], inv[1][1], inv[1][2], 
+       inv[0][0], inv[0][1], inv[0][2],
+       inv[1][0], inv[1][1], inv[1][2],
        inv[2][0], inv[2][1], inv[2][2] );
        fflush( fout ); */
-    fprintf( stderr, "avcm:  %24.15e %24.15e %24.15e\n", 
-            data->avcm[0], data->avcm[1], data->avcm[2] );
+    fprintf( stderr, "avcm:  %24.15e %24.15e %24.15e\n",
+             data->avcm[0], data->avcm[1], data->avcm[2] );
 #endif
 }
 
@@ -214,7 +186,8 @@ void Compute_Kinetic_Energy( reax_system* system, simulation_data* data )
 
     data->E_Kin = 0.0;
 
-    for (i=0; i < system->N; i++) {
+    for (i = 0; i < system->N; i++)
+    {
         m = system->reaxprm.sbp[system->atoms[i].type].mass;
 
         rvec_Scale( p, m, system->atoms[i].v );
@@ -232,17 +205,17 @@ void Compute_Kinetic_Energy( reax_system* system, simulation_data* data )
 }
 
 
-/* IMPORTANT: This function assumes that current kinetic energy and 
- *  the center of mass of the system is already computed before. 
+/* IMPORTANT: This function assumes that current kinetic energy and
+ *  the center of mass of the system is already computed before.
  *
- * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs 
- *  to be added when there are long-range interactions or long-range 
+ * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs
+ *  to be added when there are long-range interactions or long-range
  *  corrections to short-range interactions present.
- *  We may want to add that for more accuracy. 
+ *  We may want to add that for more accuracy.
  */
-void Compute_Pressure_Isotropic( reax_system* system, control_params *control, 
-        simulation_data* data, 
-        output_controls *out_control )
+void Compute_Pressure_Isotropic( reax_system* system, control_params *control,
+                                 simulation_data* data,
+                                 output_controls *out_control )
 {
     int i;
     reax_atom *p_atom;
@@ -254,8 +227,10 @@ void Compute_Pressure_Isotropic( reax_system* system, control_params *control,
     rvec_MakeZero( data->int_press );
 
     // 0: both int and ext, 1: ext only, 2: int only
-    if( control->press_mode == 0 || control->press_mode == 2 ) {
-        for( i = 0; i < system->N; ++i ) {
+    if ( control->press_mode == 0 || control->press_mode == 2 )
+    {
+        for ( i = 0; i < system->N; ++i )
+        {
             p_atom = &( system->atoms[i] );
 
             /* transform x into unitbox coordinates */
@@ -265,13 +240,14 @@ void Compute_Pressure_Isotropic( reax_system* system, control_params *control,
             rvec_Multiply( tmp, p_atom->f, tx );
             rvec_Add( data->int_press, tmp );
 
-            if( out_control->debug_level > 0 ) {
-                fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f", 
-                        i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] );
-                fprintf( out_control->prs, "%8.2f%8.2f%8.2f", 
-                        p_atom->f[0], p_atom->f[1], p_atom->f[2] );
-                fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n", 
-                        data->int_press[0],data->int_press[1],data->int_press[2]);
+            if ( out_control->debug_level > 0 )
+            {
+                fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f",
+                         i + 1, p_atom->x[0], p_atom->x[1], p_atom->x[2] );
+                fprintf( out_control->prs, "%8.2f%8.2f%8.2f",
+                         p_atom->f[0], p_atom->f[1], p_atom->f[2] );
+                fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n",
+                         data->int_press[0], data->int_press[1], data->int_press[2]);
             }
         }
     }
@@ -279,53 +255,53 @@ void Compute_Pressure_Isotropic( reax_system* system, control_params *control,
     /* kinetic contribution */
     data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. * box->volume * P_CONV );
 
-    /* Calculate total pressure in each direction */  
-    data->tot_press[0] = data->kin_press - 
-        ((data->int_press[0] + data->ext_press[0]) /
-         (box->box_norms[1] * box->box_norms[2] * P_CONV));
+    /* Calculate total pressure in each direction */
+    data->tot_press[0] = data->kin_press -
+                         ((data->int_press[0] + data->ext_press[0]) /
+                          (box->box_norms[1] * box->box_norms[2] * P_CONV));
 
-    data->tot_press[1] = data->kin_press - 
-        ((data->int_press[1] + data->ext_press[1])/
-         (box->box_norms[0] * box->box_norms[2] * P_CONV));
+    data->tot_press[1] = data->kin_press -
+                         ((data->int_press[1] + data->ext_press[1]) /
+                          (box->box_norms[0] * box->box_norms[2] * P_CONV));
 
-    data->tot_press[2] = data->kin_press - 
-        ((data->int_press[2] + data->ext_press[2])/
-         (box->box_norms[0] * box->box_norms[1] * P_CONV));
+    data->tot_press[2] = data->kin_press -
+                         ((data->int_press[2] + data->ext_press[2]) /
+                          (box->box_norms[0] * box->box_norms[1] * P_CONV));
 
     /* Average pressure for the whole box */
-    data->iso_bar.P=(data->tot_press[0]+data->tot_press[1]+data->tot_press[2])/3;
+    data->iso_bar.P = (data->tot_press[0] + data->tot_press[1] + data->tot_press[2]) / 3;
 }
 
 
-void Compute_Pressure_Isotropic_Klein( reax_system* system, 
-        simulation_data* data )
+void Compute_Pressure_Isotropic_Klein( reax_system* system,
+                                       simulation_data* data )
 {
     int i;
     reax_atom *p_atom;
     rvec dx;
 
-    // IMPORTANT: This function assumes that current kinetic energy and 
+    // IMPORTANT: This function assumes that current kinetic energy and
     // the center of mass of the system is already computed before.
     data->iso_bar.P = 2.0 * data->E_Kin;
 
-    for( i = 0; i < system->N; ++i )
+    for ( i = 0; i < system->N; ++i )
     {
         p_atom = &( system->atoms[i] );
-        rvec_ScaledSum(dx,1.0,p_atom->x,-1.0,data->xcm);
+        rvec_ScaledSum(dx, 1.0, p_atom->x, -1.0, data->xcm);
         data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) );
     }
 
     data->iso_bar.P /= (3.0 * system->box.volume);
 
-    // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs 
-    // to be added when there are long-range interactions or long-range 
+    // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs
+    // to be added when there are long-range interactions or long-range
     // corrections to short-range interactions present.
     // We may want to add that for more accuracy.
 }
 
 
-void Compute_Pressure( reax_system* system, simulation_data* data, 
-        static_storage *workspace )
+void Compute_Pressure( reax_system* system, simulation_data* data,
+                       static_storage *workspace )
 {
     int i;
     reax_atom *p_atom;
@@ -333,13 +309,14 @@ void Compute_Pressure( reax_system* system, simulation_data* data,
 
     rtensor_MakeZero( data->flex_bar.P );
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         p_atom = &( system->atoms[i] );
         // Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx );
         rvec_OuterProduct( temp, p_atom->v, p_atom->v );
-        rtensor_ScaledAdd( data->flex_bar.P, 
-                system->reaxprm.sbp[ p_atom->type ].mass, temp );
-        // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); 
+        rtensor_ScaledAdd( data->flex_bar.P,
+                           system->reaxprm.sbp[ p_atom->type ].mass, temp );
+        // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x );
         rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp );
     }
 
diff --git a/PuReMD-GPU/src/system_props.h b/PuReMD-GPU/src/system_props.h
index 874132451d02b2d62d87c82065874f04a35b2d37..e2cc98350167a763ac6acdac5807d710403210bd 100644
--- a/PuReMD-GPU/src/system_props.h
+++ b/PuReMD-GPU/src/system_props.h
@@ -28,10 +28,6 @@
 extern "C"  {
 #endif
 
-real Get_Time( );
-
-real Get_Timing_Info( real );
-
 void Temperature_Control( control_params*, simulation_data*, output_controls* );
 
 void Compute_Total_Mass( reax_system*, simulation_data* );
diff --git a/PuReMD-GPU/src/testmd.c b/PuReMD-GPU/src/testmd.c
index 57d8859df4645f982b93803415cb408c57a564f7..b5204950ee733f30cd69c13d28c5cfd6b2200246 100644
--- a/PuReMD-GPU/src/testmd.c
+++ b/PuReMD-GPU/src/testmd.c
@@ -22,20 +22,23 @@
 
 #include "analyze.h"
 #include "box.h"
+#include "control.h"
+#include "ffield.h"
 #include "forces.h"
+#include "geo_tools.h"
 #include "grid.h"
 #include "init_md.h"
 #include "integrate.h"
 #include "neighbors.h"
-#include "param.h"
-#include "pdb_tools.h"
 #include "print_utils.h"
 #include "reset_utils.h"
 #include "restart.h"
 #include "system_props.h"
 #include "traj.h"
+#include "tool_box.h"
 #include "vector.h"
 
+#ifdef HAVE_CUDA
 #include "cuda_environment.h"
 #include "cuda_forces.h"
 #include "cuda_init_md.h"
@@ -43,9 +46,9 @@
 #include "cuda_post_evolve.h"
 #include "cuda_reset_utils.h"
 #include "cuda_system_props.h"
-
 #ifdef __BUILD_DEBUG__
-  #include "validation.h"
+  #include "cuda_validation.h"
+#endif
 #endif
 
 
@@ -69,9 +72,12 @@ int BLOCKS, BLOCKS_POW_2, BLOCK_SIZE;
 int MATVEC_BLOCKS;
 
 
-void Post_Evolve( reax_system* system, control_params* control, 
-        simulation_data* data, static_storage* workspace, 
-        list** lists, output_controls *out_control )
+static void Post_Evolve( reax_system * const system,
+        control_params * const control,
+        simulation_data * const data,
+        static_storage * const workspace,
+        list ** const lists,
+        output_controls * const out_control )
 {
     int i;
     rvec diff, cross;
@@ -110,15 +116,27 @@ void Post_Evolve( reax_system* system, control_params* control,
 }
 
 
-void Read_System( char *geof, char *ff, char *ctrlf, 
-        reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        output_controls *out_control )
+static void Read_System( char * const geo_file,
+        char * const ffield_file,
+        char * const control_file,
+        reax_system * const system,
+        control_params * const control,
+        simulation_data * const data,
+        static_storage * const workspace,
+        output_controls * const out_control )
 {
     FILE *ffield, *ctrl;
 
-    ffield = fopen( ff, "r" );
-    ctrl = fopen( ctrlf, "r" );
+    if ( (ffield = fopen( ffield_file, "r" )) == NULL )
+    {
+        fprintf( stderr, "Error opening the ffield file!\n" );
+        exit( FILE_NOT_FOUND );
+    }
+    if ( (ctrl = fopen( control_file, "r" )) == NULL )
+    {
+        fprintf( stderr, "Error opening the control file!\n" );
+        exit( FILE_NOT_FOUND );
+    }
 
     /* ffield file */
     Read_Force_Field( ffield, &(system->reaxprm) );
@@ -127,32 +145,31 @@ void Read_System( char *geof, char *ff, char *ctrlf,
     Read_Control_File( ctrl, system, control, out_control );
 
     /* geo file */
-    if( control->geo_format == XYZ )
+    if( control->geo_format == CUSTOM )
     {
-        fprintf( stderr, "xyz input is not implemented yet\n" );
-        exit( 1 );
+        Read_Geo( geo_file, system, control, data, workspace );
     }
     else if( control->geo_format == PDB ) 
     {
-        Read_PDB( geof, system, control, data, workspace );
+        Read_PDB( geo_file, system, control, data, workspace );
     }
     else if( control->geo_format == BGF ) 
     {
-        Read_BGF( geof, system, control, data, workspace );
+        Read_BGF( geo_file, system, control, data, workspace );
     }
     else if( control->geo_format == ASCII_RESTART )
     {
-        Read_ASCII_Restart( geof, system, control, data, workspace );
+        Read_ASCII_Restart( geo_file, system, control, data, workspace );
         control->restart = 1;
     }
     else if( control->geo_format == BINARY_RESTART ) {
-        Read_Binary_Restart( geof, system, control, data, workspace );
+        Read_Binary_Restart( geo_file, system, control, data, workspace );
         control->restart = 1;
     }
     else
     {
         fprintf( stderr, "unknown geo file format. terminating!\n" );
-        exit( 1 );
+        exit( INVALID_GEO );
     }  
 
 #if defined(DEBUG_FOCUS)
@@ -172,7 +189,14 @@ void Init_Data_Structures( simulation_data *data )
 }
 
 
-int main( int argc, char* argv[] )
+/* Print the command-line synopsis to stderr; argv[0] is echoed exactly as invoked. */
+static void usage(char* argv[])
+{
+    fprintf(stderr, "usage: %s geometry ffield control\n", argv[0]);
+}
+
+#ifdef HAVE_CUDA
+static void gpu_main( int argc, char* argv[] )
 {
     reax_system system;
     control_params control;
@@ -183,7 +207,6 @@ int main( int argc, char* argv[] )
     evolve_function Evolve;
     evolve_function Cuda_Evolve;
     int steps;
-
     real t_start, t_elapsed;
     real *results = NULL;
 
@@ -259,8 +282,8 @@ int main( int argc, char* argv[] )
 #ifdef __BUILD_DEBUG__
     if( !validate_device (&system, &data, &workspace, &lists) )
     {
-        fprintf (stderr, " Results does not match between Device and host @ step --> %d \n", data.step);
-        exit (1);
+        fprintf( stderr, " Results does not match between Device and host @ step --> %d \n", data.step );
+        exit( 1 );
     }
 #endif
 
@@ -331,6 +354,92 @@ int main( int argc, char* argv[] )
     fprintf( out_control.log, "total: %.2f secs\n", data.timing.elapsed );
 
     Cleanup_Cuda_Environment( );
+}
+
+
+#else
+/* CPU-only driver: reads the inputs, computes the initial forces, runs the MD loop, writes the final output */
+static void cpu_main( int argc, char* argv[] )
+{
+    reax_system system;
+    control_params control;
+    simulation_data data;
+    static_storage workspace;
+    list *lists;
+    output_controls out_control;
+    evolve_function Evolve;
+    int steps;
+
+    if ( argc != 4 )
+    {
+        usage(argv);
+        exit( INVALID_INPUT );
+    }
+
+    lists = (list*) malloc( sizeof(list) * LIST_N );
+    if ( lists == NULL ) /* every call below dereferences lists, so stop here on allocation failure */
+    {
+        fprintf( stderr, "not enough memory for interaction lists. terminating!\n" );
+        exit( EXIT_FAILURE );
+    }
+
+    Read_System( argv[1], argv[2], argv[3], &system, &control,
+            &data, &workspace, &out_control );
+
+    Initialize( &system, &control, &data, &workspace, &lists,
+            &out_control, &Evolve );
+
+    /* compute f_0 */
+    Reset( &system, &control, &data, &workspace, &lists );
+    Generate_Neighbor_Lists( &system, &control, &data, &workspace,
+            &lists, &out_control );
+    Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control);
+    Compute_Kinetic_Energy( &system, &data );
+    Output_Results(&system, &control, &data, &workspace, &lists, &out_control);
+    ++data.step;
+
+    for ( ; data.step <= control.nsteps; data.step++ )
+    {
+        if ( control.T_mode )
+        {
+            Temperature_Control( &control, &data, &out_control );
+        }
+        Evolve( &system, &control, &data, &workspace, &lists, &out_control );
+        Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
+        Output_Results(&system, &control, &data, &workspace, &lists, &out_control);
+        Analysis( &system, &control, &data, &workspace, &lists, &out_control );
+
+        steps = data.step - data.prev_steps; /* step count relative to data.prev_steps */
+        if ( steps && out_control.restart_freq && steps % out_control.restart_freq == 0 )
+            Write_Restart( &system, &control, &data, &workspace, &out_control );
+    }
+
+    if ( out_control.write_steps > 0 )
+    {
+        fclose( out_control.trj );
+        Write_PDB( &system, &(lists[BONDS]), &data, &control, &workspace, &out_control );
+    }
+
+    data.timing.end = Get_Time( );
+    data.timing.elapsed = Get_Timing_Info( data.timing.start );
+    fprintf( out_control.log, "total: %.2f secs\n", data.timing.elapsed );
+}
+#endif
+
+
+int main( int argc, char* argv[] ) /* entry point: checks the argument count, then runs the GPU or CPU driver */
+{
+    if ( argc != 4 ) /* program name plus the geometry, ffield and control arguments */
+    {
+        usage(argv);
+        exit( INVALID_INPUT );
+    }
+
+#ifdef HAVE_CUDA
+    gpu_main( argc, argv ); /* compiled in only when HAVE_CUDA is defined */
+#else
+    cpu_main( argc, argv ); /* compiled in only when HAVE_CUDA is not defined */
+#endif
 
-    return 0;
+    return SUCCESS;
 }
diff --git a/PuReMD-GPU/src/three_body_interactions.c b/PuReMD-GPU/src/three_body_interactions.c
index 7ac96e057c6c799ba88204f3f6339fe54b3c61da..f128d2a2749ead3d5b9a08e47f45c0538255caac 100644
--- a/PuReMD-GPU/src/three_body_interactions.c
+++ b/PuReMD-GPU/src/three_body_interactions.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -21,59 +22,63 @@
 #include "three_body_interactions.h"
 
 #include "bond_orders.h"
+#include "index_utils.h"
 #include "list.h"
 #include "lookup.h"
 #include "vector.h"
-#include "index_utils.h"
 
 
 /* calculates the theta angle between i-j-k */
-void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
+void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
         real *theta, real *cos_theta )
 {
-    (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk );
-    if( *cos_theta > 1. ) *cos_theta  = 1.0;
-    if( *cos_theta < -1. ) *cos_theta  = -1.0;
+    (*cos_theta) = rvec_Dot( dvec_ji, dvec_jk ) / ( d_ji * d_jk ); /* dot product over the product of d_ji and d_jk; presumably these are the lengths of the two vectors - confirm at call sites */
+    if ( *cos_theta > 1. ) /* clamp to at most 1 before ACOS is applied below */
+    {
+        *cos_theta  = 1.0;
+    }
+    if ( *cos_theta < -1. ) /* clamp to at least -1 before ACOS is applied below */
+    {
+        *cos_theta  = -1.0;
+    }
 
     (*theta) = ACOS( *cos_theta );
 }
 
 
 /* calculates the derivative of the cosine of the angle between i-j-k */
-void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
-        rvec* dcos_theta_di, rvec* dcos_theta_dj, 
-        rvec* dcos_theta_dk )
+void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
+        rvec* dcos_theta_di, rvec* dcos_theta_dj, rvec* dcos_theta_dk ) /* the three rvec pointers are outputs, filled component by component in the loop below */
 {
     int  t;
     real sqr_d_ji   = SQR(d_ji);
     real sqr_d_jk   = SQR(d_jk);
     real inv_dists  = 1.0 / (d_ji * d_jk);
     real inv_dists3 = POW( inv_dists, 3 );
-    real dot_dvecs  = Dot( dvec_ji, dvec_jk, 3 );
+    real dot_dvecs  = rvec_Dot( dvec_ji, dvec_jk ); /* dot product of the two input vectors */
     real Cdot_inv3  = dot_dvecs * inv_dists3;
 
-    for( t = 0; t < 3; ++t ) {
-        (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - 
-            Cdot_inv3 * sqr_d_jk * dvec_ji[t];
+    for ( t = 0; t < 3; ++t ) /* one pass per vector component (indices 0..2) */
+    {
+        (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - Cdot_inv3 * sqr_d_jk * dvec_ji[t];
 
-        (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists +
-            Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] );
+        (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists
+            + Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] ); /* equals -(di[t] + dk[t]): the three outputs sum to zero per component */
 
-        (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - 
-            Cdot_inv3 * sqr_d_ji * dvec_jk[t];
+        (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - Cdot_inv3 * sqr_d_ji * dvec_jk[t]; /* same form as the di term with the roles of ji and jk swapped */
     }
 
-    /*fprintf( stderr, 
+    /*fprintf( stderr,
       "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
       dvec_jk[t] * inv_dists*/
 }
 
 
-/* this is a 3-body interaction in which the main role is 
+/* this is a 3-body interaction in which the main role is
    played by j which sits in the middle of the other two. */
-void Three_Body_Interactions( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists, output_controls *out_control )
+void Three_Body_Interactions( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
     int  i, j, pi, k, pk, t;
     int  type_i, type_j, type_k;
@@ -123,7 +128,8 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
     p_val10 = system->reaxprm.gp.l[17];
     num_thb_intrs = 0;
 
-    for( j = 0; j < system->N; ++j ) {
+    for ( j = 0; j < system->N; ++j )
+    {
         // fprintf( out_control->eval, "j: %d\n", j );
         type_j = system->atoms[j].type;
         start_j = Start_Index(j, bonds);
@@ -133,21 +139,24 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
         p_val5 = system->reaxprm.sbp[ type_j ].p_val5;
 
         SBOp = 0, prod_SBO = 1;
-        for( t = start_j; t < end_j; ++t ) {
+        for ( t = start_j; t < end_j; ++t )
+        {
             bo_jt = &(bond_list[t].bo_data);
             SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
             temp = SQR( bo_jt->BO );
-            temp *= temp; 
+            temp *= temp;
             temp *= temp;
             prod_SBO *= EXP( -temp );
         }
 
         /* modifications to match Adri's code - 09/01/09 */
-        if( workspace->vlpex[j] >= 0 ){
+        if ( workspace->vlpex[j] >= 0 )
+        {
             vlpadj = 0;
             dSBO2 = prod_SBO - 1;
         }
-        else{
+        else
+        {
             vlpadj = workspace->nlp[j];
             dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
         }
@@ -155,65 +164,67 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
         SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
         dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
 
-        if( SBO <= 0 )
+        if ( SBO <= 0 )
             SBO2 = 0, CSBO2 = 0;
-        else if( SBO > 0 && SBO <= 1 ) {
+        else if ( SBO > 0 && SBO <= 1 )
+        {
             SBO2 = POW( SBO, p_val9 );
             CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
         }
-        else if( SBO > 1 && SBO < 2 ) {
-            SBO2 = 2 - POW( 2-SBO, p_val9 );
+        else if ( SBO > 1 && SBO < 2 )
+        {
+            SBO2 = 2 - POW( 2 - SBO, p_val9 );
             CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
         }
-        else 
-            SBO2 = 2, CSBO2 = 0;  
+        else
+            SBO2 = 2, CSBO2 = 0;
 
         expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
 
-        /* unlike 2-body intrs where we enforce i<j, we cannot put any such 
-           restrictions here. such a restriction would prevent us from producing 
+        /* unlike 2-body intrs where we enforce i<j, we cannot put any such
+           restrictions here. such a restriction would prevent us from producing
            all 4-body intrs correctly */
-        for( pi = start_j; pi < end_j; ++pi ) {
+        for ( pi = start_j; pi < end_j; ++pi )
+        {
             Set_Start_Index( pi, num_thb_intrs, thb_intrs );
-
             pbond_ij = &(bond_list[pi]);
             bo_ij = &(pbond_ij->bo_data);
             BOA_ij = bo_ij->BO - control->thb_cut;
 
 
-            if( BOA_ij/*bo_ij->BO*/ > (real) 0.0 ) {
+            if ( BOA_ij/*bo_ij->BO*/ > 0.0 )
+            {
                 i = pbond_ij->nbr;
-                r_ij = pbond_ij->d;     
+                r_ij = pbond_ij->d;
                 type_i = system->atoms[i].type;
                 // fprintf( out_control->eval, "i: %d\n", i );
 
 
                 /* first copy 3-body intrs from previously computed ones where i>k.
-                   IMPORTANT: if it is less costly to compute theta and its 
-                   derivative, we should definitely re-compute them, 
+                   IMPORTANT: if it is less costly to compute theta and its
+                   derivative, we should definitely re-compute them,
                    instead of copying!
-                   in the second for-loop below, we compute only new 3-body intrs 
+                   in the second for-loop below, we compute only new 3-body intrs
                    where i < k */
-                for( pk = start_j; pk < pi; ++pk ) {
+                for ( pk = start_j; pk < pi; ++pk )
+                {
                     // fprintf( out_control->eval, "pk: %d\n", pk );
                     start_pk = Start_Index( pk, thb_intrs );
                     end_pk = End_Index( pk, thb_intrs );
 
-                    for( t = start_pk; t < end_pk; ++t )
-                        if( thb_list[t].thb == i ) {
+                    for ( t = start_pk; t < end_pk; ++t )
+                        if ( thb_list[t].thb == i )
+                        {
                             p_ijk = &(thb_list[num_thb_intrs]);
                             p_kji = &(thb_list[t]);
 
                             p_ijk->thb = bond_list[pk].nbr;
                             p_ijk->pthb  = pk;
-                            p_ijk->theta = p_kji->theta;              
+                            p_ijk->theta = p_kji->theta;
                             rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
                             rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
                             rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
 
-                            //if (j == 12)
-                            //fprintf (stderr, "Adding one for matched atom %d \n", i);
-
                             ++num_thb_intrs;
                             break;
                         }
@@ -221,7 +232,8 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
 
 
                 /* and this is the second for loop mentioned above */
-                for( pk = pi+1; pk < end_j; ++pk ) {
+                for ( pk = pi + 1; pk < end_j; ++pk )
+                {
                     pbond_jk = &(bond_list[pk]);
                     bo_jk    = &(pbond_jk->bo_data);
                     BOA_jk   = bo_jk->BO - control->thb_cut;
@@ -229,53 +241,55 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
                     type_k   = system->atoms[k].type;
                     p_ijk    = &( thb_list[num_thb_intrs] );
 
-                    //TODO - CHANGE ORIGINAL
+                    //CHANGE ORIGINAL
                     if (BOA_jk <= 0) continue;
+                    //CHANGE ORIGINAL
 
-                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
-                            pbond_jk->dvec, pbond_jk->d,
-                            &theta, &cos_theta );
 
-                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
-                            pbond_jk->dvec, pbond_jk->d, 
-                            &(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
-                            &(p_ijk->dcos_dk) );
+                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d,
+                                     pbond_jk->dvec, pbond_jk->d,
+                                     &theta, &cos_theta );
+
+                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d,
+                                          pbond_jk->dvec, pbond_jk->d,
+                                          &(p_ijk->dcos_di), &(p_ijk->dcos_dj),
+                                          &(p_ijk->dcos_dk) );
 
                     p_ijk->thb = k;
                     p_ijk->pthb = pk;
                     p_ijk->theta = theta;
 
-                    //if (j == 12)
-                    //fprintf (stderr, "Adding one for the rest %d \n", k);
-
                     sin_theta = SIN( theta );
-                    if( sin_theta < 1.0e-5 )
+                    if ( sin_theta < 1.0e-5 )
                         sin_theta = 1.0e-5;
 
                     ++num_thb_intrs;
 
 
-                    if( BOA_jk > 0.0 && 
-                            (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) {
-                        r_jk = pbond_jk->d;              
+                    if ( BOA_jk > 0.0 &&
+                            (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/)
+                    {
+                        r_jk = pbond_jk->d;
                         thbh = &( system->reaxprm.thbp[ index_thbp(type_i,type_j,type_k,system->reaxprm.num_atom_types) ] );
                         flag = 0;
 
                         /* if( workspace->orig_id[i] < workspace->orig_id[k] )
-                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n",
                            workspace->orig_id[i], workspace->orig_id[j],
                            workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta );
-                           else 
-                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                           else
+                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n",
                            workspace->orig_id[k], workspace->orig_id[j],
                            workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
 
 
-                        for( cnt = 0; cnt < thbh->cnt; ++cnt ) {
-                            // fprintf( out_control->eval, 
+                        for ( cnt = 0; cnt < thbh->cnt; ++cnt )
+                        {
+                            // fprintf( out_control->eval,
                             // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 );
 
-                            if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
+                            if ( fabs(thbh->prm[cnt].p_val1) > 0.001 )
+                            {
                                 thbp = &( thbh->prm[cnt] );
 
                                 /* ANGLE ENERGY */
@@ -287,27 +301,27 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
 
                                 exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
                                 f7_ij = 1.0 - exp3ij;
-                                Cf7ij = p_val3 * p_val4 * 
-                                    POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
+                                Cf7ij = p_val3 * p_val4 *
+                                        POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
 
                                 exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
                                 f7_jk = 1.0 - exp3jk;
-                                Cf7jk = p_val3 * p_val4 * 
-                                    POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
+                                Cf7jk = p_val3 * p_val4 *
+                                        POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
 
                                 expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
                                 trm8 = 1.0 + expval6 + expval7;
                                 f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
                                 Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
-                                    (p_val6 * expval6 * trm8 - 
-                                     (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
+                                       (p_val6 * expval6 * trm8 -
+                                        (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
 
-                                theta_0 = 180.0 - 
-                                    theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
-                                theta_0 = DEG2RAD( theta_0 );              
+                                theta_0 = 180.0 -
+                                          theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
+                                theta_0 = DEG2RAD( theta_0 );
 
-                                expval2theta  = EXP(-p_val2 * SQR(theta_0-theta));
-                                if( p_val1 >= 0 )
+                                expval2theta  = EXP(-p_val2 * SQR(theta_0 - theta));
+                                if ( p_val1 >= 0 )
                                     expval12theta = p_val1 * (1.0 - expval2theta);
                                 else // To avoid linear Me-H-Me angles (6/6/06)
                                     expval12theta = p_val1 * -expval2theta;
@@ -315,11 +329,11 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
                                 CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
                                 CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
                                 CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
-                                CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * 
-                                    expval2theta * (theta_0 - theta);
+                                CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj *
+                                         expval2theta * (theta_0 - theta);
 
-                                Ctheta_0 = p_val10 * DEG2RAD(theta_00) * 
-                                    exp( -p_val10 * (2.0 - SBO2) );
+                                Ctheta_0 = p_val10 * DEG2RAD(theta_00) *
+                                           exp( -p_val10 * (2.0 - SBO2) );
 
                                 CEval5 = -CEval4 * Ctheta_0 * CSBO2;
                                 CEval6 = CEval5 * dSBO1;
@@ -342,13 +356,13 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
                                 exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
                                 trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
                                 f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
-                                Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - 
+                                Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 -
                                         (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 +
-                                            p_pen4 * exp_pen4 )) /
-                                    SQR( trm_pen34 );
+                                                             p_pen4 * exp_pen4 )) /
+                                       SQR( trm_pen34 );
 
-                                data->E_Pen += e_pen = 
-                                    p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
+                                data->E_Pen += e_pen =
+                                                   p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
 
                                 CEpen1 = e_pen * Cf9j / f9_Dj;
                                 temp   = -2.0 * p_pen2 * e_pen;
@@ -364,66 +378,64 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
                                 p_coa4 = system->reaxprm.gp.l[30];
 
                                 exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
-                                data->E_Coa += e_coa = 
-                                    p_coa1 / (1. + exp_coa2) *
-                                    EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * 
-                                    EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * 
-                                    EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * 
-                                    EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
+                                data->E_Coa += e_coa =
+                                                   p_coa1 / (1. + exp_coa2) *
+                                                   EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) *
+                                                   EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) *
+                                                   EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) *
+                                                   EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
 
                                 CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
                                 CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
-                                CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2);
-                                CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa;
-                                CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa;
+                                CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2);
+                                CEcoa4 = -2 * p_coa3 * (total_bo[i] - BOA_ij) * e_coa;
+                                CEcoa5 = -2 * p_coa3 * (total_bo[k] - BOA_jk) * e_coa;
                                 /* END COALITION ENERGY */
 
                                 /* FORCES */
-                                bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4));
-                                bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5));
-                                workspace->CdDelta[j] += ((CEval3 + CEval7) + 
-                                        CEpen1 + CEcoa3);
+                                bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
+                                bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
+                                workspace->CdDelta[j] += ((CEval3 + CEval7) +
+                                                          CEpen1 + CEcoa3);
                                 workspace->CdDelta[i] += CEcoa4;
-                                workspace->CdDelta[k] += CEcoa5;              
+                                workspace->CdDelta[k] += CEcoa5;
 
-                                for( t = start_j; t < end_j; ++t ) {
+                                for ( t = start_j; t < end_j; ++t )
+                                {
                                     pbond_jt = &( bond_list[t] );
                                     bo_jt = &(pbond_jt->bo_data);
                                     temp_bo_jt = bo_jt->BO;
                                     temp = CUBE( temp_bo_jt );
-                                    pBOjt7 = temp * temp * temp_bo_jt; 
+                                    pBOjt7 = temp * temp * temp_bo_jt;
 
-                                    // fprintf( out_control->eval, "%6d%12.8f\n", 
-                                    // workspace->orig_id[ bond_list[t].nbr ], 
+                                    // fprintf( out_control->eval, "%6d%12.8f\n",
+                                    // workspace->orig_id[ bond_list[t].nbr ],
                                     //    (CEval6 * pBOjt7) );
 
                                     bo_jt->Cdbo += (CEval6 * pBOjt7);
                                     bo_jt->Cdbopi += CEval5;
                                     bo_jt->Cdbopi2 += CEval5;
-                                }              
-
+                                }
 
-                                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
 
+                                if ( control->ensemble == NVE || control->ensemble == NVT  || control->ensemble == bNVT)
+                                {
                                     rvec_ScaledAdd( system->atoms[i].f, CEval8, p_ijk->dcos_di );
                                     rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj );
                                     rvec_ScaledAdd( system->atoms[k].f, CEval8, p_ijk->dcos_dk );
-
-                                    /*
-                                       if (i == 0) fprintf (stderr, " atom %d adding to i (j) = 0\n", j);
-                                       if (k == 0) fprintf (stderr, " atom %d adding to i (k) = 0\n", j);
-                                     */
                                 }
-                                else {
+                                else
+                                {
                                     /* terms not related to bond order derivatives
-                                       are added directly into 
+                                       are added directly into
                                        forces and pressure vector/tensor */
                                     rvec_Scale( force, CEval8, p_ijk->dcos_di );
                                     rvec_Add( system->atoms[i].f, force );
                                     rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
                                     rvec_Add( data->ext_press, ext_press );
 
-                                    rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj );
+                                    rvec_ScaledAdd( system->atoms[j].f,
+                                                    CEval8, p_ijk->dcos_dj );
 
                                     rvec_Scale( force, CEval8, p_ijk->dcos_dk );
                                     rvec_Add( system->atoms[k].f, force );
@@ -432,96 +444,97 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
 
 
                                     /* This part is for a fully-flexible box */
-                                    /* rvec_OuterProduct( temp_rtensor, 
+                                    /* rvec_OuterProduct( temp_rtensor,
                                        p_ijk->dcos_di, system->atoms[i].x );
                                        rtensor_Scale( total_rtensor, +CEval8, temp_rtensor );
 
-                                       rvec_OuterProduct( temp_rtensor, 
+                                       rvec_OuterProduct( temp_rtensor,
                                        p_ijk->dcos_dj, system->atoms[j].x );
                                        rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
 
-                                       rvec_OuterProduct( temp_rtensor, 
+                                       rvec_OuterProduct( temp_rtensor,
                                        p_ijk->dcos_dk, system->atoms[k].x );
                                        rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
 
                                        if( pbond_ij->imaginary || pbond_jk->imaginary )
-                                       rtensor_ScaledAdd( data->flex_bar.P, 
+                                       rtensor_ScaledAdd( data->flex_bar.P,
                                        -1.0, total_rtensor );
                                        else
                                        rtensor_Add( data->flex_bar.P, total_rtensor ); */
                                 }
 
 #ifdef TEST_ENERGY
-                                fprintf( out_control->eval, 
-                                        //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
-                                        "%6d%6d%6d%23.15e%23.15e%23.15e\n",
-                                        i+1, j+1, k+1,
-                                        //workspace->orig_id[i]+1,  
-                                        //workspace->orig_id[j]+1,
-                                        //workspace->orig_id[k]+1,
-                                        //workspace->Delta_boc[j], 
-                                        RAD2DEG(theta), /*BOA_ij, BOA_jk, */
-                                        e_ang, data->E_Ang );
-
-                                /*fprintf( out_control->eval, 
+                                fprintf( out_control->eval,
+                                         //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
+                                         "%6d%6d%6d%23.15e%23.15e%23.15e\n",
+                                         i + 1, j + 1, k + 1,
+                                         //workspace->orig_id[i]+1,
+                                         //workspace->orig_id[j]+1,
+                                         //workspace->orig_id[k]+1,
+                                         //workspace->Delta_boc[j],
+                                         RAD2DEG(theta), /*BOA_ij, BOA_jk, */
+                                         e_ang, data->E_Ang );
+
+                                /*fprintf( out_control->eval,
                                   "%23.15e%23.15e%23.15e%23.15e",
                                   p_val3, p_val4, BOA_ij, BOA_jk );
-                                  fprintf( out_control->eval, 
+                                  fprintf( out_control->eval,
                                   "%23.15e%23.15e%23.15e%23.15e",
                                   f7_ij, f7_jk, f8_Dj, expval12theta );
-                                  fprintf( out_control->eval, 
+                                  fprintf( out_control->eval,
                                   "%23.15e%23.15e%23.15e%23.15e%23.15e\n",
                                   CEval1, CEval2, CEval3, CEval4, CEval5
-                                //CEval6, CEval7, CEval8  );*/
+                                  //CEval6, CEval7, CEval8  );*/
 
-                                /*fprintf( out_control->eval, 
+                                /*fprintf( out_control->eval,
                                   "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                  -p_ijk->dcos_di[0]/sin_theta, 
-                                  -p_ijk->dcos_di[1]/sin_theta, 
-                                  -p_ijk->dcos_di[2]/sin_theta, 
-                                  -p_ijk->dcos_dj[0]/sin_theta, 
-                                  -p_ijk->dcos_dj[1]/sin_theta, 
-                                  -p_ijk->dcos_dj[2]/sin_theta, 
-                                  -p_ijk->dcos_dk[0]/sin_theta, 
-                                  -p_ijk->dcos_dk[1]/sin_theta, 
+                                  -p_ijk->dcos_di[0]/sin_theta,
+                                  -p_ijk->dcos_di[1]/sin_theta,
+                                  -p_ijk->dcos_di[2]/sin_theta,
+                                  -p_ijk->dcos_dj[0]/sin_theta,
+                                  -p_ijk->dcos_dj[1]/sin_theta,
+                                  -p_ijk->dcos_dj[2]/sin_theta,
+                                  -p_ijk->dcos_dk[0]/sin_theta,
+                                  -p_ijk->dcos_dk[1]/sin_theta,
                                   -p_ijk->dcos_dk[2]/sin_theta );*/
 
-                                /* fprintf( out_control->epen, 
-                                   "%23.15e%23.15e%23.15e\n", 
+                                /* fprintf( out_control->epen,
+                                   "%23.15e%23.15e%23.15e\n",
                                    CEpen1, CEpen2, CEpen3 );
-                                   fprintf( out_control->epen, 
+                                   fprintf( out_control->epen,
                                    "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
                                    workspace->orig_id[i],  workspace->orig_id[j],
-                                   workspace->orig_id[k], RAD2DEG(theta), 
+                                   workspace->orig_id[k], RAD2DEG(theta),
                                    BOA_ij, BOA_jk, e_pen, data->E_Pen ); */
 
-                                fprintf( out_control->ecoa, 
-                                        "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                        workspace->orig_id[i], 
-                                        workspace->orig_id[j],
-                                        workspace->orig_id[k], 
-                                        RAD2DEG(theta), BOA_ij, BOA_jk, 
-                                        e_coa, data->E_Coa );
+                                fprintf( out_control->ecoa,
+                                         "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                         workspace->orig_id[i],
+                                         workspace->orig_id[j],
+                                         workspace->orig_id[k],
+                                         RAD2DEG(theta), BOA_ij, BOA_jk,
+                                         e_coa, data->E_Coa );
 #endif
 
 #ifdef TEST_FORCES            /* angle forces */
                                 Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
                                 Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
-                                Add_dDelta( system, lists, 
-                                        j, CEval3 + CEval7, workspace->f_ang );
+                                Add_dDelta( system, lists,
+                                            j, CEval3 + CEval7, workspace->f_ang );
 
-                                for( t = start_j; t < end_j; ++t ) {
+                                for ( t = start_j; t < end_j; ++t )
+                                {
                                     pbond_jt = &( bond_list[t] );
                                     bo_jt = &(pbond_jt->bo_data);
                                     temp_bo_jt = bo_jt->BO;
                                     temp = CUBE( temp_bo_jt );
-                                    pBOjt7 = temp * temp * temp_bo_jt; 
+                                    pBOjt7 = temp * temp * temp_bo_jt;
 
                                     Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
-                                            workspace->f_ang );
-                                    Add_dBOpinpi2( system, lists, j, t, 
-                                            CEval5, CEval5, 
-                                            workspace->f_ang, workspace->f_ang );
+                                             workspace->f_ang );
+                                    Add_dBOpinpi2( system, lists, j, t,
+                                                   CEval5, CEval5,
+                                                   workspace->f_ang, workspace->f_ang );
                                 }
 
                                 rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
@@ -536,10 +549,10 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
                                 /* end penalty forces */
 
                                 /* coalition forces */
-                                Add_dBO( system, lists, 
-                                        j, pi, CEcoa1-CEcoa4, workspace->f_coa );
-                                Add_dBO( system, lists, 
-                                        j, pk, CEcoa2-CEcoa5, workspace->f_coa );
+                                Add_dBO( system, lists,
+                                         j, pi, CEcoa1 - CEcoa4, workspace->f_coa );
+                                Add_dBO( system, lists,
+                                         j, pk, CEcoa2 - CEcoa5, workspace->f_coa );
                                 Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
                                 Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
                                 Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
@@ -555,32 +568,36 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
         }
     }
 
-    if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) {
+
+    if ( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE )
+    {
         workspace->realloc.num_3body = num_thb_intrs;
-        if( num_thb_intrs > thb_intrs->num_intrs ) {
+        if ( num_thb_intrs > thb_intrs->num_intrs )
+        {
             fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d",
-                    data->step, num_thb_intrs, thb_intrs->num_intrs );
-            exit( INSUFFICIENT_SPACE );
+                     data->step, num_thb_intrs, thb_intrs->num_intrs );
+            exit( INSUFFICIENT_MEMORY );
         }
     }
 
-    //fprintf( stderr,"%d: Number of angle interactions: %d\n", 
+    //fprintf( stderr,"%d: Number of angle interactions: %d\n",
     // data->step, num_thb_intrs );
 #ifdef TEST_ENERGY
-    fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs );
+    fprintf( stderr, "Number of angle interactions: %d\n", num_thb_intrs );
 
-    fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n",
-            data->E_Ang, data->E_Pen, data->E_Coa );
+    fprintf( stderr, "Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n",
+             data->E_Ang, data->E_Pen, data->E_Coa );
 
-    fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", 
-            data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+    fprintf( stderr, "3body: ext_press (%23.15e %23.15e %23.15e)\n",
+             data->ext_press[0], data->ext_press[1], data->ext_press[2] );
 #endif
 }
 
 
-void Hydrogen_Bonds( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+
+void Hydrogen_Bonds( reax_system *system, control_params *control,
+                     simulation_data *data, static_storage *workspace,
+                     list **lists, output_controls *out_control )
 {
     int i, j, k, pi, pk, itr, top;
     int type_i, type_j, type_k;
@@ -610,10 +627,11 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
     /* loops below discover the Hydrogen bonds between i-j-k triplets.
        here j is H atom and there has to be some bond between i and j.
        Hydrogen bond is between j and k.
-       so in this function i->X, j->H, k->Z when we map 
+       so in this function i->X, j->H, k->Z when we map
        variables onto the ones in the handout.*/
-    for( j = 0; j < system->N; ++j )
-        if( system->reaxprm.sbp[system->atoms[j].type].p_hbond==1 ) {// j must be H
+    for ( j = 0; j < system->N; ++j )
+        if ( system->reaxprm.sbp[system->atoms[j].type].p_hbond == 1 ) // j must be H
+        {
             /*set j's variables */
             type_j  = system->atoms[j].type;
             start_j = Start_Index(j, bonds);
@@ -622,21 +640,23 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
             hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
 
             top = 0;
-            for( pi = start_j; pi < end_j; ++pi ) {
+            for ( pi = start_j; pi < end_j; ++pi )
+            {
                 pbond_ij = &( bond_list[pi] );
                 i = pbond_ij->nbr;
                 bo_ij = &(pbond_ij->bo_data);
                 type_i = system->atoms[i].type;
 
-                if( system->reaxprm.sbp[type_i].p_hbond == 2 && 
+                if ( system->reaxprm.sbp[type_i].p_hbond == 2 &&
                         bo_ij->BO >= HB_THRESHOLD )
                     hblist[top++] = pi;
             }
 
-            // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
+            // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n",
             //          j, top, hb_start_j, hb_end_j );
 
-            for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
+            for ( pk = hb_start_j; pk < hb_end_j; ++pk )
+            {
                 /* set k's varibles */
                 k = hbond_list[pk].nbr;
                 type_k = system->atoms[k].type;
@@ -644,69 +664,59 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
                 r_jk = nbr_jk->d;
                 rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
 
-                for( itr=0; itr < top; ++itr ) {
+                for ( itr = 0; itr < top; ++itr )
+                {
                     pi = hblist[itr];
                     pbond_ij = &( bond_list[pi] );
                     i = pbond_ij->nbr;
 
-                    if( i != k ) {
+                    if ( i != k )
+                    {
                         bo_ij = &(pbond_ij->bo_data);
                         type_i = system->atoms[i].type;
-                        r_ij = pbond_ij->d;         
+                        r_ij = pbond_ij->d;
                         hbp = &(system->reaxprm.hbp[ index_hbp(type_i, type_j, type_k, system->reaxprm.num_atom_types) ]);
                         ++num_hb_intrs;
 
-                        Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                &theta, &cos_theta );
+                        Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, &theta, &cos_theta );
                         /* the derivative of cos(theta) */
                         Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                &dcos_theta_di, &dcos_theta_dj, 
-                                &dcos_theta_dk );
+                                &dcos_theta_di, &dcos_theta_dj, &dcos_theta_dk );
 
                         /* hydrogen bond energy*/
-                        sin_theta2 = SIN( theta/2.0 );
+                        sin_theta2 = SIN( theta / 2.0 );
                         sin_xhz4 = SQR(sin_theta2);
                         sin_xhz4 *= sin_xhz4;
                         cos_xhz1 = ( 1.0 - cos_theta );
                         exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
-                        exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
-                                    r_jk / hbp->r0_hb - 2.0 ) );
+                        exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk
+                                    + r_jk / hbp->r0_hb - 2.0 ) );
 
-                        data->E_HB += e_hb = 
-                            hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+                        data->E_HB += e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
 
-                        CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
-                        CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
-                        CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
-                                1.0 / hbp->r0_hb);
+                        CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
+                        CEhb2 = -hbp->p_hb1 / 2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
+                        CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb);
 
                         /* hydrogen bond forces */
                         bo_ij->Cdbo += CEhb1;   // dbo term
 
-                        if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
-                            rvec_ScaledAdd( system->atoms[i].f, 
-                                    +CEhb2, dcos_theta_di ); //dcos terms
-                            rvec_ScaledAdd( system->atoms[j].f, 
-                                    +CEhb2, dcos_theta_dj );
-
-
-
-
-                            //TODO
-                            rvec_ScaledAdd( system->atoms[k].f, 
-                                    +CEhb2, dcos_theta_dk );
-
+                        if ( control->ensemble == NVE || control->ensemble == NVT  || control->ensemble == bNVT)
+                        {
+                            rvec_ScaledAdd( system->atoms[i].f,
+                                            +CEhb2, dcos_theta_di ); //dcos terms
+                            rvec_ScaledAdd( system->atoms[j].f,
+                                            +CEhb2, dcos_theta_dj );
+                            rvec_ScaledAdd( system->atoms[k].f,
+                                            +CEhb2, dcos_theta_dk );
                             //dr terms
-                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
-
-
-                            //TODO
-                            rvec_ScaledAdd( system->atoms[k].f, +CEhb3/r_jk, dvec_jk );
+                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3 / r_jk, dvec_jk );
+                            rvec_ScaledAdd( system->atoms[k].f, +CEhb3 / r_jk, dvec_jk );
                         }
                         else
                         {
-                            /* for pressure coupling, terms that are not related 
-                               to bond order derivatives are added directly into 
+                            /* for pressure coupling, terms that are not related
+                               to bond order derivatives are added directly into
                                pressure vector/tensor */
                             rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
                             rvec_Add( system->atoms[i].f, force );
@@ -717,39 +727,32 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
 
                             ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
                             rvec_Scale( force, +CEhb2, dcos_theta_dk );
-
-
-
-                            //TODO
                             rvec_Add( system->atoms[k].f, force );
-
-
-
                             rvec_iMultiply( ext_press, rel_jk, force );
                             rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
 
                             //dr terms
-                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
+                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3 / r_jk, dvec_jk );
 
-                            rvec_Scale( force, CEhb3/r_jk, dvec_jk );
+                            rvec_Scale( force, CEhb3 / r_jk, dvec_jk );
                             rvec_Add( system->atoms[k].f, force );
                             rvec_iMultiply( ext_press, rel_jk, force );
                             rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
 
                             /* This part is intended for a fully-flexible box */
-                            /* rvec_OuterProduct( temp_rtensor, 
+                            /* rvec_OuterProduct( temp_rtensor,
                                dcos_theta_di, system->atoms[i].x );
                                rtensor_Scale( total_rtensor, -CEhb2, temp_rtensor );
 
                                rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dj,
                                -CEhb3/r_jk, pbond_jk->dvec );
-                               rvec_OuterProduct( temp_rtensor, 
+                               rvec_OuterProduct( temp_rtensor,
                                temp_rvec, system->atoms[j].x );
                                rtensor_Add( total_rtensor, temp_rtensor );
 
                                rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dk,
                                +CEhb3/r_jk, pbond_jk->dvec );
-                               rvec_OuterProduct( temp_rtensor, 
+                               rvec_OuterProduct( temp_rtensor,
                                temp_rvec, system->atoms[k].x );
                                rtensor_Add( total_rtensor, temp_rtensor );
 
@@ -760,38 +763,38 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
                         }
 
 #ifdef TEST_ENERGY
-                        /*fprintf( out_control->ehb, 
+                        /*fprintf( out_control->ehb,
                           "%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n",
-                          dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], 
-                          dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], 
+                          dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2],
+                          dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2],
                           dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]);
                           fprintf( out_control->ehb, "%23.15e%23.15e%23.15e\n",
                           CEhb1, CEhb2, CEhb3 ); */
-                        fprintf( stderr, //out_control->ehb, 
-                                "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                workspace->orig_id[i], 
-                                workspace->orig_id[j], 
-                                workspace->orig_id[k], 
-                                r_jk, theta, bo_ij->BO, e_hb, data->E_HB );
+                        fprintf( stderr, //out_control->ehb,
+                                 "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                 workspace->orig_id[i],
+                                 workspace->orig_id[j],
+                                 workspace->orig_id[k],
+                                 r_jk, theta, bo_ij->BO, e_hb, data->E_HB );
 
 #endif
 #ifdef TEST_FORCES
                         // dbo term
                         Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb );
                         // dcos terms
-                        rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di ); 
+                        rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di );
                         rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj );
                         rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk );
                         // dr terms
-                        rvec_ScaledAdd( workspace->f_hb[j], -CEhb3/r_jk, dvec_jk );
-                        rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk );
+                        rvec_ScaledAdd( workspace->f_hb[j], -CEhb3 / r_jk, dvec_jk );
+                        rvec_ScaledAdd( workspace->f_hb[k], +CEhb3 / r_jk, dvec_jk );
 #endif
                     }
                 }
             }
         }
 
-    /* fprintf( stderr, "hydbonds: ext_press (%23.15e %23.15e %23.15e)\n", 
+    /* fprintf( stderr, "hydbonds: ext_press (%23.15e %23.15e %23.15e)\n",
        data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
 
 #ifdef TEST_FORCES
diff --git a/PuReMD-GPU/src/tool_box.c b/PuReMD-GPU/src/tool_box.c
new file mode 100644
index 0000000000000000000000000000000000000000..1782e71cdac028ddb5c0947ef371cf9259bfad2b
--- /dev/null
+++ b/PuReMD-GPU/src/tool_box.c
@@ -0,0 +1,467 @@
+/*----------------------------------------------------------------------
+  SerialReax - Reax Force Field Simulator
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "tool_box.h"
+
+#include <ctype.h>
+
+
+/************** taken from box.c **************/
+void Transform( rvec x1, simulation_box *box, char flag, rvec x2 )
+{
+    /* Matrix-vector product over 3 components: x2 = M * x1, where M is
+     * box->trans when flag > 0 and box->trans_inv otherwise. Each x2[row]
+     * is stored only once its full dot product has been accumulated. */
+    int row, col;
+    real acc;
+
+    if ( flag <= 0 ) /* non-positive flag selects box->trans_inv */
+    {
+        for ( row = 0; row < 3; ++row )
+        {
+            acc = 0.0;
+            for ( col = 0; col < 3; ++col )
+                acc += box->trans_inv[row][col] * x1[col];
+            x2[row] = acc;
+        }
+    }
+    else /* positive flag selects box->trans */
+    {
+        for ( row = 0; row < 3; ++row )
+        {
+            acc = 0.0;
+            for ( col = 0; col < 3; ++col )
+                acc += box->trans[row][col] * x1[col];
+            x2[row] = acc;
+        }
+    }
+}
+
+
+void Transform_to_UnitBox( rvec x1, simulation_box *box, char flag, rvec x2 )
+{
+    int d; /* component index */
+
+    Transform( x1, box, flag, x2 ); /* x2 receives the transformed x1 */
+    for ( d = 0; d < 3; ++d ) /* divide each component by the matching box->box_norms entry */
+        x2[d] /= box->box_norms[d];
+}
+
+
+/* Wrap point p into the periodic box: for each of the 3 components,
+ * box->box_norms[i] is added or subtracted until the coordinate satisfies
+ * 0 <= (*p)[i] < box->box_norms[i].
+ *
+ * box: simulation box; only box->box_norms is read
+ * p:   pointer to the point, modified in place
+ *
+ * The two loops run back to back on purpose. Shifting a tiny negative
+ * coordinate up by box_norms[i] can round to exactly box_norms[i], which is
+ * outside the half-open interval, so the upper-bound loop must also see the
+ * shifted value (an if / else-if arrangement would skip it).
+ * NOTE: the loops do not terminate when box_norms[i] <= 0 or when the
+ * coordinate is infinite. */
+void Fit_to_Periodic_Box( simulation_box *box, rvec *p )
+{
+    int i;
+
+    for ( i = 0; i < 3; ++i )
+    {
+        //TODO: verify box boundary coordinates -- assuming orthogonal box pinned at origin
+        //      (earlier disabled variant compared against box->min[i] / box->max[i])
+        /* handle lower coords */
+        while ( (*p)[i] < 0. )
+        {
+            (*p)[i] += box->box_norms[i];
+        }
+
+        /* handle higher coords, including a lower coord rounded up onto the bound */
+        while ( (*p)[i] >= box->box_norms[i] )
+        {
+            (*p)[i] -= box->box_norms[i];
+        }
+    }
+}
+
+
+/* determine the touch point, tp, of a box to
+   its neighbor denoted by the relative coordinate rl */
+/*
+inline void Box_Touch_Point( simulation_box *box, ivec rl, rvec tp )
+{
+    int d;
+
+    for ( d = 0; d < 3; ++d )
+        if ( rl[d] == -1 )
+            tp[d] = box->min[d];
+        else if ( rl[d] == 0 )
+            tp[d] = NEG_INF - 1.;
+        else
+            tp[d] = box->max[d];
+}
+*/
+
+
+/* determine whether point p is inside the box */
+/* assumes orthogonal box */
+/*
+inline int is_Inside_Box( simulation_box *box, rvec p )
+{
+    if ( p[0] < box->min[0] || p[0] >= box->max[0] ||
+            p[1] < box->min[1] || p[1] >= box->max[1] ||
+            p[2] < box->min[2] || p[2] >= box->max[2] )
+        return FALSE;
+
+    return TRUE;
+}
+*/
+
+
+/*
+inline int iown_midpoint( simulation_box *box, rvec p1, rvec p2 )
+{
+    rvec midp;
+
+    midp[0] = (p1[0] + p2[0]) / 2;
+    midp[1] = (p1[1] + p2[1]) / 2;
+    midp[2] = (p1[2] + p2[2]) / 2;
+
+    if ( midp[0] < box->min[0] || midp[0] >= box->max[0] ||
+            midp[1] < box->min[1] || midp[1] >= box->max[1] ||
+            midp[2] < box->min[2] || midp[2] >= box->max[2] )
+        return FALSE;
+
+    return TRUE;
+}
+*/
+
+
+/**************** from grid.c ****************/
+/* finds the closest point of grid cell cj to ci.
+   no need to consider periodic boundary conditions as in the serial case
+   because the box of a process is not periodic in itself */
+/*
+inline void GridCell_Closest_Point( grid_cell *gci, grid_cell *gcj,
+        ivec ci, ivec cj, rvec cp )
+{
+    int  d;
+
+    for ( d = 0; d < 3; d++ )
+        if ( cj[d] > ci[d] )
+            cp[d] = gcj->min[d];
+        else if ( cj[d] == ci[d] )
+            cp[d] = NEG_INF - 1.;
+        else
+            cp[d] = gcj->max[d];
+}
+
+
+inline void GridCell_to_Box_Points( grid_cell *gc, ivec rl, rvec cp, rvec fp )
+{
+    int d;
+
+    for ( d = 0; d < 3; ++d )
+        if ( rl[d] == -1 )
+        {
+            cp[d] = gc->min[d];
+            fp[d] = gc->max[d];
+        }
+        else if ( rl[d] == 0 )
+        {
+            cp[d] = fp[d] = NEG_INF - 1.;
+        }
+        else
+        {
+            cp[d] = gc->max[d];
+            fp[d] = gc->min[d];
+        }
+}
+
+
+inline real DistSqr_between_Special_Points( rvec sp1, rvec sp2 )
+{
+    int  i;
+    real d_sqr = 0;
+
+    for ( i = 0; i < 3; ++i )
+    {
+        if ( sp1[i] > NEG_INF && sp2[i] > NEG_INF )
+        {
+            d_sqr += SQR( sp1[i] - sp2[i] );
+        }
+    }
+
+    return d_sqr;
+}
+
+
+inline real DistSqr_to_Special_Point( rvec cp, rvec x )
+{
+    int  i;
+    real d_sqr = 0;
+
+    for ( i = 0; i < 3; ++i )
+    {
+        if ( cp[i] > NEG_INF )
+        {
+            d_sqr += SQR( cp[i] - x[i] );
+        }
+    }
+
+    return d_sqr;
+}
+
+
+inline int Relative_Coord_Encoding( ivec c )
+{
+    return 9 * (c[0] + 1) + 3 * (c[1] + 1) + (c[2] + 1);
+}
+*/
+
+
+/************** from geo_tools.c *****************/
+void Make_Point( real x, real y, real z, rvec* p )
+{
+    real *dst = *p; /* components of the point that p refers to */
+
+    dst[0] = x;  dst[1] = y;  dst[2] = z; /* store (x, y, z) in order */
+}
+
+
+int is_Valid_Serial( static_storage *workspace, int serial )
+{
+    if( workspace->map_serials[ serial ] < 0 )
+    {
+        fprintf( stderr, "CONECT line includes invalid pdb serial number %d.\n", serial );
+        fprintf( stderr, "Please correct the input file.Terminating...\n" );
+        exit( INVALID_INPUT );
+    }
+
+    return TRUE;
+}
+
+
+int Check_Input_Range( int val, int lo, int hi, char *message )
+{
+    const int in_range = ( lo <= val && val <= hi ); /* both bounds are inclusive */
+    if ( !in_range )
+    {
+        fprintf( stderr, "%s\nInput %d - Out of range %d-%d. Terminating...\n", message, val, lo, hi );
+        exit( INVALID_INPUT );
+    }
+    /* val accepted: nothing is printed and SUCCESS is returned */
+    return SUCCESS;
+}
+
+
+void Trim_Spaces( char *element )
+{
+    int i, j;
+    /* in place: skip leading spaces, upper-case the first space-delimited word, drop the rest */
+    for ( i = 0; element[i] == ' '; ++i ); // skip initial space chars
+    /* stop at the terminator or the next space (no strlen() rescan per iteration) */
+    for ( j = i; element[j] != '\0' && element[j] != ' '; ++j )
+    {
+        /* toupper() is only defined for unsigned char values (and EOF), hence the cast */
+        element[j - i] = toupper( (unsigned char) element[j] ); // make uppercase, offset to 0
+    }
+    element[j - i] = 0; // finalize the string
+}
+
+
+/************ from system_props.c *************/
+real Get_Time( ) // gettimeofday() reading in seconds, via the global tim
+{
+    gettimeofday( &tim, NULL );
+    return tim.tv_usec / 1.0e6 + tim.tv_sec;
+}
+
+
+real Get_Timing_Info( real t_start ) // seconds elapsed since t_start
+{
+    gettimeofday( &tim, NULL );
+    t_end = tim.tv_usec / 1.0e6 + tim.tv_sec; // also kept in the global t_end
+    return t_end - t_start;
+}
+
+
+void Update_Timing_Info( real *t_start, real *timing ) // lap-style timer
+{
+    gettimeofday( &tim, NULL );
+    t_end = tim.tv_usec / 1.0e6 + tim.tv_sec;
+    *timing = *timing + (t_end - *t_start); // accumulate the elapsed interval
+    *t_start = t_end; // the next interval is measured from now
+}
+
+
+/*********** from io_tools.c **************/
+/* Index of the sbp entry named s; exits with UNKNOWN_ATOM_TYPE if absent. */
+int Get_Atom_Type( reax_interaction *reax_param, char *s )
+{
+    int type = 0;
+
+    while ( type < reax_param->num_atom_types )
+    {
+        if ( strcmp( reax_param->sbp[type].name, s ) == 0 )
+        {
+            return type;
+        }
+        ++type;
+    }
+    fprintf( stderr, "Unknown atom type %s. Terminating...\n", s );
+    exit( UNKNOWN_ATOM_TYPE );
+    return FAILURE; // not reached: exit() does not return
+}
+
+
+char *Get_Element( reax_system *system, int i ) // sbp name of atom i's type
+{
+    return system->reaxprm.sbp[ system->atoms[i].type ].name;
+}
+
+
+char *Get_Atom_Name( reax_system *system, int i ) // name stored with atom i
+{
+    return system->atoms[i].name;
+}
+
+
+/* Allocate *line, *backup (MAX_LINE chars) and *tokens (MAX_TOKENS strings of
+ * MAX_TOKEN_LEN chars). On FAILURE all of it is freed and the outputs NULLed. */
+int Allocate_Tokenizer_Space( char **line, char **backup, char ***tokens )
+{
+    int i, ok;
+
+    *line = (char*) malloc( sizeof(char) * MAX_LINE );
+    *backup = (char*) malloc( sizeof(char) * MAX_LINE );
+    *tokens = (char**) malloc( sizeof(char*) * MAX_TOKENS );
+    ok = ( *line != NULL && *backup != NULL && *tokens != NULL );
+    for ( i = 0; ok && i < MAX_TOKENS; i++ )
+    {
+        (*tokens)[i] = (char*) malloc( sizeof(char) * MAX_TOKEN_LEN );
+        ok = ( (*tokens)[i] != NULL );
+    }
+    if ( !ok )
+    {
+        while ( i-- > 0 ) // slots [0, i) were assigned; free( NULL ) is a no-op
+        {
+            free( (*tokens)[i] );
+        }
+        free( *tokens );
+        free( *backup );
+        free( *line );
+        *line = *backup = NULL;
+        *tokens = NULL;
+    }
+    return ok ? SUCCESS : FAILURE;
+}
+
+
+/* Split s at tabs, spaces, newlines, '!' and '=' into the preallocated
+ * (*tok)[] and return the token count. Assumes Allocate_Tokenizer_Space()
+ * sizes; longer input is truncated (MAX_LINE, MAX_TOKENS, MAX_TOKEN_LEN). */
+int Tokenize( char* s, char*** tok )
+{
+    char test[MAX_LINE], *word;
+    const char *sep = "\t \n!=";
+    int count = 0;
+
+    snprintf( test, sizeof(test), "%s", s ); // bounded and always terminated
+    for ( word = strtok( test, sep ); word != NULL && count < MAX_TOKENS;
+            word = strtok( NULL, sep ) )
+    {
+        snprintf( (*tok)[count++], MAX_TOKEN_LEN, "%s", word );
+    }
+    return count;
+}
+
+
+/***************** taken from lammps ************************/
+/* safe malloc: allocate n bytes for the array called name.
+ * Warns and returns NULL when n is not positive; on allocation failure
+ * prints an error and terminates with exit( INSUFFICIENT_MEMORY ).
+ * Otherwise returns the pointer obtained from malloc(). */
+void *smalloc( long n, char *name )
+{
+    void *ptr = ( n > 0 ) ? malloc( n ) : NULL;
+
+    if ( n <= 0 )
+    {
+        fprintf( stderr, "WARNING: trying to allocate %ld bytes for array %s. ",
+                 n, name );
+        fprintf( stderr, "returning NULL.\n" );
+    }
+    else if ( ptr == NULL )
+    {
+        fprintf( stderr, "ERROR: failed to allocate %ld bytes for array %s",
+                 n, name );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    return ptr;
+}
+
+
+/* safe calloc: n zeroed elements of size bytes each for the array called name.
+ * NULL (plus a warning) if n or size is not positive; exits if calloc fails. */
+void *scalloc( int n, int size, char *name )
+{
+    void *ptr;
+
+    if ( n <= 0 )
+    {
+        fprintf( stderr, "WARNING: trying to allocate %d elements for array %s. ",
+                 n, name );
+        fprintf( stderr, "returning NULL.\n" );
+        return NULL;
+    }
+
+    if ( size <= 0 )
+    {
+        fprintf( stderr, "WARNING: elements size for array %s is %d. ",
+                 name, size );
+        fprintf( stderr, "returning NULL.\n" );
+        return NULL;
+    }
+
+    ptr = calloc( n, size );
+    if ( ptr == NULL )
+    {
+        fprintf( stderr, "ERROR: failed to allocate %lld bytes for array %s",
+                 (long long) n * size, name ); // widened: int product may overflow
+        exit( INSUFFICIENT_MEMORY );
+    }
+    return ptr;
+}
+
+
+/* safe free: release ptr, or only warn (naming the array) when it is NULL.
+ * ptr is passed by value, so the caller's own pointer is NOT reset here;
+ * the caller must set it to NULL itself if it may be tested again. */
+void sfree( void *ptr, char *name )
+{
+    if ( ptr == NULL )
+    {
+        fprintf( stderr, "WARNING: trying to free the already NULL pointer %s!\n",
+                 name );
+        return;
+    }
+    free( ptr );
+}
diff --git a/PuReMD-GPU/src/tool_box.h b/PuReMD-GPU/src/tool_box.h
new file mode 100644
index 0000000000000000000000000000000000000000..db97076149a5f5c8868d02d299a25581d3b5a934
--- /dev/null
+++ b/PuReMD-GPU/src/tool_box.h
@@ -0,0 +1,72 @@
+/*----------------------------------------------------------------------
+  SerialReax - Reax Force Field Simulator
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __TOOL_BOX_H_
+#define __TOOL_BOX_H_
+
+#include "mytypes.h"
+
+struct timeval tim; /* scratch filled by gettimeofday() in the timing routines; NOTE(review): defined, not extern, in a header - confirm multi-file builds link */
+real t_end; /* last time stamp (s) taken by Get_Timing_Info / Update_Timing_Info; NOTE(review): same header-definition concern as tim */
+
+
+/* from box.h */
+void Transform( rvec, simulation_box*, char, rvec );
+void Transform_to_UnitBox( rvec, simulation_box*, char, rvec );
+void Fit_to_Periodic_Box( simulation_box*, rvec* );
+//void Box_Touch_Point( simulation_box*, ivec, rvec );
+//int  is_Inside_Box( simulation_box*, rvec );
+//int  iown_midpoint( simulation_box*, rvec, rvec );
+
+/* from grid.h */
+/*
+void GridCell_Closest_Point( grid_cell*, grid_cell*, ivec, ivec, rvec );
+void GridCell_to_Box_Points( grid_cell*, ivec, rvec, rvec );
+real DistSqr_between_Special_Points( rvec, rvec );
+real DistSqr_to_Special_Point( rvec, rvec );
+int Relative_Coord_Encoding( ivec );
+*/
+
+/* from geo_tools.h */
+void Make_Point( real, real, real, rvec* ); /* stores (x, y, z) into the pointed-to vector */
+int is_Valid_Serial( static_storage*, int ); /* TRUE, or exits if map_serials[serial] < 0 */
+int Check_Input_Range( int, int, int, char* ); /* SUCCESS, or exits if val is outside [lo, hi] */
+void Trim_Spaces( char* ); /* in place: drop leading spaces, upper-case, cut at the next space */
+
+/* from system_props.h */
+real Get_Time( ); /* gettimeofday() reading in seconds */
+real Get_Timing_Info( real ); /* seconds elapsed since the given start time */
+void Update_Timing_Info( real*, real* ); /* adds the elapsed time to *timing and restarts *t_start */
+
+/* from io_tools.h */
+int Get_Atom_Type( reax_interaction*, char* ); /* index of the type with the given name, or exits */
+char *Get_Element( reax_system*, int ); /* name of the type of atom i */
+char *Get_Atom_Name( reax_system*, int ); /* name stored with atom i */
+int Allocate_Tokenizer_Space( char**, char**, char*** ); /* SUCCESS or FAILURE */
+int Tokenize( char*, char*** ); /* splits at tab, space, newline, '!' and '='; returns the token count */
+
+/* from lammps */
+void *smalloc( long, char* ); /* malloc that warns on n <= 0 and exits on failure */
+void *scalloc( int, int, char* ); /* calloc that warns on n or size <= 0 and exits on failure */
+void sfree( void*, char* ); /* free that warns on a NULL pointer */
+
+
+#endif
diff --git a/PuReMD-GPU/src/traj.c b/PuReMD-GPU/src/traj.c
index 2844c370ee79702ed0c75d090afe545149aae185..f8852d5d2cc2e425b67f6ffa6871b52c2f755046 100644
--- a/PuReMD-GPU/src/traj.c
+++ b/PuReMD-GPU/src/traj.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -30,7 +31,8 @@
 /************************************************/
 /*      CUSTOM FORMAT ROUTINES                  */
 /************************************************/
-int Write_Custom_Header(reax_system *system, control_params *control, 
+
+int Write_Custom_Header(reax_system *system, control_params *control,
         static_storage *workspace, output_controls *out_control)
 {
     int i, header_len, control_block_len, frame_format_len;
@@ -40,119 +42,129 @@ int Write_Custom_Header(reax_system *system, control_params *control,
     char atom_format[100], bond_format[100], angle_format[100];
 
     sprintf( control_block, CONTROL_BLOCK,
-            system->N,
-            control->restart,
-            control->restart_from,
-            control->random_vel,
-            out_control->restart_freq,
-            control->ensemble,
-            control->nsteps,
-            control->dt,
-            control->reposition_atoms,
-            control->restrict_bonds,
-            control->tabulate,
-            control->nbr_cut,
-            control->r_cut,
-            control->bg_cut,
-            control->bo_cut,
-            control->thb_cut,
-            control->hb_cut,
-            control->q_err,
-            control->T_init,
-            control->T_final,
-            control->Tau_T,
-            control->T_mode,
-            control->T_rate,
-            control->T_freq,
-            control->P[0], control->P[1], control->P[2], 
-            control->Tau_P[0], control->Tau_P[1], control->Tau_P[2],
-            control->compressibility,
-            control->press_mode,
-            control->remove_CoM_vel,
-            out_control->write_steps,
-            out_control->traj_compress,
-            out_control->traj_format,
-            out_control->atom_format,
-            out_control->bond_info,
-            out_control->angle_info,
-            out_control->energy_update_freq,
-            control->molec_anal,
-            control->freq_molec_anal );
-
-            control_block_len = strlen( control_block );
-
-
-            sprintf( frame_format, "Frame Format: %d\n%s\n%s\n", 
-                    NUM_FRAME_GLOBALS, FRAME_GLOBALS_FORMAT, FRAME_GLOBAL_NAMES );
-
-            atom_format[0] = OPT_NOATOM;
-            switch( out_control->atom_format )
-            {
-                case OPT_ATOM_BASIC: sprintf( atom_format, "Atom_Basic: %s", ATOM_BASIC );
-                             break;
-                case OPT_ATOM_wF: sprintf( atom_format, "Atom_wF: %s", ATOM_wF );
-                          break;
-                case OPT_ATOM_wV: sprintf( atom_format, "Atom_wV: %s", ATOM_wV );
-                          break;
-                case OPT_ATOM_FULL: sprintf( atom_format, "Atom_Full: %s", ATOM_FULL );
-                            break;
-            }
-            strcat( frame_format, atom_format );
-
-            bond_format[0] = OPT_NOBOND;
-            if( out_control->bond_info == OPT_BOND_BASIC )
-                sprintf( bond_format, "Bond_Line: %s", BOND_BASIC );
-            else if( out_control->bond_info == OPT_BOND_FULL )
-                sprintf( bond_format, "Bond_Line_Full: %s", BOND_FULL );
-            strcat( frame_format, bond_format );
+             system->N,
+             control->restart,
+             control->restart_from,
+             control->random_vel,
+             out_control->restart_freq,
+             control->ensemble,
+             control->nsteps,
+             control->dt,
+             control->reposition_atoms,
+             control->restrict_bonds,
+             control->tabulate,
+             control->nbr_cut,
+             control->r_cut,
+             control->bg_cut,
+             control->bo_cut,
+             control->thb_cut,
+             control->hb_cut,
+             control->qeq_solver_q_err,
+             control->T_init,
+             control->T_final,
+             control->Tau_T,
+             control->T_mode,
+             control->T_rate,
+             control->T_freq,
+             control->P[0], control->P[1], control->P[2],
+             control->Tau_P[0], control->Tau_P[1], control->Tau_P[2],
+             control->compressibility,
+             control->press_mode,
+             control->remove_CoM_vel,
+             out_control->write_steps,
+             out_control->traj_compress,
+             out_control->traj_format,
+             out_control->atom_format,
+             out_control->bond_info,
+             out_control->angle_info,
+             out_control->energy_update_freq,
+             control->molec_anal,
+             control->freq_molec_anal );
+
+    control_block_len = strlen( control_block );
+
+    sprintf( frame_format, "Frame Format: %d\n%s\n%s\n",
+             NUM_FRAME_GLOBALS, FRAME_GLOBALS_FORMAT, FRAME_GLOBAL_NAMES );
+
+    atom_format[0] = OPT_NOATOM;
+    switch ( out_control->atom_format )
+    {
+    case OPT_ATOM_BASIC:
+        sprintf( atom_format, "Atom_Basic: %s", ATOM_BASIC );
+        break;
+    case OPT_ATOM_wF:
+        sprintf( atom_format, "Atom_wF: %s", ATOM_wF );
+        break;
+    case OPT_ATOM_wV:
+        sprintf( atom_format, "Atom_wV: %s", ATOM_wV );
+        break;
+    case OPT_ATOM_FULL:
+        sprintf( atom_format, "Atom_Full: %s", ATOM_FULL );
+        break;
+    }
+    strcat( frame_format, atom_format );
 
-            angle_format[0] = OPT_NOANGLE;
-            if( out_control->angle_info == OPT_ANGLE_BASIC )
-                sprintf( angle_format, "Angle_Line: %s", ANGLE_BASIC );
-            strcat( frame_format, angle_format );
+    bond_format[0] = OPT_NOBOND;
+    if ( out_control->bond_info == OPT_BOND_BASIC )
+    {
+        sprintf( bond_format, "Bond_Line: %s", BOND_BASIC );
+    }
+    else if ( out_control->bond_info == OPT_BOND_FULL )
+    {
+        sprintf( bond_format, "Bond_Line_Full: %s", BOND_FULL );
+    }
+    strcat( frame_format, bond_format );
 
-            frame_format_len = strlen( frame_format );
+    angle_format[0] = OPT_NOANGLE;
+    if ( out_control->angle_info == OPT_ANGLE_BASIC )
+    {
+        sprintf( angle_format, "Angle_Line: %s", ANGLE_BASIC );
+    }
+    strcat( frame_format, angle_format );
 
+    frame_format_len = strlen( frame_format );
 
-            header_len = HEADER_INIT_LEN + (control_block_len + SIZE_INFO_LEN2)+ 
-                (frame_format_len + SIZE_INFO_LEN2) + 
-                (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2);
+    header_len = HEADER_INIT_LEN + (control_block_len + SIZE_INFO_LEN2) +
+                 (frame_format_len + SIZE_INFO_LEN2) +
+                 (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2);
 
-            out_control->write( out_control->trj, HEADER_INIT, 
-                    header_len, HEADER_INIT_LEN, out_control->traj_title );
+    out_control->write( out_control->trj, HEADER_INIT,
+                        header_len, HEADER_INIT_LEN, out_control->traj_title );
 
-            out_control->write( out_control->trj, SIZE_INFO_LINE2,
-                    control_block_len + (frame_format_len + SIZE_INFO_LEN2) + 
-                    (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2), 
-                    control_block_len );
-            out_control->write( out_control->trj, "%s", control_block );
+    out_control->write( out_control->trj, SIZE_INFO_LINE2,
+                        control_block_len + (frame_format_len + SIZE_INFO_LEN2) +
+                        (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2),
+                        control_block_len );
+    out_control->write( out_control->trj, "%s", control_block );
 
-            out_control->write( out_control->trj, SIZE_INFO_LINE2, 
-                    frame_format_len + 
-                    (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2), 
-                    frame_format_len );
-            out_control->write( out_control->trj, "%s", frame_format );
+    out_control->write( out_control->trj, SIZE_INFO_LINE2,
+                        frame_format_len +
+                        (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2),
+                        frame_format_len );
+    out_control->write( out_control->trj, "%s", frame_format );
 
-            out_control->write( out_control->trj, SIZE_INFO_LINE2, 
-                    ATOM_MAPPING_LEN * system->N, 
-                    ATOM_MAPPING_LEN * system->N );
+    out_control->write( out_control->trj, SIZE_INFO_LINE2,
+                        ATOM_MAPPING_LEN * system->N,
+                        ATOM_MAPPING_LEN * system->N );
 
-            for( i = 0; i < system->N; ++i )
-                out_control->write( out_control->trj, ATOM_MAPPING,  
-                        workspace->orig_id[i], 
-                        system->atoms[i].type, 
-                        system->atoms[i].name, 
-                        system->reaxprm.sbp[ system->atoms[i].type ].mass ); 
+    for ( i = 0; i < system->N; ++i )
+    {
+        out_control->write( out_control->trj, ATOM_MAPPING,
+                            workspace->orig_id[i],
+                            system->atoms[i].type,
+                            system->atoms[i].name,
+                            system->reaxprm.sbp[ system->atoms[i].type ].mass );
+    }
 
-            fflush( out_control->trj );
+    fflush( out_control->trj );
 
-            return 0;
+    return 0;
 }
 
 
-int Append_Custom_Frame( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+int Append_Custom_Frame( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
     int i, j, pi, pk, pk_j;
     int write_atoms, write_bonds, write_angles;
@@ -166,278 +178,326 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
 
 
     /* IMPORTANT: This whole part will go to init_trj after finalized! */
-    switch( out_control->atom_format )
+    switch ( out_control->atom_format )
     {
-        case OPT_ATOM_BASIC: 
-            atom_line_len = ATOM_BASIC_LEN;
-            write_atoms = 1;
-            break;
-        case OPT_ATOM_wF: 
-            atom_line_len = ATOM_wF_LEN; 
-            write_atoms = 1;
-            break;
-        case OPT_ATOM_wV: 
-            atom_line_len = ATOM_wV_LEN; 
-            write_atoms = 1;
-            break;
-        case OPT_ATOM_FULL: 
-            atom_line_len = ATOM_FULL_LEN; 
-            write_atoms = 1;
-            break;
-        default: 
-            atom_line_len = 0;
-            write_atoms = 0;
+    case OPT_ATOM_BASIC:
+        atom_line_len = ATOM_BASIC_LEN;
+        write_atoms = 1;
+        break;
+    case OPT_ATOM_wF:
+        atom_line_len = ATOM_wF_LEN;
+        write_atoms = 1;
+        break;
+    case OPT_ATOM_wV:
+        atom_line_len = ATOM_wV_LEN;
+        write_atoms = 1;
+        break;
+    case OPT_ATOM_FULL:
+        atom_line_len = ATOM_FULL_LEN;
+        write_atoms = 1;
+        break;
+    default:
+        atom_line_len = 0;
+        write_atoms = 0;
     }
 
-
     /* bond preparations */
     bond_line_len = write_bonds = 0;
-    if( out_control->bond_info == OPT_BOND_BASIC )
+    if ( out_control->bond_info == OPT_BOND_BASIC )
     {
         bond_line_len = BOND_BASIC_LEN;
         write_bonds = 1;
     }
-    else if( out_control->bond_info == OPT_BOND_FULL )
+    else if ( out_control->bond_info == OPT_BOND_FULL )
     {
         bond_line_len = BOND_FULL_LEN;
         write_bonds = 1;
     }
 
 #ifdef __DEBUG_CUDA__
-    fprintf (stderr, "Append Custom Frame -- write_bonds --> %d \n", write_bonds);
+    fprintf( stderr, "Append Custom Frame -- write_bonds --> %d \n", write_bonds );
 #endif
 
     num_bonds = 0;
-    if( write_bonds )
+    if ( write_bonds )
     {
-
 #ifdef __PRINT_CPU_RESULTS__
-        //fprintf (stderr, "Synching bonds from device for printing ....\n");
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr, "Synching bonds from device for printing ....\n" );
+#endif
         Sync_Host_Device_List( bonds, (dev_lists + BONDS), TYP_BOND );
 #endif
 
-        for( i = 0; i < system->N; ++i )
-            for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
-                if( i < bonds->select.bond_list[j].nbr && 
+        for ( i = 0; i < system->N; ++i )
+        {
+            for ( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
+            {
+                if ( i < bonds->select.bond_list[j].nbr &&
                         bonds->select.bond_list[j].bo_data.BO >= control->bg_cut )
+                {
                     ++num_bonds;
+                }
+            }
+        }
     }
 
-
     /* angle preparations */
-    if( out_control->angle_info == OPT_ANGLE_BASIC )
+    if ( out_control->angle_info == OPT_ANGLE_BASIC )
     {
         angle_line_len = ANGLE_BASIC_LEN;
         write_angles = 1;
     }
-    else 
+    else
     {
         angle_line_len = 0;
         write_angles = 0;
     }
 
 #ifdef __DEBUG_CUDA__
-    fprintf (stderr, "Append Custom Frame -- write-angles --> %d \n", write_angles );
+    fprintf( stderr, "Append Custom Frame -- write-angles --> %d \n", write_angles );
 #endif
 
     num_thb_intrs = 0;
-    if( write_angles ) {
-
+    if ( write_angles )
+    {
 #ifdef __PRINT_CPU_RESULTS__
-        //fprintf (stderr, "Synching three bodies from deivce for printing ... \n");
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr, "Synching three bodies from deivce for printing ... \n" );
+#endif 
         Sync_Host_Device_List( thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY );
-        if ( !write_bonds) {
-            //fprintf (stderr, "Synching bonds for three bodies from device for printing ... \n");
+        if ( !write_bonds )
+        {
+#ifdef __DEBUG_CUDA__
+            fprintf( stderr, "Synching bonds for three bodies from device for printing ... \n" );
+#endif 
             Sync_Host_Device_List( bonds, (dev_lists + BONDS), TYP_BOND );
         }
 #endif 
 
-        for( j = 0; j < system->N; ++j )
-            for( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi )
-                if( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut ) 
+        for ( j = 0; j < system->N; ++j )
+        {
+            for ( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi )
+            {
+                if ( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut )
+                {
                     // physical j&i bond
-                    for( pk = Start_Index( pi, thb_intrs ); 
+                    for ( pk = Start_Index( pi, thb_intrs );
                             pk < End_Index( pi, thb_intrs ); ++pk )
-                        if( bonds->select.bond_list[pi].nbr < 
-                                thb_intrs->select.three_body_list[pk].thb ) {
+                    {
+                        if ( bonds->select.bond_list[pi].nbr <
+                                thb_intrs->select.three_body_list[pk].thb )
+                        {
                             // get k's pointer on j's bond list
                             pk_j = thb_intrs->select.three_body_list[pk].pthb;
 
-                            if( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut ) 
+                            if ( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut )
                                 // physical j&k bond
                                 ++num_thb_intrs;
                         }
+                    }
+                }
+            }
+        }
     }
 
 
-
     /* get correct pressure */
-    if( control->ensemble == NPT || control->ensemble == sNPT )
+    if ( control->ensemble == NPT || control->ensemble == sNPT )
+    {
         P = data->flex_bar.P_scalar;
-    else  if( control->ensemble == iNPT )
+    }
+    else  if ( control->ensemble == iNPT )
+    {
         P = data->iso_bar.P;
-    else P = 0;
-
+    }
+    else
+    {
+        P = 0;
+    }
 
     /* calculate total frame length*/
     sprintf( buffer, FRAME_GLOBALS,
-            data->step, data->time, 
-            data->E_Tot, data->E_Pot, E_CONV * data->E_Kin, data->therm.T,
-            P, system->box.volume,
-            system->box.box_norms[0], 
-            system->box.box_norms[1], 
-            system->box.box_norms[2],
-            90.0, 90.0, 90.0, // IMPORTANT: need to rewrite for flexible boxes!
-            data->E_BE,
-            data->E_Ov,  data->E_Un,  data->E_Lp,
-            data->E_Ang, data->E_Pen, data->E_Coa, data->E_HB,
-            data->E_Tor, data->E_Con, 
-            data->E_vdW, data->E_Ele, data->E_Pol );
+             data->step, data->time,
+             data->E_Tot, data->E_Pot, E_CONV * data->E_Kin, data->therm.T,
+             P, system->box.volume,
+             system->box.box_norms[0],
+             system->box.box_norms[1],
+             system->box.box_norms[2],
+             90.0, 90.0, 90.0, // IMPORTANT: need to rewrite for flexible boxes!
+             data->E_BE,
+             data->E_Ov,  data->E_Un,  data->E_Lp,
+             data->E_Ang, data->E_Pen, data->E_Coa, data->E_HB,
+             data->E_Tor, data->E_Con,
+             data->E_vdW, data->E_Ele, data->E_Pol );
     frame_globals_len = strlen( buffer );
 
-    frame_len = frame_globals_len + 
-        write_atoms  * SIZE_INFO_LEN3 + system->N * atom_line_len +
-        write_bonds  * SIZE_INFO_LEN3 + num_bonds * bond_line_len +
-        write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
+    frame_len = frame_globals_len +
+                write_atoms  * SIZE_INFO_LEN3 + system->N * atom_line_len +
+                write_bonds  * SIZE_INFO_LEN3 + num_bonds * bond_line_len +
+                write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
 
 
     /* write size info & frame globals */
-    out_control->write( out_control->trj, SIZE_INFO_LINE2, 
-            frame_len, frame_globals_len );
+    out_control->write( out_control->trj, SIZE_INFO_LINE2,
+                        frame_len, frame_globals_len );
     out_control->write( out_control->trj, "%s", buffer );
 
 
-    /* write size info & atom lines */  
-    if( write_atoms ) 
+    /* write size info & atom lines */
+    if ( write_atoms )
     {
         rest_of_frame_len = system->N * atom_line_len +
-            write_bonds  * SIZE_INFO_LEN3 + num_bonds * bond_line_len +
-            write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
+                            write_bonds  * SIZE_INFO_LEN3 + num_bonds * bond_line_len +
+                            write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
 
-        out_control->write( out_control->trj, SIZE_INFO_LINE3, 
-                rest_of_frame_len, system->N * atom_line_len, 
-                system->N );
+        out_control->write( out_control->trj, SIZE_INFO_LINE3,
+                            rest_of_frame_len, system->N * atom_line_len,
+                            system->N );
     }
 
-    switch( out_control->atom_format )
+    switch ( out_control->atom_format )
     {
-        case 4: 
-            for( i = 0; i < system->N; ++i )
-                out_control->write( out_control->trj, ATOM_BASIC, 
-                        workspace->orig_id[i], 
-                        system->atoms[i].x[0], 
-                        system->atoms[i].x[1], 
-                        system->atoms[i].x[2],
-                        system->atoms[i].q );
-            break;
-        case 5:
-            for( i = 0; i < system->N; ++i )
-                out_control->write( out_control->trj, ATOM_wF, 
-                        workspace->orig_id[i],
-                        system->atoms[i].x[0], 
-                        system->atoms[i].x[1], 
-                        system->atoms[i].x[2],
-                        system->atoms[i].f[0], 
-                        system->atoms[i].f[1], 
-                        system->atoms[i].f[2],
-                        system->atoms[i].q );
-            break;
-        case 6: 
-            for( i = 0; i < system->N; ++i )
-                out_control->write( out_control->trj, ATOM_wV, 
-                        workspace->orig_id[i], 
-                        system->atoms[i].x[0], 
-                        system->atoms[i].x[1], 
-                        system->atoms[i].x[2],
-                        system->atoms[i].v[0], 
-                        system->atoms[i].v[1], 
-                        system->atoms[i].v[2],
-                        system->atoms[i].q );
-            break;
-        case 7: 
-            for( i = 0; i < system->N; ++i )
-                out_control->write( out_control->trj, ATOM_FULL, 
-                        workspace->orig_id[i], 
-                        system->atoms[i].x[0], 
-                        system->atoms[i].x[1], 
-                        system->atoms[i].x[2],
-                        system->atoms[i].v[0], 
-                        system->atoms[i].v[1], 
-                        system->atoms[i].v[2],
-                        system->atoms[i].f[0], 
-                        system->atoms[i].f[1], 
-                        system->atoms[i].f[2],
-                        system->atoms[i].q );
-            break;
+    case 4:
+        for ( i = 0; i < system->N; ++i )
+            out_control->write( out_control->trj, ATOM_BASIC,
+                                workspace->orig_id[i],
+                                system->atoms[i].x[0],
+                                system->atoms[i].x[1],
+                                system->atoms[i].x[2],
+                                system->atoms[i].q );
+        break;
+    case 5:
+        for ( i = 0; i < system->N; ++i )
+            out_control->write( out_control->trj, ATOM_wF,
+                                workspace->orig_id[i],
+                                system->atoms[i].x[0],
+                                system->atoms[i].x[1],
+                                system->atoms[i].x[2],
+                                system->atoms[i].f[0],
+                                system->atoms[i].f[1],
+                                system->atoms[i].f[2],
+                                system->atoms[i].q );
+        break;
+    case 6:
+        for ( i = 0; i < system->N; ++i )
+            out_control->write( out_control->trj, ATOM_wV,
+                                workspace->orig_id[i],
+                                system->atoms[i].x[0],
+                                system->atoms[i].x[1],
+                                system->atoms[i].x[2],
+                                system->atoms[i].v[0],
+                                system->atoms[i].v[1],
+                                system->atoms[i].v[2],
+                                system->atoms[i].q );
+        break;
+    case 7:
+        for ( i = 0; i < system->N; ++i )
+            out_control->write( out_control->trj, ATOM_FULL,
+                                workspace->orig_id[i],
+                                system->atoms[i].x[0],
+                                system->atoms[i].x[1],
+                                system->atoms[i].x[2],
+                                system->atoms[i].v[0],
+                                system->atoms[i].v[1],
+                                system->atoms[i].v[2],
+                                system->atoms[i].f[0],
+                                system->atoms[i].f[1],
+                                system->atoms[i].f[2],
+                                system->atoms[i].q );
+        break;
     }
     fflush( out_control->trj );
 
 
     /* write size info & bond lines */
-    if( write_bonds )
+    if ( write_bonds )
     {
         rest_of_frame_len = num_bonds * bond_line_len +
-            write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
+                            write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
 
-        out_control->write( out_control->trj, SIZE_INFO_LINE3, 
-                rest_of_frame_len, num_bonds * bond_line_len, 
-                num_bonds );
+        out_control->write( out_control->trj, SIZE_INFO_LINE3,
+                            rest_of_frame_len, num_bonds * bond_line_len,
+                            num_bonds );
     }
 
-    if( out_control->bond_info == 1 ) {
-        for( i = 0; i < system->N; ++i )
-            for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
-                if( i < bonds->select.bond_list[j].nbr && 
-                        bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) {
+    if ( out_control->bond_info == 1 )
+    {
+        for ( i = 0; i < system->N; ++i )
+        {
+            for ( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
+            {
+                if ( i < bonds->select.bond_list[j].nbr &&
+                        bonds->select.bond_list[j].bo_data.BO >= control->bg_cut )
+                {
                     bo_ij = &( bonds->select.bond_list[j] );
-                    out_control->write( out_control->trj, BOND_BASIC, 
-                            workspace->orig_id[i], 
-                            workspace->orig_id[bo_ij->nbr], 
-                            bo_ij->d, bo_ij->bo_data.BO );
+                    out_control->write( out_control->trj, BOND_BASIC,
+                                        workspace->orig_id[i],
+                                        workspace->orig_id[bo_ij->nbr],
+                                        bo_ij->d, bo_ij->bo_data.BO );
                 }
+            }
+        }
     }
-    else if( out_control->bond_info == 2 ) {
-        for( i = 0; i < system->N; ++i )
-            for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
-                if( i < bonds->select.bond_list[j].nbr && 
-                        bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) {
+    else if ( out_control->bond_info == 2 )
+    {
+        for ( i = 0; i < system->N; ++i )
+        {
+            for ( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
+            {
+                if ( i < bonds->select.bond_list[j].nbr &&
+                        bonds->select.bond_list[j].bo_data.BO >= control->bg_cut )
+                {
                     bo_ij = &( bonds->select.bond_list[j] );
-                    out_control->write( out_control->trj, BOND_FULL, 
-                            workspace->orig_id[i], 
-                            workspace->orig_id[bo_ij->nbr], 
-                            bo_ij->d, bo_ij->bo_data.BO, bo_ij->bo_data.BO_s, 
-                            bo_ij->bo_data.BO_pi, bo_ij->bo_data.BO_pi2 );
+                    out_control->write( out_control->trj, BOND_FULL,
+                                        workspace->orig_id[i],
+                                        workspace->orig_id[bo_ij->nbr],
+                                        bo_ij->d, bo_ij->bo_data.BO, bo_ij->bo_data.BO_s,
+                                        bo_ij->bo_data.BO_pi, bo_ij->bo_data.BO_pi2 );
                 }
+            }
+        }
     }
 
     fflush( out_control->trj );
 
 
     /* write size info & angle lines */
-    if( out_control->angle_info ) {
+    if ( out_control->angle_info )
+    {
         out_control->write( out_control->trj, SIZE_INFO_LINE3,
-                num_thb_intrs * angle_line_len, 
-                num_thb_intrs * angle_line_len, num_thb_intrs );
+                            num_thb_intrs * angle_line_len,
+                            num_thb_intrs * angle_line_len, num_thb_intrs );
 
-        for( j = 0; j < system->N; ++j )
-            for( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi )
-                if( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut ) 
+        for ( j = 0; j < system->N; ++j )
+        {
+            for ( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi )
+            {
+                if ( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut )
+                {
                     // physical j&i bond
-                    for( pk = Start_Index( pi, thb_intrs ); 
+                    for ( pk = Start_Index( pi, thb_intrs );
                             pk < End_Index( pi, thb_intrs ); ++pk )
-                        if( bonds->select.bond_list[pi].nbr < 
-                                thb_intrs->select.three_body_list[pk].thb ) {
-                            pk_j = thb_intrs->select.three_body_list[pk].pthb; 
+                    {
+                        if ( bonds->select.bond_list[pi].nbr <
+                                thb_intrs->select.three_body_list[pk].thb )
+                        {
+                            pk_j = thb_intrs->select.three_body_list[pk].pthb;
                             // get k's pointer on j's bond list
 
-                            if( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut ) 
+                            if ( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut )
+                            {
                                 // physical j&k bond
                                 out_control->write( out_control->trj, ANGLE_BASIC,
-                                        workspace->orig_id[bonds->select.bond_list[pi].nbr], 
-                                        workspace->orig_id[j], 
-                                        workspace->orig_id[thb_intrs->select.three_body_list[pk].thb], 
-                                        RAD2DEG(thb_intrs->select.three_body_list[pk].theta) );
+                                                    workspace->orig_id[bonds->select.bond_list[pi].nbr],
+                                                    workspace->orig_id[j],
+                                                    workspace->orig_id[thb_intrs->select.three_body_list[pk].thb],
+                                                    RAD2DEG(thb_intrs->select.three_body_list[pk].theta) );
+                            }
                         }
+                    }
+                }
+            }
+        }
     }
 
     fflush( out_control->trj );
@@ -445,45 +505,69 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
     return 0;
 }
 
-/*
-   void Read_Traj( output_controls *out_control, char *traj_name )
-   {
-   int skip_all, skip_part, n;
-   char size_buffer[50];
-// char read_buffer[2048];
 
-out_control->trj = (FILE *)gzopen( traj_name, "r" );
+/* Scans the trajectory file traj_name (opened through zlib): repeatedly
+ * reads a size-info line, parses its first two integers and seeks forward
+ * by the second one, echoing progress to stderr along the way.
+ *
+ * out_control: its trj field refers to the open file during the scan and
+ *   is reset to NULL after the file has been closed
+ * traj_name: path of the trajectory file to scan
+ *
+ * NOTE(review): trj is handled as a FILE* because the rest of this file
+ * passes it to fflush( ); confirm against the output_controls declaration. */
+void Read_Traj( output_controls *out_control, char *traj_name )
+{
+    int skip_all, skip_part;
+    char size_buffer[50];
+    gzFile trj_gz;
 
-fprintf( stderr, "file opened!\n" );
+    trj_gz = gzopen( traj_name, "r" );
+    if ( trj_gz == NULL )
+    {
+        fprintf( stderr, "[ERROR] unable to open trajectory file %s\n", traj_name );
+        return;
+    }
+    out_control->trj = (FILE *) trj_gz;
 
-while( !gzeof( out_control->trj ) )
-{
-if( gzgets( out_control->trj, size_buffer, 50 ) == Z_NULL )
-break;
+    fprintf( stderr, "file opened!\n" );
 
-fprintf( stderr, "read line\n" );
+    while ( !gzeof( trj_gz ) )
+    {
+        if ( gzgets( trj_gz, size_buffer, (int) sizeof( size_buffer ) ) == Z_NULL )
+        {
+            break;
+        }
 
-if( strlen( size_buffer ) >= SIZE_INFO_LEN3 )
-sscanf( size_buffer, "%d %d %d", &skip_all, &skip_part, &n );
-else
-sscanf( size_buffer, "%d %d", &skip_all, &skip_part );
+        fprintf( stderr, "read line\n" );
 
-fprintf( stderr, "%d %d\n", skip_all, skip_part );
+        /* a size-info line holds 2 or 3 integers; only the first two are
+         * needed here, so a short, garbled or negative entry ends the scan
+         * instead of seeking by an uninitialized or bogus offset */
+        if ( sscanf( size_buffer, "%d %d", &skip_all, &skip_part ) != 2
+                || skip_part < 0 )
+        {
+            fprintf( stderr, "[ERROR] malformed size info line in %s\n", traj_name );
+            break;
+        }
+        fprintf( stderr, "%d %d\n", skip_all, skip_part );
 
-gzseek( out_control->trj, skip_part, SEEK_CUR );
-}
+        if ( gzseek( trj_gz, skip_part, SEEK_CUR ) < 0 )
+        {
+            fprintf( stderr, "[ERROR] failed to skip %d bytes in %s\n", skip_part, traj_name );
+            break;
+        }
+    }
 
-gzclose( out_control->trj );
+    gzclose( trj_gz );
+    out_control->trj = NULL;
 }
- */
-
 
 
 /********************************************************/
 /************      XYZ FORMAT ROUTINES    ***************/
 /********************************************************/
-
-int Write_xyz_Header( reax_system *system, control_params *control, 
+int Write_xyz_Header( reax_system *system, control_params *control,
         static_storage* workspace, output_controls *out_control )
 {
     fflush( out_control->trj );
@@ -492,25 +554,27 @@ int Write_xyz_Header( reax_system *system, control_params *control,
 }
 
 
-int Append_xyz_Frame( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+int Append_xyz_Frame( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
     int i;
 
     out_control->write( out_control->trj, "%d\n", system->N );
 
     out_control->write( out_control->trj, "%d\t%8.3f\t%8.3f\t%8.3f\t%8.3f\n",
-            data->step,
-            data->E_Tot, data->E_Pot, 
-            E_CONV*data->E_Kin, data->therm.T );
+                        data->step,
+                        data->E_Tot, data->E_Pot,
+                        E_CONV * data->E_Kin, data->therm.T );
 
-    for( i = 0; i < system->N; ++i )
+    for ( i = 0; i < system->N; ++i )
+    {
         out_control->write( out_control->trj, "%3s %10.5f %10.5f %10.5f\n",
-                system->reaxprm.sbp[ system->atoms[i].type ].name,
-                system->atoms[i].x[0], 
-                system->atoms[i].x[1], 
-                system->atoms[i].x[2] );
+                            system->reaxprm.sbp[ system->atoms[i].type ].name,
+                            system->atoms[i].x[0],
+                            system->atoms[i].x[1],
+                            system->atoms[i].x[2] );
+    }
 
     fflush( out_control->trj );
 
diff --git a/PuReMD-GPU/src/traj.h b/PuReMD-GPU/src/traj.h
index 35d92602eee7c2d0b5ee83889623df2cb2106c71..200f67711e60285f67f32ecb238f81e95d3f9b0d 100644
--- a/PuReMD-GPU/src/traj.h
+++ b/PuReMD-GPU/src/traj.h
@@ -25,6 +25,7 @@
 
 #include <zlib.h>
 
+
 #define BLOCK_MARK "REAX_BLOCK_MARK "
 #define BLOCK_MARK_LEN 16
 
@@ -74,11 +75,27 @@
 #define SIZE_INFO_LEN3 33
 
 
-enum ATOM_LINE_OPTS {OPT_NOATOM = 0, OPT_ATOM_BASIC = 4, OPT_ATOM_wF = 5,
-                     OPT_ATOM_wV = 6, OPT_ATOM_FULL = 7
-                    };
-enum BOND_LINE_OPTS {OPT_NOBOND, OPT_BOND_BASIC, OPT_BOND_FULL};
-enum ANGLE_LINE_OPTS {OPT_NOANGLE, OPT_ANGLE_BASIC};
+enum ATOM_LINE_OPTS /* atom-line detail levels; NOTE(review): presumably the integers switched on in Append_Custom_Frame (traj.c) -- confirm */
+{
+    OPT_NOATOM = 0,
+    OPT_ATOM_BASIC = 4,
+    OPT_ATOM_wF = 5,
+    OPT_ATOM_wV = 6,    /* traj.c "case 6" writes ATOM_wV lines: original id, x, v, q */
+    OPT_ATOM_FULL = 7,  /* traj.c "case 7" writes ATOM_FULL lines: original id, x, v, f, q */
+};
+
+enum BOND_LINE_OPTS /* NOTE(review): presumably the values compared against out_control->bond_info in traj.c -- confirm */
+{
+    OPT_NOBOND = 0,
+    OPT_BOND_BASIC = 1, /* bond_info == 1 writes BOND_BASIC lines: both original ids, d, BO */
+    OPT_BOND_FULL = 2,  /* bond_info == 2 writes BOND_FULL lines: BOND_BASIC fields plus BO_s, BO_pi, BO_pi2 */
+};
+
+enum ANGLE_LINE_OPTS /* NOTE(review): presumably the values stored in out_control->angle_info -- confirm */
+{
+    OPT_NOANGLE = 0,
+    OPT_ANGLE_BASIC = 1, /* a non-zero angle_info writes ANGLE_BASIC lines: three original ids and RAD2DEG(theta) */
+};
 
 
 struct
@@ -143,10 +160,8 @@ int Skip_Next_Block( gzFile, int*);
   No. of torsion entries (int)
   Torsion info lines as per torsion format.
 */
-int Write_Custom_Header( reax_system*, control_params*,
-                         static_storage*, output_controls* );
-int Write_xyz_Header   ( reax_system*, control_params*,
-                         static_storage*, output_controls* );
+int Write_Custom_Header( reax_system*, control_params*, static_storage*, output_controls* );
+int Write_xyz_Header   ( reax_system*, control_params*, static_storage*, output_controls* );
 
 /*
   Write_Traj_Header( gzfile file,
@@ -168,7 +183,7 @@ char Write_Traj_Header( FILE*, int, char**, char**, control_params* );
           char** various flags);
 */
 int Push_Traj_Frame( /*gzfile*/ FILE*, reax_system*, control_params*,
-                                simulation_data*, static_storage*, list**, char** );
+        simulation_data*, static_storage*, list**, char** );
 
 /*
   Append_Traj_Frame( gzfile file,
@@ -180,11 +195,11 @@ int Push_Traj_Frame( /*gzfile*/ FILE*, reax_system*, control_params*,
                 char** various flags);
 */
 int Append_Custom_Frame( reax_system*, control_params*, simulation_data*,
-                         static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 int Append_xyz_Frame   ( reax_system*, control_params*, simulation_data*,
-                         static_storage*, list**, output_controls* );
-
+        static_storage*, list**, output_controls* );
 
 void Read_Traj( output_controls*, char * );
 
+
 #endif
diff --git a/PuReMD-GPU/src/two_body_interactions.c b/PuReMD-GPU/src/two_body_interactions.c
index 2e7a6daf9039ea26c22b2fcfda5913e46255ad75..d5b53a05e607d043ba5f59f96f51a1065438c1d4 100644
--- a/PuReMD-GPU/src/two_body_interactions.c
+++ b/PuReMD-GPU/src/two_body_interactions.c
@@ -1,19 +1,20 @@
 /*----------------------------------------------------------------------
-  PuReMD-GPU - Reax Force Field Simulator
+  SerialReax - Reax Force Field Simulator
 
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
+  Copyright (2010) Purdue University
   Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
   Ananth Y Grama, ayg@cs.purdue.edu
 
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
+  published by the Free Software Foundation; either version 2 of
   the License, or (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   See the GNU General Public License for more details:
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
@@ -21,14 +22,14 @@
 #include "two_body_interactions.h"
 
 #include "bond_orders.h"
+#include "index_utils.h"
 #include "list.h"
 #include "lookup.h"
 #include "vector.h"
-#include "index_utils.h"
 
 
-void Bond_Energy( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
+void Bond_Energy( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace,
         list **lists, output_controls *out_control )
 {
     int i, j, pj;
@@ -50,12 +51,14 @@ void Bond_Energy( reax_system *system, control_params *control,
     gp10 = system->reaxprm.gp.l[10];
     gp37 = (int) system->reaxprm.gp.l[37];
 
-    for( i=0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         start_i = Start_Index(i, bonds);
         end_i = End_Index(i, bonds);
         //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i );
-        for( pj = start_i; pj < end_i; ++pj )
-            if( i < bonds->select.bond_list[pj].nbr ) {
+        for ( pj = start_i; pj < end_i; ++pj )
+            if ( i < bonds->select.bond_list[pj].nbr )
+            {
                 /* set the pointers */
                 j = bonds->select.bond_list[pj].nbr;
                 type_i = system->atoms[i].type;
@@ -68,15 +71,12 @@ void Bond_Energy( reax_system *system, control_params *control,
                 /* calculate the constants */
                 pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
                 exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
-                CEbo = -twbp->De_s * exp_be12 * 
-                    ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
+                CEbo = -twbp->De_s * exp_be12 *
+                       ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
 
                 /* calculate the Bond Energy */
-                ebond = 
-                    -twbp->De_s * bo_ij->BO_s * exp_be12 
-                    -twbp->De_p * bo_ij->BO_pi 
-                    -twbp->De_pp * bo_ij->BO_pi2;
-
+                ebond = -twbp->De_s * bo_ij->BO_s * exp_be12
+                    - twbp->De_p * bo_ij->BO_pi - twbp->De_pp * bo_ij->BO_pi2;
                 data->E_BE += ebond;
 
                 /* calculate derivatives of Bond Orders */
@@ -85,34 +85,36 @@ void Bond_Energy( reax_system *system, control_params *control,
                 bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
 
 #ifdef TEST_ENERGY
-                fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", 
-                        workspace->orig_id[i], workspace->orig_id[j], 
-                        // i+1, j+1, 
-                        bo_ij->BO, ebond/*, data->E_BE*/ );
-                /* fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", 
-                   workspace->orig_id[i], workspace->orig_id[j], 
+                fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n",
+                         workspace->orig_id[i], workspace->orig_id[j],
+                         // i+1, j+1,
+                         bo_ij->BO, ebond/*, data->E_BE*/ );
+                /* fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n",
+                   workspace->orig_id[i], workspace->orig_id[j],
                    CEbo, -twbp->De_p, -twbp->De_pp );*/
 #endif
 #ifdef TEST_FORCES
                 Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
-                Add_dBOpinpi2( system, lists, i, pj, 
-                        -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
-                        workspace->f_be, workspace->f_be );
+                Add_dBOpinpi2( system, lists, i, pj,
+                               -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp),
+                               workspace->f_be, workspace->f_be );
 #endif
 
                 /* Stabilisation terminal triple bond */
-                if( bo_ij->BO >= 1.00 ) {
-                    if( gp37 == 2 ||
-                            (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || 
-                            (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) {
+                if ( bo_ij->BO >= 1.00 )
+                {
+                    if ( gp37 == 2 ||
+                            (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) ||
+                            (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) )
+                    {
                         // ba = SQR(bo_ij->BO - 2.50);
                         exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
                         //oboa=abo(j1)-boa;
                         //obob=abo(j2)-boa;
-                        exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO));
-                        exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO));
+                        exphua1 = EXP(-gp3 * (workspace->total_bond_order[i] - bo_ij->BO));
+                        exphub1 = EXP(-gp3 * (workspace->total_bond_order[j] - bo_ij->BO));
                         //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2);
-                        exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j]));
+                        exphuov = EXP(gp4 * (workspace->Delta[i] + workspace->Delta[j]));
                         hulpov = 1.0 / (1.0 + 25.0 * exphuov);
 
                         estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
@@ -120,24 +122,22 @@ void Bond_Energy( reax_system *system, control_params *control,
                         //estrain(j2) = estrain(j2) + 0.50*estriph;
                         data->E_BE += estriph;
 
-                        decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * 
-                            ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
-                        decobdboua = -gp10 * exphu * hulpov * 
-                            (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
-                        decobdboub = -gp10 * exphu * hulpov * 
-                            (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+                        decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) *
+                                   ( gp3 - 2.0 * gp7 * (bo_ij->BO - 2.50) );
+                        decobdboua = -gp10 * exphu * hulpov *
+                                     (gp3 * exphua1 + 25.0 * gp4 * exphuov * hulpov * (exphua1 + exphub1));
+                        decobdboub = -gp10 * exphu * hulpov *
+                                     (gp3 * exphub1 + 25.0 * gp4 * exphuov * hulpov * (exphua1 + exphub1));
 
                         bo_ij->Cdbo += decobdbo;
                         workspace->CdDelta[i] += decobdboua;
                         workspace->CdDelta[j] += decobdboub;
-                        //loop_j ++;
-                        //fprintf (stderr, "incrementing loopj %d \n", loop_j);
 #ifdef TEST_ENERGY
-                        fprintf( out_control->ebond, 
-                                "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                                workspace->orig_id[i], workspace->orig_id[j],
-                                //i+1, j+1, 
-                                estriph, decobdbo, decobdboua, decobdboub );
+                        fprintf( out_control->ebond,
+                                 "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
+                                 workspace->orig_id[i], workspace->orig_id[j],
+                                 //i+1, j+1,
+                                 estriph, decobdbo, decobdboua, decobdboub );
 #endif
 #ifdef TEST_FORCES
                         Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
@@ -151,9 +151,9 @@ void Bond_Energy( reax_system *system, control_params *control,
 }
 
 
-void vdW_Coulomb_Energy( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
+void vdW_Coulomb_Energy( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
     int  i, j, pj;
     int  start_i, end_i;
@@ -172,20 +172,22 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
     p_vdW1 = system->reaxprm.gp.l[28];
     p_vdW1i = 1.0 / p_vdW1;
-    far_nbrs = (*lists) + FAR_NBRS; 
+    far_nbrs = (*lists) + FAR_NBRS;
     e_ele = 0;
     e_vdW = 0;
     e_core = 0;
     de_core = 0;
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         start_i = Start_Index(i, far_nbrs);
         end_i   = End_Index(i, far_nbrs);
         // fprintf( stderr, "i: %d, start: %d, end: %d\n",
         //     i, start_i, end_i );
 
-        for( pj = start_i; pj < end_i; ++pj )
-            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
+        for ( pj = start_i; pj < end_i; ++pj )
+            if ( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut )
+            {
                 nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
                 j = nbr_pj->nbr;
                 r_ij = nbr_pj->d;
@@ -202,15 +204,16 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
                 Tap = Tap * r_ij + control->Tap1;
                 Tap = Tap * r_ij + control->Tap0;
 
-                dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
-                dTap = dTap * r_ij + 5*control->Tap5;
-                dTap = dTap * r_ij + 4*control->Tap4;
-                dTap = dTap * r_ij + 3*control->Tap3;
-                dTap = dTap * r_ij + 2*control->Tap2;
-                dTap += control->Tap1/r_ij;
+                dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6;
+                dTap = dTap * r_ij + 5 * control->Tap5;
+                dTap = dTap * r_ij + 4 * control->Tap4;
+                dTap = dTap * r_ij + 3 * control->Tap3;
+                dTap = dTap * r_ij + 2 * control->Tap2;
+                dTap += control->Tap1 / r_ij;
 
                 /*vdWaals Calculations*/
-                if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3) {
+                if (system->reaxprm.gp.vdw_type == 1 || system->reaxprm.gp.vdw_type == 3)
+                {
                     // shielding
                     powr_vdW1 = POW(r_ij, p_vdW1);
                     powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
@@ -219,35 +222,37 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
                     exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
                     exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
 
-                    data->E_vdW += e_vdW = 
-                        self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+                    data->E_vdW += e_vdW =
+                                       self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);
 
-                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-                        POW(r_ij, p_vdW1 - 2.0);
+                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
+                            POW(r_ij, p_vdW1 - 2.0);
 
-                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
-                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-                            (exp1 - exp2) * dfn13 );
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) -
+                                         Tap * twbp->D * (twbp->alpha / twbp->r_vdW) *
+                                         (exp1 - exp2) * dfn13 );
                 }
-                else{ // no shielding
+                else  // no shielding
+                {
                     exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
                     exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
 
-                    data->E_vdW += e_vdW = 
-                        self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+                    data->E_vdW += e_vdW =
+                                       self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);
 
-                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-                            (exp1 - exp2) );
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) -
+                                         Tap * twbp->D * (twbp->alpha / twbp->r_vdW) *
+                                         (exp1 - exp2) );
                 }
 
-                if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3) {
+                if (system->reaxprm.gp.vdw_type == 2 || system->reaxprm.gp.vdw_type == 3)
+                {
-                    // innner wall
+                    // inner wall
-                    e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+                    e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore)));
                     e_vdW += self_coef * Tap * e_core;
                     data->E_vdW += self_coef * Tap * e_core;
 
-                    de_core = -(twbp->acore/twbp->rcore) * e_core;
+                    de_core = -(twbp->acore / twbp->rcore) * e_core;
                     CEvd += self_coef * ( dTap * e_core + Tap * de_core );
                 }
 
@@ -257,24 +262,26 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
                 tmp = Tap / dr3gamij_3;
                 //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H
-                data->E_Ele += e_ele = 
-                    self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp;
+                data->E_Ele += e_ele =
+                                   self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp;
 
 
                 CEclmb = self_coef * C_ele * system->atoms[i].q * system->atoms[j].q *
-                    ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-                /*CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* 
+                         ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+                /*CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q*
                   ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;*/
 
 
-                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-                    rvec_ScaledAdd( system->atoms[i].f, 
-                            -(CEvd+CEclmb), nbr_pj->dvec );
-                    rvec_ScaledAdd( system->atoms[j].f, 
-                            +(CEvd+CEclmb), nbr_pj->dvec );
+                if ( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT )
+                {
+                    rvec_ScaledAdd( system->atoms[i].f,
+                                    -(CEvd + CEclmb), nbr_pj->dvec );
+                    rvec_ScaledAdd( system->atoms[j].f,
+                                    +(CEvd + CEclmb), nbr_pj->dvec );
                 }
-                else { // NPT, iNPT or sNPT
-                    /* for pressure coupling, terms not related to bond order 
+                else   // NPT, iNPT or sNPT
+                {
+                    /* for pressure coupling, terms not related to bond order
                        derivatives are added directly into pressure vector/tensor */
                     rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
 
@@ -284,47 +291,47 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
                     rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
                     rvec_Add( data->ext_press, ext_press );
 
-                    /*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)", 
+                    /*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)",
                       i,j,nbr_pj->rel_box[0],nbr_pj->rel_box[1],nbr_pj->rel_box[2] );
 
                       fprintf( stderr, "force(%f %f %f)", temp[0], temp[1], temp[2] );
 
-                      fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n",        
+                      fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n",
                       data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
 
-                    /* This part is intended for a fully-flexible box */          
-                    /* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec, 
+                    /* This part is intended for a fully-flexible box */
+                    /* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec,
                        system->atoms[i].x );
-                       rtensor_Scale( total_rtensor, 
+                       rtensor_Scale( total_rtensor,
                        F_C * -(CEvd + CEclmb), temp_rtensor );
-                       rvec_OuterProduct( temp_rtensor, 
+                       rvec_OuterProduct( temp_rtensor,
                        nbr_pj->dvec, system->atoms[j].x );
-                       rtensor_ScaledAdd( total_rtensor, 
+                       rtensor_ScaledAdd( total_rtensor,
                        F_C * +(CEvd + CEclmb), temp_rtensor );
 
                        if( nbr_pj->imaginary )
-                    // This is an external force due to an imaginary nbr
-                    rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
-                    else
-                    // This interaction is completely internal
-                    rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                       // This is an external force due to an imaginary nbr
+                       rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
+                       else
+                       // This interaction is completely internal
+                       rtensor_Add( data->flex_bar.P, total_rtensor ); */
                 }
 
 #ifdef TEST_ENERGY
                 rvec_MakeZero( temp );
                 rvec_ScaledAdd( temp, +CEvd, nbr_pj->dvec );
                 fprintf( out_control->evdw,
-                        "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                        //i+1, j+1,
-                        MIN( workspace->orig_id[i], workspace->orig_id[j] ), 
-                        MAX( workspace->orig_id[i], workspace->orig_id[j] ), 
-                        r_ij, e_vdW, temp[0], temp[1], temp[2]/*, data->E_vdW*/ );
+                         "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                         //i+1, j+1,
+                         MIN( workspace->orig_id[i], workspace->orig_id[j] ),
+                         MAX( workspace->orig_id[i], workspace->orig_id[j] ),
+                         r_ij, e_vdW, temp[0], temp[1], temp[2]/*, data->E_vdW*/ );
 
                 fprintf( out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                        MIN( workspace->orig_id[i], workspace->orig_id[j] ),
-                        MAX( workspace->orig_id[i], workspace->orig_id[j] ), 
-                        r_ij, system->atoms[i].q, system->atoms[j].q, 
-                        e_ele/*, data->E_Ele*/ );
+                         MIN( workspace->orig_id[i], workspace->orig_id[j] ),
+                         MAX( workspace->orig_id[i], workspace->orig_id[j] ),
+                         r_ij, system->atoms[i].q, system->atoms[j].q,
+                         e_ele/*, data->E_Ele*/ );
 #endif
 #ifdef TEST_FORCES
                 rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
@@ -337,13 +344,13 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
     // fclose( fout );
 
-    // fprintf( stderr, "nonbonded: ext_press (%24.15e %24.15e %24.15e)\n", 
+    // fprintf( stderr, "nonbonded: ext_press (%24.15e %24.15e %24.15e)\n",
     // data->ext_press[0], data->ext_press[1], data->ext_press[2] );
 }
 
 
-void LR_vdW_Coulomb( reax_system *system, control_params *control, 
-        int i, int j, real r_ij, LR_data *lr )
+void LR_vdW_Coulomb( reax_system *system, control_params *control,
+                     int i, int j, real r_ij, LR_data *lr )
 {
     real p_vdW1 = system->reaxprm.gp.l[28];
     real p_vdW1i = 1.0 / p_vdW1;
@@ -367,12 +374,12 @@ void LR_vdW_Coulomb( reax_system *system, control_params *control,
     Tap = Tap * r_ij + control->Tap1;
     Tap = Tap * r_ij + control->Tap0;
 
-    dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
-    dTap = dTap * r_ij + 5*control->Tap5;
-    dTap = dTap * r_ij + 4*control->Tap4;
-    dTap = dTap * r_ij + 3*control->Tap3;
-    dTap = dTap * r_ij + 2*control->Tap2;
-    dTap += control->Tap1/r_ij;
+    dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6;
+    dTap = dTap * r_ij + 5 * control->Tap5;
+    dTap = dTap * r_ij + 4 * control->Tap4;
+    dTap = dTap * r_ij + 3 * control->Tap3;
+    dTap = dTap * r_ij + 2 * control->Tap2;
+    dTap += control->Tap1 / r_ij;
 
 
     /* vdWaals calculations */
@@ -383,20 +390,21 @@ void LR_vdW_Coulomb( reax_system *system, control_params *control,
     exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
     exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
 
-    lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);        
+    lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
     /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\
-Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n",
-Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2), 
-powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
+       Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n",
+       Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2),
+       powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
 
     dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0);
 
-    lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) - 
-        Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+    lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) -
+               Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
 
     /*vdWaals Calculations*/
-    if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3)
-    { // shielding
+    if (system->reaxprm.gp.vdw_type == 1 || system->reaxprm.gp.vdw_type == 3)
+    {
+        // shielding
         powr_vdW1 = POW(r_ij, p_vdW1);
         powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
 
@@ -404,30 +412,32 @@ powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
         exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
         exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
 
-        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);        
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
 
-        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-            POW(r_ij, p_vdW1 - 2.0);
+        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
+                POW(r_ij, p_vdW1 - 2.0);
 
-        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
+                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
     }
-    else{ // no shielding
+    else  // no shielding
+    {
         exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
         exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
 
         lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
 
-        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
+                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
     }
 
-    if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3)
-    { // innner wall
-        e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+    if (system->reaxprm.gp.vdw_type == 2 || system->reaxprm.gp.vdw_type == 3)
+    {
+        // inner wall
+        e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore)));
         lr->e_vdW += Tap * e_core;
 
-        de_core = -(twbp->acore/twbp->rcore) * e_core;
+        de_core = -(twbp->acore / twbp->rcore) * e_core;
         lr->CEvd += dTap * e_core + Tap * de_core;
     }
 
@@ -439,10 +449,10 @@ powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
     lr->H = EV_to_KCALpMOL * tmp;
     lr->e_ele = C_ele * tmp;
     /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\
-Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n",
-i, system->atoms[i].type, j, system->atoms[j].type, 
-twbp->gamma, Tap, dr3gamij_3, 
-system->atoms[i].q, system->atoms[j].q ); */
+       Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n",
+       i, system->atoms[i].type, j, system->atoms[j].type,
+       twbp->gamma, Tap, dr3gamij_3,
+       system->atoms[i].q, system->atoms[j].q ); */
 
     lr->CEclmb = C_ele * ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
     /* fprintf( stdout, "%d %d\t%g\t%g  %g\t%g  %g\t%g  %g\n",
@@ -454,10 +464,9 @@ system->atoms[i].q, system->atoms[j].q ); */
 }
 
 
-void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
-        simulation_data *data, 
-        static_storage *workspace, list **lists, 
-        output_controls *out_control )
+void Tabulated_vdW_Coulomb_Energy( reax_system *system,
+        control_params *control, simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
     int i, j, pj, r, steps, update_freq, update_energies;
     int type_i, type_j, tmin, tmax;
@@ -474,13 +483,16 @@ void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
     update_freq = out_control->energy_update_freq;
     update_energies = update_freq > 0 && steps % update_freq == 0;
 
-    for( i = 0; i < system->N; ++i ) {
+    for ( i = 0; i < system->N; ++i )
+    {
         type_i  = system->atoms[i].type;
-        start_i = Start_Index(i,far_nbrs);
-        end_i   = End_Index(i,far_nbrs);
+        start_i = Start_Index(i, far_nbrs);
+        end_i   = End_Index(i, far_nbrs);
 
-        for( pj = start_i; pj < end_i; ++pj ) 
-            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            if ( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut )
+            {
                 nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
                 j      = nbr_pj->nbr;
                 type_j = system->atoms[j].type;
@@ -488,43 +500,46 @@ void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
                 self_coef = (i == j) ? 0.5 : 1.0;
                 tmin  = MIN( type_i, type_j );
                 tmax  = MAX( type_i, type_j );
-                t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); 
+                t = &( LR[ index_lr(tmin,tmax,system->reaxprm.num_atom_types) ] ); 
 
                 /* Cubic Spline Interpolation */
                 r = (int)(r_ij * t->inv_dx);
-                if( r == 0 )  ++r;
-                base = (real)(r+1) * t->dx;
+                if ( r == 0 )  ++r;
+                base = (real)(r + 1) * t->dx;
                 dif = r_ij - base;
                 //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif);
 
-                if( update_energies ) {
-                    e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
-                        t->vdW[r].a;
+                if ( update_energies )
+                {
+                    e_vdW = ((t->vdW[r].d * dif + t->vdW[r].c) * dif + t->vdW[r].b) * dif +
+                            t->vdW[r].a;
                     e_vdW *= self_coef;
 
-                    e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-                        t->ele[r].a;
+                    e_ele = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) * dif +
+                            t->ele[r].a;
                     e_ele *= self_coef * system->atoms[i].q * system->atoms[j].q;
 
                     data->E_vdW += e_vdW;
                     data->E_Ele += e_ele;
-                }    
+                }
 
-                CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
-                    t->CEvd[r].a;
+                CEvd = ((t->CEvd[r].d * dif + t->CEvd[r].c) * dif + t->CEvd[r].b) * dif +
+                       t->CEvd[r].a;
                 CEvd *= self_coef;
                 //CEvd = (3*t->vdW[r].d*dif + 2*t->vdW[r].c)*dif + t->vdW[r].b;
 
-                CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
-                    t->CEclmb[r].a;
+                CEclmb = ((t->CEclmb[r].d * dif + t->CEclmb[r].c) * dif + t->CEclmb[r].b) * dif +
+                         t->CEclmb[r].a;
                 CEclmb *= self_coef * system->atoms[i].q * system->atoms[j].q;
 
-                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                if ( control->ensemble == NVE || control->ensemble == NVT  || control->ensemble == bNVT)
+                {
                     rvec_ScaledAdd( system->atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec );
                     rvec_ScaledAdd( system->atoms[j].f, +(CEvd + CEclmb), nbr_pj->dvec );
                 }
-                else { // NPT, iNPT or sNPT
-                    /* for pressure coupling, terms not related to bond order 
+                else   // NPT, iNPT or sNPT
+                {
+                    /* for pressure coupling, terms not related to bond order
                        derivatives are added directly into pressure vector/tensor */
                     rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
                     rvec_ScaledAdd( system->atoms[i].f, -1., temp );
@@ -535,11 +550,11 @@ void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
 #ifdef TEST_ENERGY
                 fprintf(out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
-                        workspace->orig_id[i], workspace->orig_id[j], 
+                        workspace->orig_id[i], workspace->orig_id[j],
                         r_ij, e_vdW, data->E_vdW );
-                fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                fprintf(out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
                         workspace->orig_id[i], workspace->orig_id[j],
-                        r_ij, system->atoms[i].q, system->atoms[j].q, 
+                        r_ij, system->atoms[i].q, system->atoms[j].q,
                         e_ele, data->E_Ele );
 #endif
 #ifdef TEST_FORCES
@@ -549,23 +564,24 @@ void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
                 rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
 #endif
             }
+        }
     }
 }
 
 
 #if defined(OLD)
-    /* Linear extrapolation */
-    /*p     = (r_ij * t->inv_dx;
-      r     = (int) p;
-      prev  = &( t->y[r] );
-      next  = &( t->y[r+1] );
-
-      tmp    = p - r;
-      e_vdW  = self_coef * (prev->e_vdW + tmp*(next->e_vdW - prev->e_vdW ));
-      CEvd   = self_coef * (prev->CEvd  + tmp*(next->CEvd  - prev->CEvd  ));
-
-      e_ele  = self_coef * (prev->e_ele + tmp*(next->e_ele - prev->e_ele ));
-      e_ele  = e_ele  * system->atoms[i].q * system->atoms[j].q;
-      CEclmb = self_coef * (prev->CEclmb+tmp*(next->CEclmb - prev->CEclmb));
-      CEclmb = CEclmb * system->atoms[i].q * system->atoms[j].q;*/
+/* Linear interpolation */
+/*p     = r_ij * t->inv_dx;
+  r     = (int) p;
+  prev  = &( t->y[r] );
+  next  = &( t->y[r+1] );
+
+  tmp    = p - r;
+  e_vdW  = self_coef * (prev->e_vdW + tmp*(next->e_vdW - prev->e_vdW ));
+  CEvd   = self_coef * (prev->CEvd  + tmp*(next->CEvd  - prev->CEvd  ));
+
+  e_ele  = self_coef * (prev->e_ele + tmp*(next->e_ele - prev->e_ele ));
+  e_ele  = e_ele  * system->atoms[i].q * system->atoms[j].q;
+  CEclmb = self_coef * (prev->CEclmb+tmp*(next->CEclmb - prev->CEclmb));
+  CEclmb = CEclmb * system->atoms[i].q * system->atoms[j].q;*/
 #endif
diff --git a/PuReMD-GPU/src/vector.c b/PuReMD-GPU/src/vector.c
index 7cf06eb8e6cb1560b651b8b16a091f3a387cdb6c..e396344d173a6d5343faf9d675f48a8ea4e0ca04 100644
--- a/PuReMD-GPU/src/vector.c
+++ b/PuReMD-GPU/src/vector.c
@@ -21,53 +21,90 @@
 #include "vector.h"
 
 
-int Vector_isZero( real* v, int k )
+inline int Vector_isZero( const real * const v, const unsigned int k )
 {
-    for( --k; k>=0; --k )
-        if( fabs( v[k] ) > ALMOST_ZERO )
-            return 0;
+    unsigned int i;
 
-    return 1;
+    #pragma omp master
+    {
+        ret = TRUE;
+    }
+
+    #pragma omp barrier
+
+    #pragma omp for reduction(&&: ret) schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        if ( FABS( v[i] ) > ALMOST_ZERO )
+        {
+            ret = FALSE;
+        }
+    }
+
+    return ret;
 }
 
 
-void Vector_MakeZero( real *v, int k )
+inline void Vector_MakeZero( real * const v, const unsigned int k )
 {
-    for( --k; k>=0; --k )
-        v[k] = 0;
+    unsigned int i;
+
+    #pragma omp for schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        v[i] = ZERO;
+    }
 }
 
 
-void Vector_Copy( real* dest, real* v, int k )
+inline void Vector_Copy( real * const dest, const real * const v, const unsigned int k )
 {
-    for( --k; k>=0; --k )
-        dest[k] = v[k];
+    unsigned int i;
+
+    #pragma omp for schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] = v[i];
+    }
 }
 
 
-void Vector_Print( FILE *fout, char *vname, real *v, int k )
+void Vector_Print( FILE * const fout, const char * const vname, const real * const v,
+                   const unsigned int k )
 {
-    int i;
+    unsigned int i;
 
     fprintf( fout, "%s:\n", vname );
-    for( i = 0; i < k; ++i )
+    for ( i = 0; i < k; ++i )
+    {
         fprintf( fout, "%24.15e\n", v[i] );
+    }
     fprintf( fout, "\n" );
 }
 
 
-real Norm( real* v1, int k )
+inline real Norm( const real * const v1, const unsigned int k )
 {
-    real ret = 0;
+    unsigned int i;
+
+    #pragma omp master
+    {
+        ret2 = ZERO;
+    }
 
-    for( --k; k>=0; --k )
-        ret +=  SQR( v1[k] );
+    #pragma omp barrier
+
+    #pragma omp for reduction(+: ret2) schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        ret2 +=  SQR( v1[i] );
+    }
 
-    return SQRT( ret );
+    return SQRT( ret2 );
 }
 
 
-void rvec_Sum( rvec ret, rvec v1 ,rvec v2 )
+inline void rvec_Sum( rvec ret, const rvec v1 , const rvec v2 )
 {
     ret[0] = v1[0] + v2[0];
     ret[1] = v1[1] + v2[1];
@@ -75,13 +112,14 @@ void rvec_Sum( rvec ret, rvec v1 ,rvec v2 )
 }
 
 
-real rvec_ScaledDot( real c1, rvec v1, real c2, rvec v2 )
+inline real rvec_ScaledDot( const real c1, const rvec v1,
+        const real c2, const rvec v2 )
 {
-    return (c1*c2) * (v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]);
+    return (c1 * c2) * (v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]);
 }
 
 
-void rvec_Multiply( rvec r, rvec v1, rvec v2 )
+inline void rvec_Multiply( rvec r, const rvec v1, const rvec v2 )
 {
     r[0] = v1[0] * v2[0];
     r[1] = v1[1] * v2[1];
@@ -89,7 +127,7 @@ void rvec_Multiply( rvec r, rvec v1, rvec v2 )
 }
 
 
-void rvec_Divide( rvec r, rvec v1, rvec v2 )
+inline void rvec_Divide( rvec r, const rvec v1, const rvec v2 )
 {
     r[0] = v1[0] / v2[0];
     r[1] = v1[1] / v2[1];
@@ -97,7 +135,7 @@ void rvec_Divide( rvec r, rvec v1, rvec v2 )
 }
 
 
-void rvec_iDivide( rvec r, rvec v1, ivec v2 )
+inline void rvec_iDivide( rvec r, const rvec v1, const ivec v2 )
 {
     r[0] = v1[0] / v2[0];
     r[1] = v1[1] / v2[1];
@@ -105,7 +143,7 @@ void rvec_iDivide( rvec r, rvec v1, ivec v2 )
 }
 
 
-void rvec_Invert( rvec r, rvec v )
+inline void rvec_Invert( rvec r, const rvec v )
 {
     r[0] = 1. / v[0];
     r[1] = 1. / v[1];
@@ -113,154 +151,189 @@ void rvec_Invert( rvec r, rvec v )
 }
 
 
-void rvec_OuterProduct( rtensor r, rvec v1, rvec v2 )
+inline void rvec_OuterProduct( rtensor r, const rvec v1, const rvec v2 )
 {
-    int i, j;
+    unsigned int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             r[i][j] = v1[i] * v2[j];
+        }
+    }
 }
 
 
-
-int rvec_isZero( rvec v )
+inline int rvec_isZero( const rvec v )
 {
-    if( fabs(v[0]) > ALMOST_ZERO || 
-            fabs(v[1]) > ALMOST_ZERO || 
+    if ( fabs(v[0]) > ALMOST_ZERO ||
+            fabs(v[1]) > ALMOST_ZERO ||
             fabs(v[2]) > ALMOST_ZERO )
-        return 0;
-    return 1;
+    {
+        return FALSE;
+    }
+    return TRUE;
 }
 
 
-void rtensor_Multiply( rtensor ret, rtensor m1, rtensor m2 )
+inline void rtensor_Multiply( rtensor ret, rtensor m1, rtensor m2 )
 {
-    int i, j, k;
+    unsigned int i, j, k;
     rtensor temp;
 
     // check if the result matrix is the same as one of m1, m2.
-    // if so, we cannot modify the contents of m1 or m2, so 
+    // if so, we cannot modify the contents of m1 or m2, so
     // we have to use a temp matrix.
-    if( ret == m1 || ret == m2 )
+    if ( ret == m1 || ret == m2 )
     {
-        for( i = 0; i < 3; ++i )
-            for( j = 0; j < 3; ++j )
+        for ( i = 0; i < 3; ++i )
+            for ( j = 0; j < 3; ++j )
             {
-                temp[i][j] = 0;        
-                for( k = 0; k < 3; ++k )
+                temp[i][j] = 0;
+                for ( k = 0; k < 3; ++k )
                     temp[i][j] += m1[i][k] * m2[k][j];
             }
 
-        for( i = 0; i < 3; ++i )
-            for( j = 0; j < 3; ++j )
-                ret[i][j] = temp[i][j];    
+        for ( i = 0; i < 3; ++i )
+            for ( j = 0; j < 3; ++j )
+                ret[i][j] = temp[i][j];
     }
     else
     {
-        for( i = 0; i < 3; ++i )
-            for( j = 0; j < 3; ++j )
+        for ( i = 0; i < 3; ++i )
+            for ( j = 0; j < 3; ++j )
             {
-                ret[i][j] = 0;        
-                for( k = 0; k < 3; ++k )
+                ret[i][j] = 0;
+                for ( k = 0; k < 3; ++k )
                     ret[i][j] += m1[i][k] * m2[k][j];
             }
     }
 }
 
 
-void rtensor_MatVec( rvec ret, rtensor m, rvec v )
+inline void rtensor_MatVec( rvec ret, rtensor m, const rvec v )
 {
-    int i;
+    unsigned int i;
     rvec temp;
 
-    // if ret is the same vector as v, we cannot modify the 
+    // if ret is the same vector as v, we cannot modify the
     // contents of v until all computation is finished.
-    if( ret == v )
+    if ( ret == v )
     {
-        for( i = 0; i < 3; ++i )
+        for ( i = 0; i < 3; ++i )
+        {
             temp[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
+        }
 
-        for( i = 0; i < 3; ++i )
+        for ( i = 0; i < 3; ++i )
+        {
             ret[i] = temp[i];
+        }
     }
     else
     {
-        for( i = 0; i < 3; ++i )
+        for ( i = 0; i < 3; ++i )
+        {
             ret[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
+        }
     }
 }
 
 
-void rtensor_Scale( rtensor ret, real c, rtensor m )
+inline void rtensor_Scale( rtensor ret, const real c, rtensor m )
 {
-    int i, j;
+    unsigned int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] = c * m[i][j];
+        }
+    }
 }
 
 
-void rtensor_Add( rtensor ret, rtensor t )
+inline void rtensor_Add( rtensor ret, rtensor t )
 {
     int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] += t[i][j];
+        }
+    }
 }
 
 
-void rtensor_ScaledAdd( rtensor ret, real c, rtensor t )
+inline void rtensor_ScaledAdd( rtensor ret, const real c, rtensor t )
 {
-    int i, j;
+    unsigned int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] += c * t[i][j];
+        }
+    }
 }
 
 
-void rtensor_Sum( rtensor ret, rtensor t1, rtensor t2 )
+inline void rtensor_Sum( rtensor ret, rtensor t1, rtensor t2 )
 {
-    int i, j;
+    unsigned int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] = t1[i][j] + t2[i][j];
+        }
+    }
 }
 
 
-void rtensor_ScaledSum( rtensor ret, real c1, rtensor t1, 
-        real c2, rtensor t2 )
+inline void rtensor_ScaledSum( rtensor ret, const real c1, rtensor t1,
+                               const real c2, rtensor t2 )
 {
-    int i, j;
+    unsigned int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] = c1 * t1[i][j] + c2 * t2[i][j];
+        }
+    }
 }
 
 
-void rtensor_Copy( rtensor ret, rtensor t )
+inline void rtensor_Copy( rtensor ret, rtensor t )
 {
-    int i, j;
+    unsigned int i, j;
 
-    for( i = 0; i < 3; ++i )
-        for( j = 0; j < 3; ++j )
+    for ( i = 0; i < 3; ++i )
+    {
+        for ( j = 0; j < 3; ++j )
+        {
             ret[i][j] = t[i][j];
+        }
+    }
 }
 
 
-void rtensor_Identity( rtensor t )
+inline void rtensor_Identity( rtensor t )
 {
     t[0][0] = t[1][1] = t[2][2] = 1;
     t[0][1] = t[0][2] = t[1][0] = t[1][2] = t[2][0] = t[2][1] = ZERO;
 }
 
 
-void rtensor_MakeZero( rtensor t )
+inline void rtensor_MakeZero( rtensor t )
 {
     t[0][0] = t[0][1] = t[0][2] = ZERO;
     t[1][0] = t[1][1] = t[1][2] = ZERO;
@@ -268,50 +341,58 @@ void rtensor_MakeZero( rtensor t )
 }
 
 
-void rtensor_Transpose( rtensor ret, rtensor t )
+inline void rtensor_Transpose( rtensor ret, rtensor t )
 {
-    ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2];
-    ret[0][1] = t[1][0], ret[0][2] = t[2][0];
-    ret[1][0] = t[0][1], ret[1][2] = t[2][1];
-    ret[2][0] = t[0][2], ret[2][1] = t[1][2];
+    ret[0][0] = t[0][0];
+    ret[1][1] = t[1][1];
+    ret[2][2] = t[2][2];
+
+    ret[0][1] = t[1][0];
+    ret[0][2] = t[2][0];
+
+    ret[1][0] = t[0][1];
+    ret[1][2] = t[2][1];
+
+    ret[2][0] = t[0][2];
+    ret[2][1] = t[1][2];
 }
 
 
-real rtensor_Det( rtensor t )
+inline real rtensor_Det( rtensor t )
 {
     return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) +
-            t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
-            t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
+             t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
+             t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
 }
 
 
-real rtensor_Trace( rtensor t )
+inline real rtensor_Trace( rtensor t )
 {
     return (t[0][0] + t[1][1] + t[2][2]);
 }
 
 
-void Print_rTensor(FILE* fp, rtensor t)
+void Print_rTensor(FILE * const fp, rtensor t)
 {
-    int i, j;
+    unsigned int i, j;
 
-    for (i=0; i < 3; i++)
+    for (i = 0; i < 3; i++)
     {
-        fprintf(fp,"[");
-        for (j=0; j < 3; j++)
-            fprintf(fp,"%8.3f,\t",t[i][j]);
-        fprintf(fp,"]\n");
+        fprintf(fp, "[");
+        for (j = 0; j < 3; j++)
+            fprintf(fp, "%8.3f,\t", t[i][j]);
+        fprintf(fp, "]\n");
     }
 }
 
 
-void ivec_MakeZero( ivec v )
+inline void ivec_MakeZero( ivec v )
 {
     v[0] = v[1] = v[2] = 0;
 }
 
 
-void ivec_rScale( ivec dest, real C, rvec src )
+inline void ivec_rScale( ivec dest, const real C, const rvec src )
 {
     dest[0] = (int)(C * src[0]);
     dest[1] = (int)(C * src[1]);
@@ -319,20 +400,22 @@ void ivec_rScale( ivec dest, real C, rvec src )
 }
 
 
-int ivec_isZero( ivec v )
+inline int ivec_isZero( const ivec v )
 {
-    if( v[0]==0 && v[1]==0 && v[2]==0 )
-        return 1;
-    return 0;
+    if ( v[0] == 0 && v[1] == 0 && v[2] == 0 )
+    {
+        return TRUE;
+    }
+    return FALSE;
 }
 
 
-int ivec_isEqual( ivec v1, ivec v2 )
+inline int ivec_isEqual( const ivec v1, const ivec v2 )
 {
-    if( v1[0]==v2[0] && v1[1]==v2[1] && v1[2]==v2[2] )
-        return 1;
+    if ( v1[0] == v2[0] && v1[1] == v2[1] && v1[2] == v2[2] )
+    {
+        return TRUE;
+    }
 
-    return 0;
+    return FALSE;
 }
-
-
diff --git a/PuReMD-GPU/src/vector.h b/PuReMD-GPU/src/vector.h
index e1111e514928e79fc79197a0f2486d5eefb1cfa3..79748544fb8349bb797efe246e3430561fefef14 100644
--- a/PuReMD-GPU/src/vector.h
+++ b/PuReMD-GPU/src/vector.h
@@ -26,72 +26,85 @@
 #include "random.h"
 
 
+/* file-scope so it is shared by OpenMP threads (Vector_isZero); static avoids duplicate symbols across translation units */
+static unsigned int ret;
+/* file-scope so it is shared by OpenMP threads (Dot, Norm); static for the same reason */
+static real ret2;
+
+
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-int  Vector_isZero( real*, int );
-void Vector_MakeZero( real*, int );
-void Vector_Copy( real*, real*, int );
-//void Vector_Scale( real*, real, real*, int );
-//void Vector_Sum( real*, real, real*, real, real*, int );
-//void Vector_Add( real*, real, real*, int );
-void Vector_Print( FILE*, char*, real*, int );
-real Norm( real*, int );
-
-void rvec_Sum( rvec, rvec, rvec );
-real rvec_ScaledDot( real, rvec, real, rvec );
-void rvec_Multiply( rvec, rvec, rvec );
-void rvec_Divide( rvec, rvec, rvec );
-void rvec_iDivide( rvec, rvec, ivec );
-void rvec_Invert( rvec, rvec );
-void rvec_OuterProduct( rtensor, rvec, rvec );
-int  rvec_isZero( rvec );
+int Vector_isZero( const real * const, const unsigned int );
+void Vector_MakeZero( real * const, const unsigned int );
+void Vector_Copy( real * const, const real * const, const unsigned int );
+void Vector_Print( FILE * const, const char * const, const real * const, const unsigned int );
+real Norm( const real * const, const unsigned int );
+
+void rvec_Sum( rvec, const rvec, const rvec );
+real rvec_ScaledDot( const real, const rvec, const real, const rvec );
+void rvec_Multiply( rvec, const rvec, const rvec );
+void rvec_Divide( rvec, const rvec, const rvec );
+void rvec_iDivide( rvec, const rvec, const ivec );
+void rvec_Invert( rvec, const rvec );
+void rvec_OuterProduct( rtensor, const rvec, const rvec );
+int rvec_isZero( const rvec );
 
 void rtensor_MakeZero( rtensor );
 void rtensor_Multiply( rtensor, rtensor, rtensor );
-void rtensor_MatVec( rvec, rtensor, rvec );
-void rtensor_Scale( rtensor, real, rtensor );
+void rtensor_MatVec( rvec, rtensor, const rvec );
+void rtensor_Scale( rtensor, const real, rtensor );
 void rtensor_Add( rtensor, rtensor );
-void rtensor_ScaledAdd( rtensor, real, rtensor );
+void rtensor_ScaledAdd( rtensor, const real, rtensor );
 void rtensor_Sum( rtensor, rtensor, rtensor );
-void rtensor_ScaledSum( rtensor, real, rtensor, real, rtensor );
-void rtensor_Scale( rtensor, real, rtensor );
+void rtensor_ScaledSum( rtensor, const real, rtensor, const real, rtensor );
+void rtensor_Scale( rtensor, const real, rtensor );
 void rtensor_Copy( rtensor, rtensor );
 void rtensor_Identity( rtensor );
 void rtensor_Transpose( rtensor, rtensor );
 real rtensor_Det( rtensor );
 real rtensor_Trace( rtensor );
 
-void Print_rTensor(FILE*, rtensor);
+void Print_rTensor(FILE * const, rtensor);
 
-int  ivec_isZero( ivec );
-int  ivec_isEqual( ivec, ivec );
+int ivec_isZero( const ivec );
+int ivec_isEqual( const ivec, const ivec );
 void ivec_MakeZero( ivec );
-void ivec_rScale( ivec, real, rvec );
+void ivec_rScale( ivec, const real, const rvec );
 
 
-static inline HOST_DEVICE real Dot( real* v1, real* v2, int k )
+static inline HOST_DEVICE real Dot( const real * const v1, const real * const v2, const unsigned int k )
 {
-    real ret = 0;
+    unsigned int i;
+
+    #pragma omp master
+    {
+        ret2 = ZERO;
+    }
+
+    #pragma omp barrier
 
-    for ( --k; k >= 0; --k )
-        ret +=  v1[k] * v2[k];
 
-    return ret;
+    #pragma omp for reduction(+: ret2) schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        ret2 += v1[i] * v2[i];
+    }
+
+    return ret2;
 }
 
 
-/////////////////////////////
-//rvec functions
-/////////////////////////////
 static inline HOST_DEVICE void rvec_MakeZero( rvec v )
 {
-    v[0] = v[1] = v[2] = ZERO;
+    v[0] = ZERO;
+    v[1] = ZERO;
+    v[2] = ZERO;
 }
 
 
-static inline HOST_DEVICE void rvec_Add( rvec ret, rvec v )
+static inline HOST_DEVICE void rvec_Add( rvec ret, const rvec v )
 {
     ret[0] += v[0];
     ret[1] += v[1];
@@ -99,13 +112,15 @@ static inline HOST_DEVICE void rvec_Add( rvec ret, rvec v )
 }
 
 
-static inline HOST_DEVICE void rvec_Copy( rvec dest, rvec src )
+static inline HOST_DEVICE void rvec_Copy( rvec dest, const rvec src )
 {
-    dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
+    dest[0] = src[0];
+    dest[1] = src[1];
+    dest[2] = src[2];
 }
 
 
-static inline HOST_DEVICE void rvec_Cross( rvec ret, rvec v1, rvec v2 )
+static inline HOST_DEVICE void rvec_Cross( rvec ret, const rvec v1, const rvec v2 )
 {
     ret[0] = v1[1] * v2[2] - v1[2] * v2[1];
     ret[1] = v1[2] * v2[0] - v1[0] * v2[2];
@@ -113,13 +128,16 @@ static inline HOST_DEVICE void rvec_Cross( rvec ret, rvec v1, rvec v2 )
 }
 
 
-static inline HOST_DEVICE void rvec_ScaledAdd( rvec ret, real c, rvec v )
+static inline HOST_DEVICE void rvec_ScaledAdd( rvec ret, const real c, const rvec v )
 {
-    ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2];
+    ret[0] += c * v[0];
+    ret[1] += c * v[1];
+    ret[2] += c * v[2];
 }
 
 
-static inline HOST_DEVICE void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rvec v2 )
+static inline HOST_DEVICE void rvec_ScaledSum( rvec ret, const real c1, const rvec v1,
+        const real c2, const rvec v2 )
 {
     ret[0] = c1 * v1[0] + c2 * v2[0];
     ret[1] = c1 * v1[1] + c2 * v2[1];
@@ -135,25 +153,27 @@ static inline HOST_DEVICE void rvec_Random( rvec v )
 }
 
 
-static inline HOST_DEVICE real rvec_Norm_Sqr( rvec v )
+static inline HOST_DEVICE real rvec_Norm_Sqr( const rvec v )
 {
     return SQR(v[0]) + SQR(v[1]) + SQR(v[2]);
 }
 
 
-static inline HOST_DEVICE void rvec_Scale( rvec ret, real c, rvec v )
+static inline HOST_DEVICE void rvec_Scale( rvec ret, const real c, const rvec v )
 {
-    ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2];
+    ret[0] = c * v[0];
+    ret[1] = c * v[1];
+    ret[2] = c * v[2];
 }
 
 
-static inline HOST_DEVICE real rvec_Dot( rvec v1, rvec v2 )
+static inline HOST_DEVICE real rvec_Dot( const rvec v1, const rvec v2 )
 {
     return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
 }
 
 
-static inline HOST_DEVICE void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
+static inline HOST_DEVICE void rvec_iMultiply( rvec r, const ivec v1, const rvec v2 )
 {
     r[0] = v1[0] * v2[0];
     r[1] = v1[1] * v2[1];
@@ -161,22 +181,21 @@ static inline HOST_DEVICE void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
 }
 
 
-static inline HOST_DEVICE real rvec_Norm( rvec v )
+static inline HOST_DEVICE real rvec_Norm( const rvec v )
 {
     return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) );
 }
 
 
-/////////////////
-//ivec functions
-/////////////////
-static inline HOST_DEVICE void ivec_Copy( ivec dest , ivec src )
+static inline HOST_DEVICE void ivec_Copy( ivec dest , const ivec src )
 {
-    dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
+    dest[0] = src[0];
+    dest[1] = src[1];
+    dest[2] = src[2];
 }
 
 
-static inline HOST_DEVICE void ivec_Scale( ivec dest, real C, ivec src )
+static inline HOST_DEVICE void ivec_Scale( ivec dest, const real C, const ivec src )
 {
     dest[0] = C * src[0];
     dest[1] = C * src[1];
@@ -184,7 +203,7 @@ static inline HOST_DEVICE void ivec_Scale( ivec dest, real C, ivec src )
 }
 
 
-static inline HOST_DEVICE void ivec_Sum( ivec dest, ivec v1, ivec v2 )
+static inline HOST_DEVICE void ivec_Sum( ivec dest, const ivec v1, const ivec v2 )
 {
     dest[0] = v1[0] + v2[0];
     dest[1] = v1[1] + v2[1];
@@ -192,27 +211,43 @@ static inline HOST_DEVICE void ivec_Sum( ivec dest, ivec v1, ivec v2 )
 }
 
 
-/////////////////
-//vector functions
-/////////////////
-static inline HOST_DEVICE void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
+static inline HOST_DEVICE void Vector_Sum( real * const dest, const real c,
+        const real * const v, const real d, const real * const y,
+        const unsigned int k )
 {
-    for (k--; k >= 0; k--)
-        dest[k] = c * v[k] + d * y[k];
+    unsigned int i;
+
+    #pragma omp for schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] = c * v[i] + d * y[i];
+    }
 }
 
 
-static inline HOST_DEVICE void Vector_Scale( real* dest, real c, real* v, int k )
+static inline HOST_DEVICE void Vector_Scale( real * const dest, const real c,
+        const real * const v, const unsigned int k )
 {
-    for (k--; k >= 0; k--)
-        dest[k] = c * v[k];
+    unsigned int i;
+
+    #pragma omp for schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] = c * v[i];
+    }
 }
 
 
-static inline HOST_DEVICE void Vector_Add( real* dest, real c, real* v, int k )
+static inline HOST_DEVICE void Vector_Add( real * const dest, const real c,
+        const real * const v, const unsigned int k )
 {
-    for (k--; k >= 0; k--)
-        dest[k] += c * v[k];
+    unsigned int i;
+
+    #pragma omp for schedule(static)
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] += c * v[i];
+    }
 }
 
 #ifdef __cplusplus
diff --git a/PuReMD/src/basic_comm.c b/PuReMD/src/basic_comm.c
index 55f2867ed31550f5b9e6f6e23872fdc43fa972c1..96c8397653e440d95feb25c9b074dc5b84de24d1 100644
--- a/PuReMD/src/basic_comm.c
+++ b/PuReMD/src/basic_comm.c
@@ -299,7 +299,7 @@ void Coll_ids_at_Master( reax_system *system, storage *workspace,
                  workspace->id_all, workspace->rcounts, workspace->displs,
                  MPI_INT, MASTER_NODE, mpi_data->world );
 
-    free( id_list );
+    sfree( id_list, "id_list" );
 
 #if defined(DEBUG)
     if ( system->my_rank == MASTER_NODE )
diff --git a/PuReMD/src/control.c b/PuReMD/src/control.c
index 7ef81836e6e60791cb66acc45353e96c1def3b80..07b46d3a4d096f10d390ad203d13b14d151e36f9 100644
--- a/PuReMD/src/control.c
+++ b/PuReMD/src/control.c
@@ -439,9 +439,9 @@ char Read_Control_File( char *control_file, control_params* control,
 
     /* free memory allocations at the top */
     for ( i = 0; i < MAX_TOKENS; i++ )
-        free( tmp[i] );
-    free( tmp );
-    free( s );
+        sfree( tmp[i], "tmp[i]" );
+    sfree( tmp, "tmp" );
+    sfree( s, "s" );
 
     // fprintf( stderr,"%d %d %10.5f %d %10.5f %10.5f\n",
     //   control->ensemble, control->nsteps, control->dt,
diff --git a/PuReMD/src/ffield.c b/PuReMD/src/ffield.c
index 29138f5b4a15146f851fa7faee30ceaeb2a0c280..b05216bdcb19f8002bfe02293a3cf6e28dd4faa7 100644
--- a/PuReMD/src/ffield.c
+++ b/PuReMD/src/ffield.c
@@ -766,9 +766,9 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
 
     /* deallocate helper storage */
     for ( i = 0; i < MAX_TOKENS; i++ )
-        free( tmp[i] );
-    free( tmp );
-    free( s );
+        sfree( tmp[i], "tmp[i]" );
+    sfree( tmp, "tmp" );
+    sfree( s, "s" );
 
 
     /* deallocate tor_flag */
@@ -777,12 +777,12 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         for ( j = 0; j < reax->num_atom_types; j++ )
         {
             for ( k = 0; k < reax->num_atom_types; k++ )
-                free( tor_flag[i][j][k] );
+                sfree( tor_flag[i][j][k], "tor_flag[i][j][k]" );
 
-            free( tor_flag[i][j] );
+            sfree( tor_flag[i][j], "tor_flag[i][j]" );
         }
 
-        free( tor_flag[i] );
+        sfree( tor_flag[i], "tor_flag[i]" );
     }
 
 
diff --git a/PuReMD/src/geo_tools.c b/PuReMD/src/geo_tools.c
index c037840bb15f962717c17363a50c38603f99b31d..dd5115c07649d3c7571ae36fc79cd221eb9ad0f1 100644
--- a/PuReMD/src/geo_tools.c
+++ b/PuReMD/src/geo_tools.c
@@ -623,8 +623,8 @@ char Write_PDB(reax_system* system, reax_list* bonds, simulation_data *data,
     }
     */
 
-    free(buffer);
-    free(line);
+    sfree(buffer, "buffer");
+    sfree(line, "line");
 
     return SUCCESS;
 }
diff --git a/PuReMD/src/grid.c b/PuReMD/src/grid.c
index 0881e00f97112084194c6d252fe2759b9f5d4ad4..0064f2201695b85625dcd15db75861fe7fcb80b9 100644
--- a/PuReMD/src/grid.c
+++ b/PuReMD/src/grid.c
@@ -515,7 +515,7 @@ void Reorder_My_Atoms( reax_system *system, storage *workspace )
     }
 
     /* deallocate old storage */
-    free( system->my_atoms );
+    sfree( system->my_atoms, "system->my_atoms" );
     /* start using clustered storages */
     system->my_atoms = new_atoms;
     system->n = top;
diff --git a/PuReMD/src/init_md.c b/PuReMD/src/init_md.c
index 897552ec19485e9b0e0108a42c40f1fe6c00aff7..b35a46887e638424b5d570841cd12e63c00462f9 100644
--- a/PuReMD/src/init_md.c
+++ b/PuReMD/src/init_md.c
@@ -675,8 +675,8 @@ int  Init_Lists( reax_system *system, control_params *control,
              bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) );
 #endif
 
-    free( hb_top );
-    free( bond_top );
+    sfree( hb_top, "hb_top" );
+    sfree( bond_top, "bond_top" );
 
     return SUCCESS;
 }
@@ -779,8 +779,8 @@ int  Init_Lists( reax_system *system, control_params *control,
              bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) );
 #endif
 
-    free( hb_top );
-    free( bond_top );
+    sfree( hb_top, "hb_top" );
+    sfree( bond_top, "bond_top" );
 
     return SUCCESS;
 }
diff --git a/PuReMD/src/lookup.c b/PuReMD/src/lookup.c
index e821db0390ecb1fa3b8b8f04118f83d55a2d82a8..2a38eb23d04d2ac3d08c4984a34f8d18445a8f6b 100644
--- a/PuReMD/src/lookup.c
+++ b/PuReMD/src/lookup.c
@@ -304,12 +304,12 @@ int Init_Lookup_Tables( reax_system *system, control_params *control,
                                           comm );
                 }
 
-    free(h);
-    free(fh);
-    free(fvdw);
-    free(fCEvd);
-    free(fele);
-    free(fCEclmb);
+    sfree(h, "h");
+    sfree(fh, "fh");
+    sfree(fvdw, "fvdw");
+    sfree(fCEvd, "fCEvd");
+    sfree(fele, "fele");
+    sfree(fCEclmb, "fCEclmb");
 
     return 1;
 }
diff --git a/PuReMD/src/parallelreax.c b/PuReMD/src/parallelreax.c
index 703a434c6bc5e9299b129db60457d3638c637ab5..6d0955c70c4dfd5813a2a29ae3be191996270dea 100644
--- a/PuReMD/src/parallelreax.c
+++ b/PuReMD/src/parallelreax.c
@@ -233,13 +233,13 @@ int main( int argc, char* argv[] )
     MPI_Finalize();
 
     /* de-allocate data structures */
-    free( system );
-    free( control );
-    free( data );
-    free( workspace );
-    free( lists );
-    free( out_control );
-    free( mpi_data );
+    sfree( system, "system" );
+    sfree( control, "control" );
+    sfree( data, "data" );
+    sfree( workspace, "workspace" );
+    sfree( lists, "lists" );
+    sfree( out_control, "out_control" );
+    sfree( mpi_data, "mpi_data" );
 
 #if defined(TEST_ENERGY) || defined(TEST_FORCES)
 //  Integrate_Results(control);
diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c
index cba4f83df6594f225160c25dd7a5455d46977982..ed5c276b50244cecf7d69c977df67cbb074677d2 100644
--- a/PuReMD/src/qEq.c
+++ b/PuReMD/src/qEq.c
@@ -355,7 +355,7 @@ void Calculate_Charges( reax_system *system, storage *workspace,
     for ( i = system->n; i < system->N; ++i )
         system->my_atoms[i].q = q[i];
 
-    free(q);
+    sfree(q, "q");
 }
 
 
diff --git a/PuReMD/src/restart.c b/PuReMD/src/restart.c
index 4b9fa85343b2a84f69cfe76fd7e74544f3065254..b687d82abc0a2618b0cc7d496f39b8630d51d450 100644
--- a/PuReMD/src/restart.c
+++ b/PuReMD/src/restart.c
@@ -111,7 +111,7 @@ void Write_Binary_Restart( reax_system *system, control_params *control,
         fclose( fres );
     }
 
-    free(buffer);
+    sfree(buffer, "buffer");
 }
 
 
@@ -206,8 +206,8 @@ void Write_Restart( reax_system *system, control_params *control,
         fprintf( fres, "%s", buffer );
         fclose( fres );
     }
-    free(buffer);
-    free(line);
+    sfree(buffer, "buffer");
+    sfree(line, "line");
 }
 
 
@@ -467,9 +467,9 @@ void Read_Restart( char *res_file, reax_system *system,
     fclose( fres );
     /* free memory allocations at the top */
     for ( i = 0; i < MAX_TOKENS; i++ )
-        free( tmp[i] );
-    free( tmp );
-    free( s );
+        sfree( tmp[i], "tmp[i]" );
+    sfree( tmp, "tmp" );
+    sfree( s, "s" );
 
     data->step = data->prev_steps;
     // nsteps is updated based on the number of steps in the previous run
diff --git a/PuReMD/src/tool_box.c b/PuReMD/src/tool_box.c
index 02d9e3d9d6d083f3873b61d932f31b78964eb66e..a089287001279e198fccbb9bbfd0aa660863c2d3 100644
--- a/PuReMD/src/tool_box.c
+++ b/PuReMD/src/tool_box.c
@@ -48,7 +48,7 @@ int SumScan( int n, int me, int root, MPI_Comm comm )
 
         MPI_Scatter( nbuf, 1, MPI_INT, &my_order, 1, MPI_INT, root, comm );
 
-        free( nbuf );
+        sfree( nbuf, "nbuf" );
     }
     else
     {
diff --git a/PuReMD/src/traj.c b/PuReMD/src/traj.c
index ab321a2ed04afeecf23f3b4dcec2ee38123fad0f..e455d74b3accf3d925a7df81a2c387e0983d288f 100644
--- a/PuReMD/src/traj.c
+++ b/PuReMD/src/traj.c
@@ -75,7 +75,7 @@ int Reallocate_Output_Buffer( output_controls *out_control, int req_space,
                               MPI_Comm comm )
 {
     if ( out_control->buffer_len > 0 )
-        free( out_control->buffer );
+        sfree( out_control->buffer, "out_control->buffer" );
 
     out_control->buffer_len = (int)(req_space * SAFE_ZONE);
     out_control->buffer = (char*) malloc(out_control->buffer_len * sizeof(char));
@@ -1122,8 +1122,8 @@ int End_Traj( int my_rank, output_controls *out_control )
         fclose( out_control->strj );
 #endif
 
-    free( out_control->buffer );
-    free( out_control->line );
+    sfree( out_control->buffer, "out_control->buffer" );
+    sfree( out_control->line, "out_control->line" );
 
     return SUCCESS;
 }
diff --git a/README.md b/README.md
index 442146a67a8a5c20732075c6daf7c541aa72e5f6..abdadbab72c5d297de9a8559471a036ee229c001 100644
--- a/README.md
+++ b/README.md
@@ -7,5 +7,5 @@ Files from the [Purdue Reactive Molecular Dynamics](https://www.cs.purdue.edu/pu
 Roughly by target platform
 - [Serial](https://www.cs.purdue.edu/puremd/docs/80859.pdf)
 - [MPI (message passing interface)](https://www.cs.purdue.edu/puremd/docs/Parallel-Reactive-Molecular-Dynamics.pdf)
-- [CUDA (single GPU)](http://dx.doi.org/10.1016/j.jcp.2014.04.035) (single GPU)
+- [CUDA (single GPU)](http://dx.doi.org/10.1016/j.jcp.2014.04.035)
 - [CUDA+MPI (multi-GPU)](https://www.cs.purdue.edu/puremd/docs/pgpuremd.pdf)
diff --git a/configure.ac b/configure.ac
index 2488af52296535e6ec37ca3c3d687e7f905d78a7..8b38548f4ab4e67a170ad2a3974bde4f949eb191 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3,8 +3,7 @@
 
 AC_PREREQ([2.69])
 
-AC_INIT([Purdue Molecular Dynamics Suite], [1.0], [ohearnku@msu.edu hma@msu.edu])
-: ${CFLAGS=""}
+AC_INIT([PuReMD], [1.0], [ohearnku@msu.edu hma@msu.edu])
 AM_INIT_AUTOMAKE([1.15 -Wall -Werror])
 # Enable silent build rules by default.
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])], [AC_SUBST([AM_DEFAULT_VERBOSITY],[1])])
@@ -46,23 +45,25 @@ AC_ARG_ENABLE([mpi-gpu],
 			      [enable MPI+CUDA (multi GPU) support @<:@default: no@:>@])],
 	      [pack_mpi_gpu_enabled=${enableval}], [pack_mpi_gpu_enabled=no])
 
-if test "x${pack_serial_enabled}" = "xyes" || test "x${pack_openmp_enabled}" = "xyes"; then
+if test "x${pack_serial_enabled}" = "xyes" || test "x${pack_openmp_enabled}" = "xyes" || test "x${pack_gpu_enabled}" = "xyes"; then
 	AC_CONFIG_SUBDIRS([sPuReMD])
 	if test "x${pack_serial_enabled}" = "xyes" || test "x${pack_openmp_enabled}" != "xyes"; then
 		export BUILD_OPENMP="no"
 	else
-		export BUILD_OPENMP="yes"
+		if test "x${pack_gpu_enabled}" = "xyes"; then
+			export BUILD_GPU="yes"
+		else
+			export BUILD_OPENMP="yes"
+		fi
 	fi
 fi
 AM_CONDITIONAL([BUILD_S_OMP], [test "x${pack_serial_enabled}" = "xyes" || test "x${pack_openmp_enabled}" = "xyes"])
+AM_CONDITIONAL([BUILD_GPU], [test "x${pack_gpu_enabled}" = "xyes"])
+
 if test "x${pack_mpi_enabled}" = "xyes"; then
 	AC_CONFIG_SUBDIRS([PuReMD])
 fi
 AM_CONDITIONAL([BUILD_MPI], [test "x${pack_mpi_enabled}" = "xyes"])
-if test "x${pack_gpu_enabled}" = "xyes"; then
-	AC_CONFIG_SUBDIRS([PuReMD-GPU])
-fi
-AM_CONDITIONAL([BUILD_GPU], [test "x${pack_gpu_enabled}" = "xyes"])
 if test "x${pack_mpi_not_gpu_enabled}" = "xyes" || test "x${pack_mpi_gpu_enabled}" = "xyes"; then
 	AC_CONFIG_SUBDIRS([PG-PuReMD])
 	if test "x${pack_mpi_not_gpu_enabled}" = "xyes" || test "x${pack_mpi_gpu_enabled}" != "xyes"; then
@@ -117,7 +118,7 @@ AC_ARG_ENABLE([timing],
 		[enable timing measurements and reporting @<:@default: no@:>@])],
 	[timing_enabled=${enableval}], [timing_enabled=no]
 )
-if test "x$timing_enabled" = "xyes"
+if test "x${timing_enabled}" = "xyes"
 then
 	export BUILD_TIMING="yes"
 fi
@@ -127,7 +128,7 @@ AC_ARG_WITH([superlu-mt],
 			    [enable usage of SuperLU MT for QEq preconditioner computation @<:@default: no@:>@])],
             [package_superlu_mt=${withval}], [package_superlu_mt=no])
 
-if test "x$package_superlu_mt" != "xno"
+if test "x${package_superlu_mt}" != "xno"
 then
 	export BUILD_SUPERLU_MT="${package_superlu_mt}"
 fi
diff --git a/cuda.am b/cuda.am
index e5e8600323bd82ca4c11bfa60e1f613309aea634..eb61fb31d9cc6fa715f63ba5f8e57c505b723fae 100644
--- a/cuda.am
+++ b/cuda.am
@@ -7,12 +7,8 @@
 
 AM_V_NVCC = $(AM_V_NVCC_@AM_V@)
 AM_V_NVCC_ = $(AM_V_NVCC_@AM_DEFAULT_V@)
-AM_V_NVCC_0 = @echo "  NVCC" $@;
-
-# these are default values for the maximun register count parameter
-# passed to nvcc compiler (you might need to change it sometimes; all you need
-# is to set it as an environment variable).
-MAX_REG_COUNT ?=48
+AM_V_NVCC_0 = @echo "  NVCC    " $@;
+AM_V_NVCC_1 =
 
 .cu.o:
-	$(AM_V_NVCC)$(NVCC) $(NVCCFLAGS) -maxrregcount=$(MAX_REG_COUNT) -o $@ -c $<
+	$(AM_V_NVCC)$(NVCC) $(NVCCFLAGS) -o $@ -c $<
diff --git a/data/benchmarks/water/ffield_Achtyl b/data/benchmarks/water/ffield_Achtyl
new file mode 100644
index 0000000000000000000000000000000000000000..ce54cb3bad2a701ff11a41f08679d98026a022d9
--- /dev/null
+++ b/data/benchmarks/water/ffield_Achtyl
@@ -0,0 +1,424 @@
+Reactive MD-force field: water/C/Ca/Ti AKS2 July 18; water from Feb 29          
+ 39       ! Number of general parameters                                        
+   50.0000 !Overcoordination parameter                                          
+    9.5469 !Overcoordination parameter                                          
+    1.6725 !Valency angle conjugation parameter                                 
+    1.7224 !Triple bond stabilisation parameter                                 
+    6.8702 !Triple bond stabilisation parameter                                 
+   54.6742 !C2-correction                                                       
+    1.0588 !Undercoordination parameter                                         
+    4.6000 !Triple bond stabilisation parameter                                 
+   12.1176 !Undercoordination parameter                                         
+   13.3056 !Undercoordination parameter                                         
+  -34.5448 !Triple bond stabilization energy                                    
+    0.0000 !Lower Taper-radius                                                  
+   10.0000 !Upper Taper-radius                                                  
+    2.8793 !Not used                                                            
+   33.8667 !Valency undercoordination                                           
+    6.0891 !Valency angle/lone pair parameter                                   
+    1.0563 !Valency angle                                                       
+    2.0384 !Valency angle parameter                                             
+    6.1431 !Not used                                                            
+    6.9290 !Double bond/angle parameter                                         
+    0.3989 !Double bond/angle parameter: overcoord                              
+    3.9954 !Double bond/angle parameter: overcoord                              
+   -2.4837 !Not used                                                            
+    8.5385 !Torsion/BO parameter                                                
+    6.7491 !Torsion overcoordination                                            
+    0.1414 !Torsion overcoordination                                            
+   -1.2327 !Conjugation 0 (not used)                                            
+    1.1348 !Conjugation                                                         
+    1.5591 !vdWaals shielding                                                   
+    0.1000 !Cutoff for bond order (*100)                                        
+    1.7602 !Valency angle conjugation parameter                                 
+    0.6991 !Overcoordination parameter                                          
+   50.0000 !Overcoordination parameter                                          
+    1.8512 !Valency/lone pair parameter                                         
+  548.6451 !Softness                                                            
+    0.0000 !Cutoff                                                              
+    5.0000 !Molecular energy (not used)                                         
+    0.0000 !Molecular energy (not used)                                         
+    0.7903 !Valency angle conjugation parameter                                 
+ 13   ! Nr of atoms; cov.r; valency;a.m;Rvdw;Evdw;gammaEEM;cov.r2;#             
+            alfa;gammavdW;valency;Eunder;Eover;chiEEM;etaEEM;n.u.               
+            cov r3;Elp;Heat inc.;n.u.;n.u.;n.u.;n.u.                            
+            ov/un;val1;n.u.;val3,vval4                                          
+ C    1.3417   4.0000  12.0000   2.0464   0.1064   0.4670   1.1681   4.0000     
+      9.0000   1.5000   4.0000  40.0000  79.5548   5.3422   4.5000   0.0000     
+      1.1437   0.0000 181.0000   5.4236  19.2788  13.5366   3.1838   0.0000     
+     -4.0998   4.8750   1.0564   4.0000   2.9663   1.2500   0.4000  21.3612     
+ H    0.9102   1.0000   1.0080   1.0996   0.1000   0.6683  -0.1000   1.0000     
+      9.2193   5.5154   1.0000   0.0000 121.1250   4.9673   6.2079   0.0000     
+     -0.1000   0.0000  55.5000   1.1004   6.8959   0.0003   3.4114   0.0000     
+     -6.5532   3.5000   1.0338   1.0000   2.8793   0.5000   0.1000   1.0000     
+ O    1.2218   2.0000  15.9990   2.2033   0.2841   0.9750   1.0727   6.0000     
+      8.8250 200.0000   4.0000  37.5000 116.0768   8.5000   7.9071   0.0000     
+      0.9049   0.5488  68.0152   2.1943   2.3055   0.0021   5.4479   0.0000     
+     -6.0011   3.6068   1.0493   4.0000   2.9225   0.5000   0.1000   1.0000     
+ N    1.2333   3.0000  14.0000   2.2375   0.1447   1.0000   1.1748   5.0000     
+      9.8626  12.6599   4.0000  30.3181 100.0000   6.0111   6.7037   0.0000     
+      1.0433   0.7872 119.9837   0.7425   6.7920   2.7271   2.2882   0.0000     
+     -2.0000   4.0000   1.0183   4.0000   2.8793   1.0000   0.1000   1.0000     
+ S    1.9405   2.0000  32.0600   2.0677   0.2099   1.0336   1.5479   6.0000     
+      9.9575   4.9055   4.0000  52.9998 112.1416   6.5000   8.2545   0.0000     
+      1.4601   9.7177  71.1843   5.7487  23.2859  12.7147   2.2882   0.0000     
+    -11.0000   2.7466   1.0338   6.2998   2.8793   1.0000   0.1000  10.0000     
+ Mg   1.8315   2.0000  24.3050   2.2464   0.1806   0.5020   1.0000   2.0000     
+     10.9186  27.1205   3.0000  38.0000   0.0000   0.9499   5.6130   0.0000     
+     -1.3000   0.0000 127.9160  49.9248   0.3370   0.0000   2.2882   0.0000     
+     -1.0823   2.3663   1.0564   6.0000   2.9663   1.0000   0.1000  10.0000     
+ P    1.5994   3.0000  30.9738   1.7000   0.1743   1.0000   1.3000   5.0000     
+      9.1909  14.9482   5.0000   0.0000   0.0000   1.6676   7.0946   0.0000     
+     -1.0000  25.0000 125.6300   0.2187  21.4305  15.1425   2.2882   0.0000     
+     -3.9294   3.4831   1.0338   5.0000   2.8793   1.0000   0.1000  10.0000     
+ Na   1.8000   1.0000  22.9898   2.8270   0.1872   0.4000  -1.0000   1.0000     
+     10.0000   2.5000   1.0000   0.0000   0.0000  -0.9871   6.7728   0.0000     
+     -1.0000   0.0000  23.0445 100.0000   1.0000   0.0000   2.2882   0.0000     
+     -2.5000   3.9900   1.0338   8.0000   2.5791   1.0000   0.1000  10.0000     
+ Ti   2.0254   4.0000  47.8800   2.2105   0.1574   0.6311   0.1000   4.0000     
+     12.7041  16.6482   4.0000   0.1000   0.0000  -1.3647   6.8406   0.0000     
+     -1.0000   0.0000 143.1770  27.6505  -0.0753   0.0064   2.2882   0.0000     
+    -15.0000   3.8359   1.0338  12.0000   2.2632   1.0000   0.1000  10.0000     
+ Cl   1.7140   1.0000  35.4500   1.9139   0.2000   0.3500  -1.0000   7.0000     
+     11.5345  10.1330   1.0000   0.0000   0.0000   9.9704   6.1703   0.0000     
+     -1.0000   1.2769 143.1770   6.2293   5.2294   0.1542   2.2882   0.0000     
+    -10.2080   2.9867   1.0338   6.2998   2.5791   1.0000   0.1000  10.0000     
+ F    1.2100   1.0000  18.9984   1.8601   0.1200   0.3000  -0.1000   7.0000     
+     11.5000   7.5000   4.0000   9.2533   0.2000   9.0000  15.0000   0.0000     
+     -1.0000  35.0000   1.5000   6.9821   4.1799   1.0561   2.2882   0.0000     
+     -7.3000   2.6656   1.0493   4.0000   2.9225   1.0000   0.1000  10.0000     
+ Ca   1.9058   2.0000  40.0870   2.3698   0.3719   0.6038  -1.0000   2.0000     
+      9.8681   5.0000   3.0000  38.0000   0.0000  -5.7471   7.3556   0.0000     
+     -1.3000   0.0000 220.0000  49.9248   0.3370   0.0000   3.7000   0.0000     
+     -3.9745   3.0069   1.0564   8.0000   2.9663   0.5000   0.1000  10.0000     
+ X   -0.1000   2.0000   1.0080   2.0000   0.0000   1.0000  -0.1000   6.0000     
+     10.0000   2.5000   4.0000   0.0000   0.0000  -0.1000  25.0000   0.0000     
+     -0.1000   0.0000  -2.3700   8.7410  13.3640   0.6690   0.1000   0.0000     
+    -11.0000   2.7466   1.0338   4.0000   2.8793   1.0000   0.1000  10.0000     
+ 62     ! Nr of bonds; Edis1;LPpen;n.u.;pbe1;pbo5;13corr;pbo6                   
+                         pbe2;pbo3;pbo4;Etrip;pbo1;pbo2;ovcorr                  
+  1  1  78.4266 115.3834  68.1631   0.5777  -0.2901   1.0000  34.9989   0.5101  
+         3.8560  -0.1640   8.2326   1.0000  -0.0585   6.7997   1.0000   0.0000  
+  1  2 191.4398   0.0000   0.0000  -0.6539   0.0000   1.0000   6.0000   0.4898  
+         6.3962   1.0000   0.0000   1.0000  -0.0607   6.9960   0.0000   0.0000  
+  2  2 161.2898   0.0000   0.0000  -0.2387   0.0000   1.0000   6.0000   0.6279  
+        13.2089   1.0000   0.0000   1.0000  -0.1674   6.9118   0.0000   0.0000  
+  1  3 170.8787 152.2997 125.6008   0.2914  -0.2305   1.0000  16.7601   0.8645  
+         3.9615  -0.5703   7.4065   1.0000  -0.2514   4.5085   0.0000   0.0000  
+  3  3 222.6268  93.7249  50.8293   0.5588  -0.1000   1.0000  29.7503   0.0125  
+         0.5865  -0.2030   9.1777   1.0000  -0.1357   6.4747   1.0000   0.0000  
+  1  4 163.8300 145.4458  89.6879  -1.3368  -0.3468   1.0000  27.5160   0.1575  
+         0.1817  -0.3114   7.1789   1.0000  -0.2345   4.5111   1.0000   0.0000  
+  3  4 130.8596 169.4551  40.0000   0.3837  -0.1639   1.0000  35.0000   0.2000  
+         1.0000  -0.3579   7.0004   1.0000  -0.1193   6.8773   1.0000   0.0000  
+  4  4 157.9384  82.5526 152.5336   0.4010  -0.1034   1.0000  12.4261   0.5828  
+         0.1578  -0.1509  11.9186   1.0000  -0.0861   5.4271   1.0000   0.0000  
+  2  3 143.3409   0.0000   0.0000  -0.4886   0.0000   1.0000   6.0000   0.3927  
+         1.3588   1.0000   0.0000   0.0000  -0.0781   4.0330   0.0000   0.0000  
+  2  4 210.1187   0.0000   0.0000  -0.3705   0.0000   1.0000   6.0000   0.3284  
+         5.8196   1.0000   0.0000   1.0000  -0.1104   5.5184   0.0000   0.0000  
+  1  5 128.9942  74.5848  55.2528   0.1035  -0.5211   1.0000  18.9617   0.6000  
+         0.2949  -0.2398   8.1175   1.0000  -0.1029   5.6731   1.0000   0.0000  
+  2  5 151.5159   0.0000   0.0000  -0.4721   0.0000   1.0000   6.0000   0.6000  
+         9.4366   1.0000   0.0000   1.0000  -0.0290   7.0050   1.0000   0.0000  
+  3  5   0.0000   0.0000   0.0000   0.5563  -0.4038   1.0000  49.5611   0.6000  
+         0.4259  -0.4577  12.7569   1.0000  -0.1100   7.1145   1.0000   0.0000  
+  4  5   0.0000   0.0000   0.0000   0.4438  -0.2034   1.0000  40.3399   0.6000  
+         0.3296  -0.3153   9.1227   1.0000  -0.1805   5.6864   1.0000   0.0000  
+  5  5  96.1871  93.7006  68.6860   0.0955  -0.4781   1.0000  17.8574   0.6000  
+         0.2723  -0.2373   9.7875   1.0000  -0.0950   6.4757   1.0000   0.0000  
+  2  6  58.6896   0.0000   0.0000  -0.0203  -0.1418   1.0000  13.1260   0.0230  
+         8.2136  -0.1310   0.0000   1.0000  -0.2692   6.4254   0.0000  24.4461  
+  3  6  87.0227   0.0000  43.3991   0.0030  -0.3000   1.0000  36.0000   0.0250  
+         0.0087  -0.2500  12.0000   1.0000  -0.0439   6.6073   1.0000  24.4461  
+  6  6  32.3808   0.0000   0.0000  -0.0076  -0.2000   0.0000  16.0000   0.2641  
+         4.8726  -0.2000  10.0000   1.0000  -0.0729   4.6319   0.0000   0.0000  
+  1  7 110.0000  92.0000   0.0000   0.2171  -0.1418   1.0000  13.1260   0.6000  
+         0.3601  -0.1310  10.7257   1.0000  -0.0869   5.3302   1.0000   0.0000  
+  2  7   0.1466   0.0000   0.0000   0.2250  -0.1418   1.0000  13.1260   0.6000  
+         0.3912  -0.1310   0.0000   1.0000  -0.1029   9.3302   0.0000   0.0000  
+  3  7 201.0058 194.1410   0.0000   1.0000  -0.5000   1.0000  25.0000   0.4873  
+         0.4358  -0.1571  15.8745   1.0000  -0.2431   6.3823   1.0000   0.0000  
+  4  7 130.0000   0.0000   0.0000   0.2171  -0.1418   1.0000  13.1260   0.6000  
+         0.3601  -0.1310  10.7257   1.0000  -0.0869   5.3302   1.0000   0.0000  
+  6  7   0.1000   0.0000   0.0000   0.2500  -0.5000   1.0000  35.0000   0.6000  
+         0.5000  -0.5000  20.0000   1.0000  -0.2000  10.0000   1.0000   0.0000  
+  7  7   0.0000   0.0000   0.0000   0.2171  -0.5000   1.0000  35.0000   0.6000  
+         0.5000  -0.5000  20.0000   1.0000  -0.2000  10.0000   1.0000   0.0000  
+  1  8   0.0000   0.0000   0.0000  -1.0000  -0.3000   1.0000  36.0000   0.7000  
+        10.1151  -0.3500  25.0000   1.0000  -0.1053   8.2003   1.0000   0.0000  
+  2  8   0.0000   0.0000   0.0000  -1.0000  -0.3000   1.0000  36.0000   0.7000  
+        10.1151  -0.3500  25.0000   1.0000  -0.1053   8.2003   1.0000   0.0000  
+  3  8  45.8933   0.0000   0.0000  -0.1511  -0.3000   1.0000  36.0000   0.3105  
+         5.8448  -0.3500  25.0000   1.0000  -0.0659   7.9140   1.0000   0.0000  
+  4  8   0.0000   0.0000   0.0000  -1.0000  -0.3000   1.0000  36.0000   0.7000  
+        10.1151  -0.3500  25.0000   1.0000  -0.1053   8.2003   1.0000   0.0000  
+  5  8   0.0000   0.0000   0.0000  -1.0000  -0.3000   1.0000  36.0000   0.7000  
+        10.1151  -0.3500  25.0000   1.0000  -0.1053   8.2003   1.0000   0.0000  
+  6  8   0.0000   0.0000   0.0000   0.2500  -0.5000   1.0000  35.0000   0.6000  
+         0.5000  -0.5000  20.0000   1.0000  -0.2000  10.0000   1.0000   0.0000  
+  7  8   0.0000   0.0000   0.0000   0.2500  -0.5000   1.0000  35.0000   0.6000  
+         0.5000  -0.5000  20.0000   1.0000  -0.2000  10.0000   1.0000   0.0000  
+  8  8  64.4508   0.0000   0.0000  -0.3738   0.3000   0.0000  25.0000   0.2158  
+         0.9915  -0.4000  12.0000   1.0000  -0.0515   5.0000   0.0000   0.0000  
+  4  6  50.0000  10.0901   0.0000  -1.0000  -0.3000   1.0000  36.0000   0.7058  
+         0.8567  -0.3487  17.4990   1.0000  -0.0794   8.2232   1.0000   0.0000  
+  1  9   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.2500  20.0000   1.0000  -0.2578   6.5219   1.0000   0.0000  
+  2  9   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.2500  20.0000   1.0000  -0.2578   6.5219   1.0000   0.0000  
+  3  9 130.5629  37.6984   0.0000   0.9228  -0.3000   0.0000  36.0000   0.0850  
+         0.1150  -0.2818  16.1571   1.0000  -0.1343   6.8264   0.0000   0.0000  
+  4  9 130.5629  37.6984   0.0000   0.9228  -0.3000   0.0000  36.0000   0.0850  
+         0.1150  -0.2818  16.1571   1.0000  -0.1343   6.8264   0.0000   0.0000  
+  5  9   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.2500  20.0000   1.0000  -0.2578   6.5219   1.0000   0.0000  
+  6  9   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.2500  20.0000   1.0000  -0.2578   6.5219   1.0000   0.0000  
+  7  9   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.2500  20.0000   1.0000  -0.2578   6.5219   1.0000   0.0000  
+  8  9   0.1000   0.0000   0.0000   0.2500  -0.5000   1.0000  35.0000   0.6000  
+         0.5000  -0.5000  20.0000   1.0000  -0.2000  10.0000   1.0000   0.0000  
+  9  9  80.1930   0.0000   0.0000  -0.8469  -0.2000   0.0000  16.0000   0.2022  
+         0.7528  -0.1924  14.9725   1.0000  -0.0885   5.0000   0.0000   0.0000  
+  1 10   0.0000   0.0000   0.0000   0.5000  -0.2000   0.0000  16.0000   0.5000  
+         1.0001  -0.2000  15.0000   1.0000  -0.1000  10.0000   0.0000   0.0000  
+  2 10  98.9788   0.0000   0.0000  -0.0572  -0.2000   0.0000  16.0000   1.1523  
+         2.2822  -0.2000  15.0000   1.0000  -0.1093   5.1686   0.0000   0.0000  
+  3 10   0.0000   0.0000   0.0000   0.5000  -0.2000   0.0000  16.0000   0.5000  
+         1.0001  -0.2000  15.0000   1.0000  -0.1000  10.0000   0.0000   0.0000  
+  4 10   0.0000   0.0000   0.0000   0.5000  -0.2000   0.0000  16.0000   0.5000  
+         1.0001  -0.2000  15.0000   1.0000  -0.1000  10.0000   0.0000   0.0000  
+  5 10   0.0000   0.0000   0.0000   0.5000  -0.2000   0.0000  16.0000   0.5000  
+         1.0001  -0.2000  15.0000   1.0000  -0.1000  10.0000   0.0000   0.0000  
+  6 10   0.0000   0.0000   0.0000   0.5000  -0.2000   0.0000  16.0000   0.5000  
+         1.0001  -0.2000  15.0000   1.0000  -0.1000  10.0000   0.0000   0.0000  
+  7 10   0.0000   0.0000   0.0000   0.5000  -0.2000   0.0000  16.0000   0.5000  
+         1.0001  -0.2000  15.0000   1.0000  -0.1000  10.0000   0.0000   0.0000  
+  8 10   0.0000   0.0000   0.0000   0.5000  -0.2000   0.0000  16.0000   0.5000  
+         1.0001  -0.2000  15.0000   1.0000  -0.1000  10.0000   0.0000   0.0000  
+  9 10   0.0000   0.0000   0.0000   0.5000  -0.2000   0.0000  16.0000   0.5000  
+         1.0001  -0.2000  15.0000   1.0000  -0.1000  10.0000   0.0000   0.0000  
+ 10 10   0.2500   0.0000   0.0000   0.1803  -0.2000   0.0000  16.0000   0.3356  
+         0.9228  -0.2000  15.0000   1.0000  -0.1178   5.6715   0.0000   0.0000  
+  1 11 237.8781   0.0000   0.0000  -0.7438  -0.5000   1.0000  35.0000   1.0460  
+         3.6661  -0.2500  15.0000   1.0000  -0.0800   5.4719   1.0000   0.0000  
+  2 11   0.0000   0.0000   0.0000  -0.4643   0.0000   1.0000   6.0000   0.6151  
+        12.3710   1.0000   0.0000   1.0000  -0.1008   8.5980   0.0000   0.0000  
+  3 11   0.0000   0.0000   0.0000  -0.4643   0.0000   1.0000   6.0000   0.6151  
+        12.3710   1.0000   0.0000   1.0000  -0.1008   8.5980   0.0000   0.0000  
+  4 11   0.0000   0.0000   0.0000  -0.4643   0.0000   1.0000   6.0000   0.6151  
+        12.3710   1.0000   0.0000   1.0000  -0.1008   8.5980   0.0000   0.0000  
+  5 11   0.0000   0.0000   0.0000  -0.4643   0.0000   1.0000   6.0000   0.6151  
+        12.3710   1.0000   0.0000   1.0000  -0.1008   8.5980   0.0000   0.0000  
+ 11 11 250.0765   0.0000   0.0000   0.2298  -0.3500   1.0000  25.0000   0.8427  
+         0.1167  -0.2500  15.0000   1.0000  -0.1506   7.3516   1.0000   0.0000  
+  1 12   0.0000   0.0000   0.0000  -0.0203  -0.1418   1.0000  13.1260   0.0230  
+         8.2136  -0.2500  20.0000   1.0000  -0.2692   6.4254   0.0000  24.4461  
+  2 12   0.0000   0.0000   0.0000  -0.0203  -0.1418   1.0000  13.1260   0.0230  
+         8.2136  -0.2500  20.0000   1.0000  -0.2692   6.4254   0.0000  24.4461  
+  3 12  49.4055   0.0000   0.0000   0.9603  -0.3000   0.0000  36.0000   0.0025  
+         0.4232  -0.2500  12.0000   1.0000  -0.1619   9.6512   0.0000  24.4461  
+ 12 12  22.8272   0.0000   0.0000   0.6166  -0.2000   0.0000  16.0000   0.8225  
+         1.0000  -0.2000  10.0000   1.0000  -0.0831   4.2291   0.0000   0.0000  
+ 26    ! Nr of off-diagonal terms; Ediss;Ro;gamma;rsigma;rpi;rpi2               
+  1  2   0.1240   1.6326   9.8721   1.1578  -1.0000  -1.0000                    
+  2  3   0.0295   1.3181  10.1225   0.9069  -1.0000  -1.0000                    
+  2  4   0.1294   1.3025   9.8751   1.0415  -1.0000  -1.0000                    
+  1  3   0.2287   2.0452   9.0166   1.4046   1.1866   1.0375                    
+  1  4   0.2000   1.8828   9.7673   1.3387   1.2578   1.1539                    
+  3  4   0.1001   2.3274   9.0974   1.5236   1.0493   1.2531                    
+  1  5   0.1408   1.8161   9.9393   1.7986   1.3021   1.4031                    
+  2  5   0.0895   1.6239  10.0104   1.4640  -1.0000  -1.0000                    
+  3  5   0.1022   1.9887  10.0605   1.5799   1.4000  -1.0000                    
+  4  5   0.1505   1.9000  10.5104   1.8000   1.4000  -1.0000                    
+  2  6   0.0100   1.6000  13.2979   1.8670  -1.0000  -1.0000                    
+  3  6   0.0809   1.7000  11.4606   1.5177  -1.0000  -1.0000                    
+  3  7   0.0534   1.7520  10.4281   1.8000   1.4498  -1.0000                    
+  6  7   0.1801   1.8566   9.8498   0.1000  -1.0000  -1.0000                    
+  3  8   0.0825   1.5904  11.3396   1.5905  -1.0000  -1.0000                    
+  2  9   0.1750   1.7939  13.5000   0.0100  -1.0000  -1.0000                    
+  3  9   0.1200   1.8000  10.5000   1.6526   1.4718  -1.0000                    
+  1  9   0.2950   2.2000  11.0937   0.0100  -1.0000  -1.0000                    
+  2 10   0.0376   1.6671   9.6285   1.2123  -1.0000  -1.0000                    
+  3 10   0.1945   2.2766  11.2353  -1.0000  -1.0000  -1.0000                    
+  1 11   0.1071   1.6243  11.0402   1.3176  -1.0000  -1.0000                    
+  2 11   0.0431   1.7204  10.3632   0.5386  -1.0000  -1.0000                    
+  4  9   0.1200   1.8000  10.5000   1.6526   1.4718  -1.0000                    
+  1 12   0.2000   1.5000  14.0000   0.0010   0.0010  -1.0000                    
+  2 12   0.0100   1.0610   9.7343   0.0010   0.0010  -1.0000                    
+  3 12   0.1515   1.8913  12.5160   2.0022  -1.0000  -1.0000                    
+ 98    ! Nr of angles;at1;at2;at3;Thetao,o;ka;kb;pv1;pv2                        
+  1  1  1  74.4118  38.0306   0.9605   0.0000   0.0100  36.6918   2.3203        
+  1  1  2  67.2765  20.1739   3.3306   0.0000   0.0100   0.0000   1.1630        
+  2  1  2  74.7224  38.7524   2.1423   0.0000   0.8474   0.0000   1.3144        
+  1  2  2   0.0000   0.0000   6.0000   0.0000   0.0000   0.0000   1.0400        
+  1  2  1   0.0000   3.4110   7.7350   0.0000   0.0000   0.0000   1.0400        
+  2  2  2   0.0000  27.9213   5.8635   0.0000   0.0000   0.0000   1.0400        
+  1  1  3  61.1001  38.5283   0.4917   0.0000   4.2373   0.0000   2.1649        
+  3  1  3  72.0708  27.2877   2.3068 -10.4517   0.1000   0.0000   1.6107        
+  1  1  4  64.7353  38.2645   1.1478   0.0000   1.1834   0.0000   2.8465        
+  3  1  4  81.0672  41.9015   0.4878   0.0000   1.1019   0.0000   1.0000        
+  4  1  4  89.7621  43.0000   0.5895   0.0000   1.1155   0.0000   1.0000        
+  2  1  3  61.3200  29.1602   0.9036   0.0000   1.8063   0.0000   1.4867        
+  2  1  4  68.7361  36.7162   1.6697   0.0000   0.2000   0.0000   3.0000        
+  1  2  4   0.0000   0.0019   6.3000   0.0000   0.0000   0.0000   1.0400        
+  1  3  1  74.2533  41.5372   0.4237   0.0000   2.3660   0.0000   1.0319        
+  1  3  3  82.8809  21.0869   3.0902   0.0000   4.5310  20.1072   1.0105        
+  1  3  4  70.3730  45.0000   1.4731   0.0000   2.9000   0.0000   2.4464        
+  3  3  3  77.3022  33.4558   1.4033   0.0000   3.9048   0.0000   1.0000        
+  3  3  4  77.0669  27.6795   1.6466   0.0000   2.9000   0.0000   1.5085        
+  4  3  4  68.8583  40.4712   1.8369   0.0000   3.0072   0.0000   1.5773        
+  1  3  2  90.0000  34.3702   0.4674   0.0000   0.6101   0.0000   1.0013        
+  2  3  3  90.0000  26.3185   8.0000   0.0000   0.8615   0.0000   1.0579        
+  2  3  4  68.3253  36.1953   7.5000   0.0000   0.1000   0.0000   1.0000        
+  2  3  2  80.7909  19.0967   1.0887   0.0000   2.0594   0.0000   1.7720        
+  1  4  1  71.2077  14.1180   3.3944   0.0000   2.8702   0.0000   1.2651        
+  1  4  3  76.1064  23.7583   1.6308   0.0000   2.8701   0.0000   1.6732        
+  1  4  4  71.3624  12.8120   3.1458   0.0000   2.8701   0.0000   1.1896        
+  3  4  3  74.2922  23.0742   2.6248 -18.0069   3.0701   0.0000   1.6278        
+  3  4  4  74.0840  31.1381   1.5175  -0.9193   3.0117   0.0000   1.3541        
+  4  4  4  76.0945  32.1176   1.7767   0.0000   2.9983   0.0000   1.9677        
+  1  4  2  69.1892  14.8553   2.7174   0.0000   0.2025   0.0000   1.3071        
+  2  4  3  74.5555  45.0000   1.1948   0.0000   0.3956   0.0000   3.0000        
+  2  4  4  78.8758  45.0000   0.5964   0.0000   0.5437   0.0000   1.0000        
+  2  4  2  81.5738   7.0792   7.5000   0.0000   0.1000   0.0000   1.0000        
+  1  2  3   0.0000  21.4989   1.0000   0.0000   0.1000   0.0000   1.1358        
+  1  2  4   0.0000   0.0100   2.4974   0.0000   0.0000   0.0000   1.3777        
+  1  2  5   0.0000  15.0000   3.0000   0.0000   0.0000   0.0000   1.0400        
+  3  2  3   0.0000  15.0000   0.5640   0.0000   0.0000   0.0000   1.0400        
+  3  2  4   0.0000   1.0235   0.1000   0.0000   0.0000   0.0000   3.0000        
+  4  2  4   0.0000   0.0100   1.3170   0.0000   0.0000   0.0000   2.1165        
+  2  2  3   0.0000   2.0000   0.0839   0.0000   0.0000   0.0000   2.9374        
+  2  2  4   0.0000   0.0019   6.0000   0.0000   0.0000   0.0000   1.0400        
+  1  1  5  74.4180  33.4273   1.7018   0.1463   0.5000   0.0000   1.6178        
+  1  5  1  79.7037  28.2036   1.7073   0.1463   0.5000   0.0000   1.6453        
+  2  1  5  63.3289  29.4225   2.1326   0.0000   0.5000   0.0000   3.0000        
+  1  5  2  85.9449  38.3109   1.2492   0.0000   0.5000   0.0000   1.1000        
+  1  5  5  85.6645  40.0000   2.9274   0.1463   0.5000   0.0000   1.3830        
+  2  5  2  83.8555   5.1317   0.4377   0.0000   0.5000   0.0000   3.0000        
+  2  5  5  97.0064  32.1121   2.0242   0.0000   0.5000   0.0000   2.8568        
+  3  5  3  81.0926  30.2268   6.4132  -5.4471   2.5968   0.0000   3.0000        
+  1  5  3  70.0000  35.0000   3.4223   0.0000   1.3550   0.0000   1.2002        
+  1  3  5  57.3353  41.0012   1.0609   0.0000   1.3000   0.0000   3.0000        
+  3  3  5  83.9753  31.0715   3.5590   0.0000   0.8161   0.0000   1.1776        
+  2  3  5  89.8843  17.5000   3.3660   0.0000   2.0000   0.0000   2.0734        
+  2  6  2   0.0000  49.8261   0.2093   0.0000   2.0870   0.0000   2.2895        
+  2  2  6   0.0000  40.0366   3.1505   0.0000   1.1296   0.0000   1.1110        
+  6  2  6   0.0000   0.5047   0.8000   0.0000   0.8933   0.0000   4.6650        
+  2  6  6   0.0000   8.7037   0.0827   0.0000   3.5597   0.0000   1.1198        
+  3  6  3   0.0000   9.2317   0.1000   0.0000   1.0000   0.0000   1.0920        
+  6  3  6   0.0008  25.0000   8.0000   0.0000   1.0000   0.0000   3.0000        
+  2  3  6  66.0423   5.0000   1.0000   0.0000   1.0000   0.0000   1.2500        
+  2  6  3   0.0000   0.5000   0.1000   0.0000   1.0000   0.0000   3.0000        
+  3  3  6  70.0000  20.0000   1.0000   0.0000   1.0000   0.0000   1.2500        
+  3  7  3  90.0000  18.4167   0.6799  -8.0000   0.1310   0.0000   2.2321        
+  2  3  7  72.6004   9.6150   0.8905   0.0000   3.5473   0.0000   1.0400        
+  3  3  7  60.0000  40.0000   4.0000   0.0000   1.0000   0.0000   1.0400        
+  3  2  7   0.0000  10.0000   1.0000   0.0000   1.0000   0.0000   1.0400        
+  6  3  7  41.0995   3.2207   7.3523   0.0000   0.1101   0.0000   1.0947        
+  7  3  7  62.1312   7.5931   0.1000   0.0000   0.5154   0.0000   2.1744        
+  1  3  7  74.1394   8.5687   1.7132   0.0000  -0.6553   0.0000   2.2323        
+  2  7  3  75.0000  25.0000   2.0000   0.0000   1.0000   0.0000   1.2500        
+  3  7  7  70.0000  25.0000   2.0000   0.0000   1.0000   0.0000   1.2500        
+  3  9  3  90.0000  30.4624   2.1468   0.0000   0.0500   0.0000   1.9485        
+  9  3  9  90.0000   5.7486   5.0000   0.0000   2.0000   0.0000   1.1000        
+  3  3  9  62.9344  15.0215   4.3743   0.0000   0.6168   0.0000   1.1673        
+  3  9  9  33.7127   8.0623   3.4580   0.0000   0.0500   0.0000   2.6065        
+  2  3  9  90.0000   9.7766   8.0000   0.0000   0.0505   0.0000   1.7257        
+  1  3  9  90.0000  11.2108   1.4880   0.0000   0.5386   0.0000   2.1105        
+  3  2 10   0.0000   0.0100   0.0100   0.0000   0.0000   0.0000   1.1456        
+ 11  1 11  77.8443  49.0744   5.9913   0.0000   0.7835   0.0000   2.3020        
+  1 11  1   0.0000  19.9962   3.2299   0.0000   2.1012   0.0000   1.1537        
+  1 11 11   0.0000  25.0000   1.0000   0.0000   1.0000   0.0000   1.0400        
+ 11  1  2  69.6421  10.0000   2.0000   0.0000   1.0000   0.0000   1.0400        
+  4  9  4  90.0000  30.4624   2.1468   0.0000   0.0500   0.0000   1.9485        
+  3  9  4  90.0000  30.4624   2.1468   0.0000   0.0500   0.0000   1.9485        
+  9  4  9  90.0000   5.7486   5.0000   0.0000   2.0000   0.0000   1.1000        
+  3  4  9  62.9344  15.0215   4.3743   0.0000   0.6168   0.0000   1.1673        
+  4  3  9  62.9344  15.0215   4.3743   0.0000   0.6168   0.0000   1.1673        
+  4  4  9  62.9344  15.0215   4.3743   0.0000   0.6168   0.0000   1.1673        
+  4  4  9  62.9344  15.0215   4.3743   0.0000   0.6168   0.0000   1.1673        
+  4  9  9  33.7127   8.0623   3.4580   0.0000   0.0500   0.0000   2.6065        
+  2  4  9  90.0000   9.7766   8.0000   0.0000   0.0505   0.0000   1.7257        
+  1  4  9  90.0000  11.2108   1.4880   0.0000   0.5386   0.0000   2.1105        
+  3 12  3  90.0000   5.2360   6.0000   0.0000   1.9491   0.0000   1.0000        
+ 12  3 12  27.0723   4.9264   1.7778   0.0000   0.3851   0.0000   1.4855        
+  2  3 12   1.0000   1.5989   6.0000   0.0000   0.6668   0.0000   1.0000        
+  3  3 12  90.0000  10.0000   1.0000   0.0000   1.0000   0.0000   2.0000        
+  1  3 12  70.0000   0.0000   1.0000   0.0000   1.0000   0.0000   2.0000        
+ 73    ! Nr of torsions;at1;at2;at3;at4;;V1;V2;V3;V2(BO);vconj;n.u;n            
+  1  1  1  1   0.0734  56.5103   0.0000  -5.5375  -2.6113   0.0000   0.0000     
+  1  1  1  2   0.2335  44.4534   0.5000  -5.5234  -3.0000   0.0000   0.0000     
+  2  1  1  2  -0.0100  59.3937   0.5000  -8.0000  -3.0000   0.0000   0.0000     
+  1  1  1  3  -0.1605   9.1977  -1.0000  -2.5000  -2.1235   0.0000   0.0000     
+  2  1  1  3   0.4631  41.1330   0.7425  -7.5125  -1.1040   0.0000   0.0000     
+  3  1  1  3  -1.0000  30.3068  -1.0000  -8.5000  -0.0100   0.0000   0.0000     
+  1  1  3  1   1.0000  45.3654  -0.7715  -3.2711  -0.0100   0.0000   0.0000     
+  1  1  3  2   1.0000 150.0000   1.0000  -6.6407  -0.0100   0.0000   0.0000     
+  2  1  3  1   1.0000 143.9644   0.6834  -6.1023  -1.1889   0.0000   0.0000     
+  2  1  3  2  -1.0000  74.4134   1.0000  -3.3595  -3.5000   0.0000   0.0000     
+  1  1  3  3   1.0000 133.2711   0.5065  -2.5561  -0.0100   0.0000   0.0000     
+  2  1  3  3   0.2238  53.9090   0.8685  -2.7064  -3.5000   0.0000   0.0000     
+  3  1  3  1  -1.0000  66.2620   0.7534  -3.3533  -2.2258   0.0000   0.0000     
+  3  1  3  2   0.6489  19.7090  -1.0000  -3.3466  -0.1540   0.0000   0.0000     
+  3  1  3  3  -1.0000  86.4981   1.0000  -3.5833  -0.0610   0.0000   0.0000     
+  1  3  3  1   1.0000   0.1000   1.0000  -6.9024  -0.0100   0.0000   0.0000     
+  1  3  3  2   0.9093  23.6198  -0.1431  -2.4750  -3.5000   0.0000   0.0000     
+  2  3  3  2  -1.6122   5.0000  -1.0000  -2.5394  -0.9921   0.0000   0.0000     
+  1  3  3  3   2.5000   2.1016   0.9647  -2.6000  -0.9972   0.0000   0.0000     
+  2  3  3  3  -2.5000  75.7606  -0.6120  -7.8633  -1.2407   0.0000   0.0000     
+  3  3  3  3  -0.5000   5.0000   1.0000  -2.5000  -0.9000   0.0000   0.0000     
+  1  1  4  2   0.2700  43.1323  -0.4952  -7.6538  -1.9825   0.0000   0.0000     
+  2  1  4  2  -0.5047  82.9784   0.8701  -7.6680  -2.1051   0.0000   0.0000     
+  3  1  4  2   0.8306  15.5307   1.0000  -2.5000  -2.5261   0.0000   0.0000     
+  3  1  1  4  -0.8051  19.8307   1.0000  -3.7979  -0.9511   0.0000   0.0000     
+  4  1  1  4   1.0000  36.1913   1.0000  -3.4095  -1.7241   0.0000   0.0000     
+  1  1  4  1   1.0000  32.6616   0.3481  -6.4524  -1.6589   0.0000   0.0000     
+  3  1  4  1  -1.0000  -5.0000   1.0000  -2.5000  -1.8038   0.0000   0.0000     
+  2  1  1  4   0.7529  50.8010  -0.5000  -4.3471  -1.9000   0.0000   0.0000     
+  4  1  4  2   0.3787  13.7301   0.6579  -8.2500  -2.0202   0.0000   0.0000     
+  2  1  4  1  -1.0000  76.7186   0.1194  -8.0000  -1.5996   0.0000   0.0000     
+  0  1  2  0   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000     
+  0  2  2  0   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000     
+  0  2  3  0   0.0000   0.1000   0.0200  -2.5415   0.0000   0.0000   0.0000     
+  0  1  1  0   0.0000  60.0000   0.3000  -4.0000  -2.0000   0.0000   0.0000     
+  0  3  3  0   0.5511  25.4150   1.1330  -5.1903  -1.0000   0.0000   0.0000     
+  0  1  4  0   0.2176  40.4126   0.3535  -3.9875  -2.0051   0.0000   0.0000     
+  0  2  4  0   0.0000   0.1032   0.3000  -5.0965   0.0000   0.0000   0.0000     
+  0  3  4  0   1.1397  61.3225   0.5139  -3.8507  -2.7831   0.0000   0.0000     
+  0  4  4  0   0.7265  44.3155   1.0000  -4.4046  -2.0000   0.0000   0.0000     
+  4  1  4  4  -0.0949   8.7582   0.3310  -7.9430  -2.0000   0.0000   0.0000     
+  0  1  5  0   4.0885  78.7058   0.1174  -2.1639   0.0000   0.0000   0.0000     
+  0  5  5  0  -0.0170 -56.0786   0.6132  -2.2092   0.0000   0.0000   0.0000     
+  0  2  5  0   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000     
+  2  3  5  3   2.5000   2.5000   0.2237 -10.0000  -1.0000   0.0000   0.0000     
+  0  3  5  0  -2.5000  50.0000  -0.5000 -10.0000  -1.0000   0.0000   0.0000     
+  0  6  6  0   0.0000   0.0000   0.1200  -2.4426   0.0000   0.0000   0.0000     
+  0  2  6  0   0.0000   0.0000   0.1200  -2.4847   0.0000   0.0000   0.0000     
+  0  3  6  0   0.0000   0.0000   0.1200  -2.4703   0.0000   0.0000   0.0000     
+  1  1  1  7  -0.3232  14.3871   0.1823  -9.8682  -1.7255   0.0000   0.0000     
+  7  1  1  7  -0.1452  50.0000  -0.1915  -8.0773  -1.7255   0.0000   0.0000     
+  0  1  7  0   4.0000  45.8264   0.9000  -4.0000   0.0000   0.0000   0.0000     
+  0  7  7  0   4.0000  45.8264   0.9000  -4.0000   0.0000   0.0000   0.0000     
+  2  1  3  7  -1.5000  18.9285   0.3649  -6.1208   0.0000   0.0000   0.0000     
+  2  3  7  3   1.5000  -1.0000   0.2575  -6.2100   0.0000   0.0000   0.0000     
+  1  3  7  3  -1.4375  -0.8700   0.9861  -2.5424   0.0000   0.0000   0.0000     
+  7  3  7  3  -1.5000  21.5086  -1.0000  -4.8869   0.0000   0.0000   0.0000     
+  2  1  3  9   0.1714  69.9743   0.9170  -7.9557   0.0000   0.0000   0.0000     
+  1  1  3  9   0.2500  76.5218   1.0000  -2.5503   0.0000   0.0000   0.0000     
+  3  1  3  9  -0.2500  50.5929  -0.2500  -6.9285   0.0000   0.0000   0.0000     
+  2  3  9  3  -0.2500   0.0100  -0.5000  -4.6984   0.0000   0.0000   0.0000     
+  1  1  1 11   0.5000   0.1000   0.4683 -11.5274  -1.7255   0.0000   0.0000     
+  2  1  1 11   0.0000  49.3871   0.2000 -10.5765  -1.7255   0.0000   0.0000     
+ 11  1  1 11  -0.5000  95.4727  -0.2080  -4.8579  -1.7255   0.0000   0.0000     
+  0  1 11  0   4.0000  45.8264   0.9000  -4.0000   0.0000   0.0000   0.0000     
+  0 11 11  0   4.0000  45.8264   0.8897  -4.0000   0.0000   0.0000   0.0000     
+  2  1  4  9   0.1714  69.9743   0.9170  -7.9557   0.0000   0.0000   0.0000     
+  1  1  4  9   0.2500  76.5218   1.0000  -2.5503   0.0000   0.0000   0.0000     
+  3  1  4  9  -0.2500  50.5929  -0.2500  -6.9285   0.0000   0.0000   0.0000     
+  4  1  4  9  -0.2500  50.5929  -0.2500  -6.9285   0.0000   0.0000   0.0000     
+  4  1  3  9  -0.2500  50.5929  -0.2500  -6.9285   0.0000   0.0000   0.0000     
+  2  4  9  3  -0.2500   0.0100  -0.5000  -4.6984   0.0000   0.0000   0.0000     
+  2  4  9  4  -0.2500   0.0100  -0.5000  -4.6984   0.0000   0.0000   0.0000     
+  0    ! Nr of hydrogen bonds;at1;at2;at3;Rhb;Dehb;vhb1                         
diff --git a/data/benchmarks/water/ffield_acks2.water b/data/benchmarks/water/ffield_acks2.water
new file mode 100644
index 0000000000000000000000000000000000000000..ddf2d3cbf5f8c76d3203ee4a326ee8db23fb246e
--- /dev/null
+++ b/data/benchmarks/water/ffield_acks2.water
@@ -0,0 +1,364 @@
+Reactive MD-force field: Water                                                  
+ 39       ! Number of general parameters                                        
+   50.0000 !Overcoordination parameter                                          
+    9.5469 !Overcoordination parameter                                          
+   26.5405 !Valency angle conjugation parameter                                 
+    1.7224 !Triple bond stabilisation parameter                                 
+    6.8702 !Triple bond stabilisation parameter                                 
+   60.4850 !C2-correction                                                       
+    1.0588 !Undercoordination parameter                                         
+    4.6000 !Triple bond stabilisation parameter                                 
+   12.1176 !Undercoordination parameter                                         
+   13.3056 !Undercoordination parameter                                         
+  -70.5044 !Triple bond stabilization energy                                    
+    0.0000 !Lower Taper-radius                                                  
+   10.0000 !Upper Taper-radius                                                  
+    2.8793 !Not used                                                            
+   33.8667 !Valency undercoordination                                           
+    6.0891 !Valency angle/lone pair parameter                                   
+    1.0563 !Valency angle                                                       
+    2.0384 !Valency angle parameter                                             
+    6.1431 !Not used                                                            
+    6.9290 !Double bond/angle parameter                                         
+    0.3989 !Double bond/angle parameter: overcoord                              
+    3.9954 !Double bond/angle parameter: overcoord                              
+   -2.4837 !Not used                                                            
+    5.7796 !Torsion/BO parameter                                                
+   10.0000 !Torsion overcoordination                                            
+    1.9487 !Torsion overcoordination                                            
+   -1.2327 !Conjugation 0 (not used)                                            
+    2.1645 !Conjugation                                                         
+    1.5591 !vdWaals shielding                                                   
+    0.1000 !Cutoff for bond order (*100)                                        
+    2.1365 !Valency angle conjugation parameter                                 
+    0.6991 !Overcoordination parameter                                          
+   50.0000 !Overcoordination parameter                                          
+    1.8512 !Valency/lone pair parameter                                         
+  548.6451 !Softness                                                            
+   20.0000 !Not used                                                            
+    5.0000 !Molecular energy (not used)                                         
+    0.0000 !Molecular energy (not used)                                         
+    2.6962 !Valency angle conjugation parameter                                 
+ 15    ! Nr of atoms; cov.r; valency;a.m;Rvdw;Evdw;gammaEEM;cov.r2;#            
+            alfa;gammavdW;valency;Eunder;Eover;chiEEM;etaEEM;n.u.               
+            cov r3;Elp;Heat inc.;n.u.;n.u.;n.u.;n.u.                            
+            ov/un;val1;n.u.;val3,vval4                                          
+ C    1.3817   4.0000  12.0000   1.8903   0.1838   0.9000   1.1341   4.0000     
+      9.7559   2.1346   4.0000  34.9350  79.5548   5.9666   7.0000   0.0000     
+      1.2114   0.0000 202.5551   8.9539  34.9289  13.5366   0.8563   0.0000     
+     -2.8983   2.5000   1.0564   4.0000   2.9663   0.0000   0.0000   0.0000     
+ H    0.8930   1.0000   1.0080   1.3550   0.0930   0.8203  -0.1000   1.0000     
+      8.2230  33.2894   1.0000   0.0000 121.1250   3.7248   9.6093   1.0000     
+     -0.1000   0.0000  61.6606   3.0408   2.4197   0.0003   3.4114   0.0000     
+    -19.4571   4.2733   1.0338   1.0000   2.8793   0.0000   0.0000   0.0000     
+ O    1.2450   2.0000  15.9990   2.3890   0.1000   1.0898   1.0548   6.0000     
+      9.7300  13.8449   4.0000  37.5000 116.0768   8.5000   8.3122   2.0000     
+      0.9049   0.4056  59.0626   3.5027   0.7640   0.0021   0.9745   0.0000     
+     -3.5500   2.9000   1.0493   4.0000   2.9225   0.0000   0.0000   0.0000     
+ N    1.2333   3.0000  14.0000   1.9324   0.1376   0.8596   1.1748   5.0000     
+     10.0667   7.8431   4.0000  32.2482 100.0000   6.8418   6.3404   2.0000     
+      1.0433  13.7673 119.9837   2.1961   3.0696   2.7683   0.9745   0.0000     
+     -4.3875   2.6192   1.0183   4.0000   2.8793   0.0000   0.0000   0.0000     
+ S    1.9405   2.0000  32.0600   2.0677   0.2099   1.0336   1.5479   6.0000     
+      9.9575   4.9055   4.0000  52.9998 112.1416   6.5000   8.2545   2.0000     
+      1.4601   9.7177  71.1843   5.7487  23.2859  12.7147   0.9745   0.0000     
+    -11.0000   2.7466   1.0338   6.2998   2.8793   0.0000   0.0000   0.0000     
+ Si   2.0276   4.0000  28.0600   2.2042   0.1322   0.8218   1.5758   4.0000     
+     11.9413   2.0618   4.0000  11.8211 136.4845   1.8038   7.3852   0.0000     
+     -1.0000   0.0000 126.5182   6.4918   8.5961   0.2368   0.8563   0.0000     
+     -3.8112   3.1873   1.0338   6.2998   2.5791   0.0000   0.0000   0.0000     
+ Pt   1.9907   3.0000 195.0800   1.9980   0.2452   0.8218  -1.0000   3.0000     
+     12.8669   3.2118   3.0000   0.0000   0.0000   1.8038   7.3852   0.0000     
+     -1.0000   0.0000 142.6300   6.2293   5.2294   0.1542   0.8563   0.0000     
+     -6.7740   2.9867   1.0338   6.2998   2.5791   0.0000   0.0000   0.0000     
+ Zr   2.1000   4.0000  91.2240   2.1970   0.2542   0.8218  -1.0000   4.0000     
+     12.8545   3.5938   4.0000   0.0000   0.0000   1.8038   7.3852   0.0000     
+     -1.0000   0.0000 107.6300   6.2293   5.2294   0.1542   0.8563   0.0000     
+     -3.2224   2.9867   1.0338   6.2998   2.5791   0.0000   0.0000   0.0000     
+ Ni   1.8503   2.0000  58.6900   1.9219   0.1582   0.8218  -1.0000   2.0000     
+     12.1238   4.0351   2.0000   0.0000   0.0000   1.8038   7.3852   0.0000     
+     -1.0000   0.0000  95.6300   6.2293   5.2294   0.1542   0.8563   0.0000     
+     -3.2224   2.9867   1.0338   6.2998   2.5791   0.0000   0.0000   0.0000     
+ Au   1.8503   1.0000 196.9665   1.9219   0.1582   0.8218  -1.0000   1.0000     
+     12.1238   4.0351   1.0000   0.0000   0.0000   1.8038   7.3852   0.0000     
+     -1.0000   0.0000  72.6300   6.2293   5.2294   0.1542   0.8563   0.0000     
+     -3.2224   2.9867   1.0338   6.2998   2.5791   0.0000   0.0000   0.0000     
+ V    2.2657   3.0000  50.9415   1.7992   0.3005   0.6743   0.1000   5.0000     
+     12.3879   5.2243   3.0000   0.0000   0.0000  -0.3628   6.6023   0.0000     
+     -1.0000   0.0000 117.6300  23.1946   6.5795   0.0000   0.8563   0.0000     
+     -3.5389   1.5012   1.0338   3.0000   3.6411   0.0000   0.0000   0.0000     
+ Bi   2.1949   3.0000 208.9804   2.4429   0.1607   0.4960   0.0535   5.0000     
+     12.9571  35.5167   3.0000   0.0000   0.0000  -0.1926   6.4153   0.0000     
+     -1.0000   0.5785  52.6300   3.8978   0.9856   0.0314   0.8563   0.0000     
+     -2.5000   5.0597   1.0338   6.0000   2.5791   0.0000   0.0000   0.0000     
+ Ti   0.1000   4.0000  47.8800   2.0000   0.1659   0.6037   0.1000   4.0000     
+     13.2535   4.0063   4.0000  -5.0000   0.0000  -0.1864   5.9304   0.0000     
+     -1.0000   0.0000 129.6300  22.8461   1.8515   0.0064   0.8563   0.0000     
+     -3.4122   3.2711   1.0338   6.2998   2.2632   0.0000   0.0000   0.0000     
+ Mo   2.4710   5.6504  95.9400   1.8000   0.3285   1.0000   0.1000   6.0000     
+     13.0000  45.0000   4.0000   0.0000   0.0000   0.6062   6.1484   0.0000     
+      0.1000   0.0000 152.6300   3.7659   0.0689   2.9902   0.8563   0.0000     
+    -16.7660   3.1072   1.0338   8.0000   3.4590   0.0000   0.0000   0.0000     
+ X   -0.1000   2.0000   1.0080   2.0000   0.0000   1.0000  -0.1000   6.0000     
+     10.0000   2.5000   4.0000   0.0000   0.0000   8.5000   1.5000   0.0000     
+     -0.1000   0.0000  -2.3700   8.7410  13.3640   0.6690   0.9745   0.0000     
+    -11.0000   2.7466   1.0338   2.0000   2.8793   0.0000   0.0000   0.0000     
+ 40      ! Nr of bonds; Edis1;LPpen;n.u.;pbe1;pbo5;13corr;pbo6                  
+                         pbe2;pbo3;pbo4;n.u.;pbo1;pbo2;ovcorr                   
+  1  1 158.2004  99.1897  78.0000  -0.7738  -0.4550   1.0000  37.6117   0.4147  
+         0.4590  -0.1000   9.1628   1.0000  -0.0777   6.7268   1.0000   0.0000  
+  1  2 169.4760   0.0000   0.0000  -0.6083   0.0000   1.0000   6.0000   0.7652  
+         5.2290   1.0000   0.0000   1.0000  -0.0500   6.9136   0.0000   0.0000  
+  2  2 153.3934   0.0000   0.0000  -0.4600   0.0000   1.0000   6.0000   0.7300  
+         6.2500   1.0000   0.0000   1.0000  -0.0790   6.0552   0.0000   0.0000  
+  1  3 158.6946 107.4583  23.3136  -0.4240  -0.1743   1.0000  10.8209   1.0000  
+         0.5322  -0.3113   7.0000   1.0000  -0.1447   5.2450   0.0000   0.0000  
+  3  3 142.2858 145.0000  50.8293   0.2506  -0.1000   1.0000  29.7503   0.6051  
+         0.3451  -0.1055   9.0000   1.0000  -0.1225   5.5000   1.0000   0.0000  
+  1  4 134.1215 140.2179  79.9745   0.0163  -0.1428   1.0000  27.0617   0.2000  
+         0.1387  -0.3681   7.1611   1.0000  -0.1000   5.0825   1.0000   0.0000  
+  3  4 130.8596 169.4551  40.0000   0.3837  -0.1639   1.0000  35.0000   0.2000  
+         1.0000  -0.3579   7.0004   1.0000  -0.1193   6.8773   1.0000   0.0000  
+  4  4 157.9384  82.5526 152.5336   0.4010  -0.1034   1.0000  12.4261   0.5828  
+         0.1578  -0.1509  11.9186   1.0000  -0.0861   5.4271   1.0000   0.0000  
+  2  3 160.0000   0.0000   0.0000  -0.5725   0.0000   1.0000   6.0000   0.5626  
+         1.1150   1.0000   0.0000   0.0000  -0.0920   4.2790   0.0000   0.0000  
+  2  4 231.8173   0.0000   0.0000  -0.3364   0.0000   1.0000   6.0000   0.4402  
+         8.8910   1.0000   0.0000   1.0000  -0.0327   6.5754   0.0000   0.0000  
+  1  5 128.9942  74.5848  55.2528   0.1035  -0.5211   1.0000  18.9617   0.6000  
+         0.2949  -0.2398   8.1175   1.0000  -0.1029   5.6731   1.0000   0.0000  
+  2  5 151.5159   0.0000   0.0000  -0.4721   0.0000   1.0000   6.0000   0.6000  
+         9.4366   1.0000   0.0000   1.0000  -0.0290   7.0050   1.0000   0.0000  
+  3  5   0.0000   0.0000   0.0000   0.5563  -0.4038   1.0000  49.5611   0.6000  
+         0.4259  -0.4577  12.7569   1.0000  -0.1100   7.1145   1.0000   0.0000  
+  4  5   0.0000   0.0000   0.0000   0.4438  -0.2034   1.0000  40.3399   0.6000  
+         0.3296  -0.3153   9.1227   1.0000  -0.1805   5.6864   1.0000   0.0000  
+  5  5  96.1871  93.7006  68.6860   0.0955  -0.4781   1.0000  17.8574   0.6000  
+         0.2723  -0.2373   9.7875   1.0000  -0.0950   6.4757   1.0000   0.0000  
+  6  6 109.1904  70.8314  30.0000   0.2765  -0.3000   1.0000  16.0000   0.1583  
+         0.2804  -0.1994   8.1117   1.0000  -0.0675   8.2993   0.0000   0.0000  
+  2  6 137.1002   0.0000   0.0000  -0.1902   0.0000   1.0000   6.0000   0.4256  
+        17.7186   1.0000   0.0000   1.0000  -0.0377   6.4281   0.0000   0.0000  
+  3  6 191.1743  52.0733  43.3991  -0.2584  -0.3000   1.0000  36.0000   0.8764  
+         1.0248  -0.3658   4.2151   1.0000  -0.5004   4.2605   1.0000   0.0000  
+  4  6 185.4488  39.2832  43.3991  -0.1922  -0.3000   1.0000  36.0000   0.8217  
+         0.8538  -0.3887   4.4334   1.0000  -0.5241   4.4529   1.0000   0.0000  
+  7  7  90.1462   0.0000   0.0000   0.0004  -0.2000   0.0000  16.0000   0.3484  
+         1.0000  -0.2000  15.0000   1.0000  -0.1014   5.7631   0.0000   0.0000  
+  8  8  85.2900   0.0000   0.0000   0.0004  -0.2000   0.0000  16.0000   0.5438  
+         1.0000  -0.2000  15.0000   1.0000  -0.1001   5.5699   0.0000   0.0000  
+  9  9  73.6182   0.0000   0.0000   0.0004  -0.2000   0.0000  16.0000   0.3418  
+         1.0000  -0.2000  15.0000   1.0000  -0.1015   5.7850   0.0000   0.0000  
+ 10 10  73.6182   0.0000   0.0000   0.0004  -0.2000   0.0000  16.0000   0.3418  
+         1.0000  -0.2000  15.0000   1.0000  -0.1015   5.7850   0.0000   0.0000  
+ 11 11  36.2751   0.0000   0.0000   0.8059  -0.3000   0.0000  16.0000   0.1826  
+         0.3414  -0.3000  16.0000   1.0000  -0.0717   7.9108   0.0000   0.0000  
+  3 11 106.8008  67.5543   0.0000   0.0323  -0.3000   1.0000  36.0000   0.1000  
+         0.2670  -0.3402  16.0000   1.0000  -0.1761   4.6698   1.0000   0.0000  
+  2 11   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.2500  20.0000   1.0000  -0.2578   6.5219   1.0000   0.0000  
+  1 11   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.2500  20.0000   1.0000  -0.2578   6.5219   1.0000   0.0000  
+ 12 12  66.0677   0.0000   0.0000  -0.9557  -0.2000   0.0000  16.0000   0.2865  
+         0.5847  -0.2000  15.0000   1.0000  -0.0856   5.2857   0.0000   0.0000  
+  3 12 152.2407  57.6204   0.0000  -0.8033  -0.3000   1.0000  36.0000   0.0498  
+         1.8097  -0.3800  16.0000   1.0000  -0.2379   8.0000   1.0000   0.0000  
+  2 12  95.9209   0.0000   0.0000  -0.0153  -0.3000   1.0000  36.0000   0.0100  
+         1.0000  -0.2062   8.6647   1.0000  -0.1911   4.0000   1.0000   0.0000  
+  1 12  78.9091  40.6322   0.0000   0.0040  -0.3000   1.0000  36.0000   0.0384  
+         0.0904  -0.1209  12.3682   1.0000  -0.1613   4.3849   1.0000   0.0000  
+ 13 13  71.3016  10.0000   0.0000  -0.1571  -0.2000   0.0000  16.0000   0.3311  
+         0.1822  -0.2000  15.0000   1.0000  -0.1860   6.5172   0.0000   0.0000  
+  3 13 112.7130  29.8084   0.0000  -0.9010  -0.3000   1.0000  36.0000   0.5508  
+         0.1006  -0.2492  16.9476   1.0000  -0.1919   5.4797   1.0000   0.0000  
+  1 13   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.2500  20.0000   1.0000  -0.2578   6.5219   1.0000   0.0000  
+  2 13   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.2500  20.0000   1.0000  -0.2578   6.5219   1.0000   0.0000  
+  1 14   0.5356   0.9614   0.0000   0.3817  -0.3000   1.0000  36.0000   0.2142  
+         0.6116  -0.2579   6.1366   1.0000  -0.0913   6.6008   1.0000   0.0000  
+  2 14   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.3027   4.6243   1.0000  -0.4578   3.5219   1.0000   0.0000  
+  3 14 112.7070  10.0000 135.5011   0.9277  -0.2354   1.0000  19.1731   1.2334  
+         0.9822  -0.1837   7.2216   1.0000  -0.1264   6.1257   1.0000   0.0000  
+ 14 14  44.6382   0.0000   0.0000   1.0000  -0.3000   0.0000  16.0000   0.2890  
+         0.3384  -0.3000  16.0000   1.0000  -0.1862   7.4588   0.0000   0.0000  
+ 12 14  50.0000   0.0000   0.0000   0.1000  -0.3000   0.0000  16.0000   0.3000  
+         1.0000  -0.3000  16.0000   1.0000  -0.2000   8.0000   0.0000   0.0000  
+ 20    ! Nr of off-diagonal terms; Ediss;Ro;gamma;rsigma;rpi;rpi2               
+  1  2   0.1239   1.4004   9.8467   1.1210  -1.0000  -1.0000                    
+  2  3   0.0283   1.2885  10.9190   0.9215  -1.0000  -1.0000                    
+  2  4   0.1059   1.8290   9.7818   0.9598  -1.0000  -1.0000                    
+  1  3   0.1156   1.8520   9.8317   1.2854   1.1352   1.0706                    
+  1  4   0.1447   1.8766   9.7990   1.3436   1.1885   1.1363                    
+  3  4   0.1048   2.0003  10.1220   1.3173   1.1096   1.0206                    
+  2  6   0.0470   1.6738  11.6877   1.1931  -1.0000  -1.0000                    
+  3  6   0.1263   1.8163  10.6833   1.6266   1.2052  -1.0000                    
+  1 11   0.1995   2.2133  13.0000   0.0102   1.4868  -1.0000                    
+  2 11   0.1319   1.5855  12.5457   0.0099   1.5065  -1.0000                    
+  3 11   0.0813   1.8649  10.8791   1.6498   1.6445  -1.0000                    
+  1 12   0.4235   1.7716  11.3664   1.8000   1.7212  -1.0000                    
+  2 12   0.0754   1.6033  12.4204   1.6896  -1.5000  -1.0000                    
+  3 12   0.1648   2.1260  11.2425   2.0692   1.6939  -1.0000                    
+  2 13   0.1340   1.8546  11.5784   1.0000  -1.0000  -1.0000                    
+  3 13   0.1280   1.8000  10.5743   1.7358   1.5296  -1.0000                    
+  1 13   0.1301   1.9382  11.1255   0.0100  -1.0000  -1.0000                    
+  1 14   0.1495   2.0794  12.2376   0.0100   1.4060  -1.0000                    
+  2 14   0.0795   1.6794  11.2376   0.0100   1.2060  -1.0000                    
+  3 14   0.2101   2.0342  10.4729   1.6019   1.4781   1.6548                    
+ 97    ! Nr of angles;at1;at2;at3;Thetao,o;ka;kb;pv1;pv2                        
+  1  1  1  59.0573  30.7029   0.7606   0.0000   0.7180   6.2933   1.1244        
+  1  1  2  65.7758  14.5234   6.2481   0.0000   0.5665   0.0000   1.6255        
+  2  1  2  70.2607  25.2202   3.7312   0.0000   0.0050   0.0000   2.7500        
+  1  2  2   0.0000   0.0000   6.0000   0.0000   0.0000   0.0000   1.0400        
+  1  2  1   0.0000   3.4110   7.7350   0.0000   0.0000   0.0000   1.0400        
+  2  2  2   0.0000  27.9213   5.8635   0.0000   0.0000   0.0000   1.0400        
+  1  1  3  49.6811   7.1713   4.3889   0.0000   0.7171  10.2661   1.0463        
+  3  1  3  77.7473  40.1718   2.9802 -25.3063   1.6170 -46.1315   2.2503        
+  1  1  4  66.1305  12.4661   7.0000   0.0000   3.0000  50.0000   1.1880        
+  3  1  4  73.9544  12.4661   7.0000   0.0000   3.0000   0.0000   1.1880        
+  4  1  4  64.1581  12.4661   7.0000   0.0000   3.0000   0.0000   1.1880        
+  2  1  3  65.0000  13.8815   5.0583   0.0000   0.4985   0.0000   1.4900        
+  2  1  4  74.2929  31.0883   2.6184   0.0000   0.0755   0.0000   1.0500        
+  1  2  4   0.0000   0.0019   6.3000   0.0000   0.0000   0.0000   1.0400        
+  1  3  1  73.5312  44.7275   0.7354   0.0000   3.0000   0.0000   1.0684        
+  1  3  3  79.4761  36.3701   1.8943   0.0000   0.7351  67.6777   3.0000        
+  1  3  4  82.4890  31.4554   0.9953   0.0000   1.6310   0.0000   1.0783        
+  3  3  3  80.7324  30.4554   0.9953   0.0000   1.6310  50.0000   1.0783        
+  3  3  4  84.3637  31.4554   0.9953   0.0000   1.6310   0.0000   1.0783        
+  4  3  4  89.7071  31.4554   0.9953   0.0000   1.6310   0.0000   1.1519        
+  1  3  2  70.1880  20.9562   0.3864   0.0000   0.0050   0.0000   1.6924        
+  2  3  3  75.6935  50.0000   2.0000   0.0000   1.0000   0.0000   1.1680        
+  2  3  4  75.6201  18.7919   0.9833   0.0000   0.1218   0.0000   1.0500        
+  2  3  2  85.8000   9.8453   2.2720   0.0000   2.8635   0.0000   1.5800        
+  1  4  1  66.0330  22.0295   1.4442   0.0000   1.6777   0.0000   1.0500        
+  1  4  3 103.3204  33.0381   0.5787   0.0000   1.6777   0.0000   1.0500        
+  1  4  4 104.1335   8.6043   1.6495   0.0000   1.6777   0.0000   1.0500        
+  3  4  3  74.1978  42.1786   1.7845 -18.0069   1.6777   0.0000   1.0500        
+  3  4  4  74.8600  43.7354   1.1572  -0.9193   1.6777   0.0000   1.0500        
+  4  4  4  75.0538  14.8267   5.2794   0.0000   1.6777   0.0000   1.0500        
+  1  4  2  69.1106  25.5067   1.1003   0.0000   0.0222   0.0000   1.0369        
+  2  4  3  81.3686  40.0712   2.2396   0.0000   0.0222   0.0000   1.0369        
+  2  4  4  83.0104  43.4766   1.5328   0.0000   0.0222   0.0000   1.0500        
+  2  4  2  70.8687  12.0168   5.0132   0.0000   0.0222   0.0000   1.1243        
+  1  2  3   0.0000  25.0000   3.0000   0.0000   1.0000   0.0000   1.0400        
+  1  2  4   0.0000   0.0019   6.0000   0.0000   0.0000   0.0000   1.0400        
+  1  2  5   0.0000   0.0019   6.0000   0.0000   0.0000   0.0000   1.0400        
+  3  2  3   0.0000  15.0000   2.8900   0.0000   0.0000   0.0000   2.8774        
+  3  2  4   0.0000   0.0019   6.0000   0.0000   0.0000   0.0000   1.0400        
+  4  2  4   0.0000   0.0019   6.0000   0.0000   0.0000   0.0000   1.0400        
+  2  2  3   0.0000   8.5744   3.0000   0.0000   0.0000   0.0000   1.0421        
+  2  2  4   0.0000   0.0019   6.0000   0.0000   0.0000   0.0000   1.0400        
+  1  1  5  74.9397  25.0560   1.8787   0.1463   0.0559   0.0000   1.0400        
+  1  5  1  86.9521  36.9951   2.0903   0.1463   0.0559   0.0000   1.0400        
+  2  1  5  74.9397  25.0560   1.8787   0.0000   0.0000   0.0000   1.0400        
+  1  5  2  86.1791  36.9951   2.0903   0.0000   0.0000   0.0000   1.0400        
+  1  5  5  85.3644  36.9951   2.0903   0.1463   0.0559   0.0000   1.0400        
+  2  5  2  93.1959  36.9951   2.0903   0.0000   0.0000   0.0000   1.0400        
+  2  5  5  84.3331  36.9951   2.0903   0.0000   0.0000   0.0000   1.0400        
+  6  6  6  69.3456  21.7361   1.4283   0.0000  -0.2101   0.0000   1.3241        
+  2  6  6  75.6168  21.5317   1.0435   0.0000   2.5179   0.0000   1.0400        
+  2  6  2  78.3939  20.9772   0.8630   0.0000   2.8421   0.0000   1.0400        
+  3  6  6  70.3016  15.4081   1.3267   0.0000   2.1459   0.0000   1.0400        
+  2  6  3  73.8232  16.6592   3.7425   0.0000   0.8613   0.0000   1.0400        
+  3  6  3  90.0344   7.7656   1.7264   0.0000   0.7689   0.0000   1.0400        
+  6  3  6  22.1715   3.6615   0.3160   0.0000   4.1125   0.0000   1.0400        
+  2  3  6  83.7634   5.6693   2.7780   0.0000   1.6982   0.0000   1.0400        
+  3  3  6  73.4663  25.0761   0.9143   0.0000   2.2466   0.0000   1.0400        
+  2  2  6   0.0000  47.1300   6.0000   0.0000   1.6371   0.0000   1.0400        
+  6  2  6   0.0000  31.5209   6.0000   0.0000   1.6371   0.0000   1.0400        
+  3  2  6   0.0000  31.0427   4.5625   0.0000   1.6371   0.0000   1.0400        
+  2  2  5   0.0000   0.0019   6.0000   0.0000   0.0000   0.0000   1.0400        
+  3 11  3  62.4906  31.5023   1.3328   0.0000   2.8731   0.0000   1.0794        
+ 11  3 11  31.0790  19.3435   0.4919   0.0000   2.9625   0.0000   3.0000        
+  3  3 11 100.0000  14.7642   7.0000   0.0000   1.0585   0.0000   1.1599        
+  1  3 11  60.7895  13.6681   0.7546   0.0000   2.1747   0.0000   2.9508        
+  2  3 11 100.0000   5.0000   1.4335   0.0000   1.2363   0.0000   5.0000        
+  3 12 12  23.8296   8.9089   7.0000   0.0000   1.0000   0.0000   2.8891        
+  3 12  3  87.0764  19.4489   2.5080   0.0000   2.6056   0.0000   3.0000        
+ 12  3 12  72.7369  13.7522   5.0243   0.0000   2.9700   0.0000   1.5506        
+  3  3 12  68.8771  10.5000   2.5500   0.0000   2.5729   0.0000   1.5892        
+  2  3 12  99.5836   5.4142   2.2105   0.0000   1.0513   0.0000   1.1000        
+  1  3 12  90.0000  12.1772   2.2055   0.0000   1.9064   0.0000   2.6056        
+  1  1 12  71.1708  32.6379   0.4516   0.0000   2.1609   0.0000   1.1000        
+  1 12  3  90.0000  45.0000   0.9335   0.0000   0.2140   0.0000   1.4846        
+  1 12  1  87.6204  45.0000   1.2740   0.0000   1.1519   0.0000   1.1000        
+  3  1 12  54.7020   3.2967   7.0000   0.0000   2.0408   0.0000   2.4032        
+  2 12  3  90.0000  28.2099   1.8036   0.0000   1.5461   0.0000   1.2304        
+  2 12  2  90.0000  36.3001   0.6409   0.0000   3.0000   0.0000   1.7755        
+  1 12  2  89.5835  45.0000   0.8465   0.0000   1.2118   0.0000   2.2282        
+  2  1 12  68.7714  22.9669   0.4631   0.0000   2.4269   0.0000   1.4680        
+  2  2 12   0.0000  30.2898   3.9181   0.0000   0.9914   0.0000   1.3121        
+  3  2 12   0.0000   1.0000   4.1706   0.0000   1.0100   0.0000   1.1000        
+  1  2 12   0.0000   1.0000   3.9722   0.0000   1.0075   0.0000   1.2984        
+  3 13  3  73.6321  10.6453   2.7693   0.0000   0.0500   0.0000   1.9906        
+ 13  3 13 100.0000   5.0270   5.0000   0.0000   1.2768   0.0000   2.0630        
+  3  3 13  52.3127  40.0000   1.1362   0.0000   1.5100   0.0000   1.1000        
+  3 13 13  66.6695   0.0036   3.2646   0.0000   0.0581   0.0000   1.3741        
+  2  3 13 100.0000   3.8927   8.0000   0.0000   2.0000   0.0000   1.1000        
+  1  3 13  96.6040   9.4537   8.0000   0.0000   0.3285   0.0000   4.0000        
+  3 14  3  79.6765  50.0000   1.0502  -0.0016   0.1000   0.0000   1.4583        
+ 14  3 14  20.2100  37.6165   0.6059   0.0000   0.1531   0.0000   2.0586        
+  3  3 14  38.5570  11.9307   0.9911   0.0000   0.8422   0.0000   1.0500        
+  3 14 14   5.8342   0.0724   0.1000   0.0000   0.5490   0.0000   1.7839        
+  2  3 14  81.8943   7.2820   2.1490   0.0000   0.6873   0.0000   3.2184        
+  1  3 14  75.5634   8.3289   1.0236   0.0000   2.0875   0.0000   1.0500        
+ 12  3 14  30.0000   5.0000   0.5000   0.0000   0.5000   0.0000   1.2500        
+ 47    ! Nr of torsions;at1;at2;at3;at4;;V1;V2;V3;V2(BO);vconj;n.u;n            
+  1  1  1  1  -0.2500  34.7453   0.0288  -6.3507  -1.6000   0.0000   0.0000     
+  1  1  1  2  -0.2500  29.2131   0.2945  -4.9581  -2.1802   0.0000   0.0000     
+  2  1  1  2  -0.2500  31.2081   0.4539  -4.8923  -2.2677   0.0000   0.0000     
+  1  1  1  3  -0.3495  22.2142  -0.2959  -2.5000  -1.9066   0.0000   0.0000     
+  2  1  1  3   0.0646  24.3195   0.6259  -3.9603  -1.0000   0.0000   0.0000     
+  3  1  1  3  -0.5456   5.5756   0.8433  -5.1924  -1.0180   0.0000   0.0000     
+  1  1  3  1   1.7555  27.9267   0.0072  -2.6533  -1.0000   0.0000   0.0000     
+  1  1  3  2  -1.4358  36.7830  -1.0000  -8.1821  -1.0000   0.0000   0.0000     
+  2  1  3  1  -1.3959  34.5053   0.7200  -2.5714  -2.1641   0.0000   0.0000     
+  2  1  3  2  -2.5000  70.0597   1.0000  -3.5539  -2.9929   0.0000   0.0000     
+  1  1  3  3   0.6852  11.2819  -0.4784  -2.5000  -2.1085   0.0000   0.0000     
+  2  1  3  3   0.1933  80.0000   1.0000  -4.0590  -3.0000   0.0000   0.0000     
+  3  1  3  1  -1.9889  76.4820  -0.1796  -3.8301  -3.0000   0.0000   0.0000     
+  3  1  3  2   0.2160  72.7707  -0.7087  -4.2100  -3.0000   0.0000   0.0000     
+  3  1  3  3  -2.5000  71.0772   0.2542  -3.1631  -3.0000   0.0000   0.0000     
+  1  3  3  1   2.5000  -0.6002   1.0000  -3.4297  -2.8858   0.0000   0.0000     
+  1  3  3  2  -2.5000  -3.3822   0.7004  -5.4467  -2.9586   0.0000   0.0000     
+  2  3  3  2   2.5000  -4.0000   0.9000  -2.5000  -1.0000   0.0000   0.0000     
+  1  3  3  3   1.2329  -4.0000   1.0000  -2.5000  -1.7479   0.0000   0.0000     
+  2  3  3  3   0.8302  -4.0000  -0.7763  -2.5000  -1.0000   0.0000   0.0000     
+  3  3  3  3  -2.5000  -4.0000   1.0000  -2.5000  -1.0000   0.0000   0.0000     
+  0  1  2  0   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000     
+  0  2  2  0   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000     
+  0  2  3  0   0.0000   0.1000   0.0200  -2.5415   0.0000   0.0000   0.0000     
+  0  1  1  0   0.0000  50.0000   0.3000  -4.0000  -2.0000   0.0000   0.0000     
+  0  3  3  0   0.5511  25.4150   1.1330  -5.1903  -1.0000   0.0000   0.0000     
+  0  1  4  0  -2.4242 128.1636   0.3739  -6.6098  -2.0000   0.0000   0.0000     
+  0  2  4  0   0.0000   0.1000   0.0200  -2.5415   0.0000   0.0000   0.0000     
+  0  3  4  0   1.4816  55.6641   0.0004  -7.0465  -2.7203   0.0000   0.0000     
+  0  4  4  0  -0.3244  27.7086   0.0039  -2.8272  -2.0000   0.0000   0.0000     
+  4  1  4  4  -5.5181   8.9706   0.0004  -6.1782  -2.0000   0.0000   0.0000     
+  0  1  5  0   3.3423  30.3435   0.0365  -2.7171   0.0000   0.0000   0.0000     
+  0  5  5  0  -0.0555 -42.7738   0.1515  -2.2056   0.0000   0.0000   0.0000     
+  0  2  5  0   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000     
+  0  6  6  0   0.0000   0.0000   0.1200  -2.4426   0.0000   0.0000   0.0000     
+  0  2  6  0   0.0000   0.0000   0.1200  -2.4847   0.0000   0.0000   0.0000     
+  0  3  6  0   0.0000   0.0000   0.1200  -2.4703   0.0000   0.0000   0.0000     
+  2  1  3 14   1.6297  56.8132   0.3398  -2.6912  -2.1000   0.0000   0.0000     
+  1  1  3 14  -0.0427  13.4096   0.9351  -6.5245  -2.1000   0.0000   0.0000     
+  2  3 14  3   2.5000  11.6208   1.0000  -9.0000  -1.0000   0.0000   0.0000     
+  2  1  3 12  -0.2500  45.7639   0.3000  -3.5745  -2.1565   0.0000   0.0000     
+  1  1  3 12  -0.2500  69.1094   0.3000  -3.0983  -2.1565   0.0000   0.0000     
+  2  3 12  3  -0.4306   7.5000  -0.5000  -6.9948  -1.0000   0.0000   0.0000     
+  2  3 11  3   1.8627   9.7180  -1.0000  -7.2224  -1.0000   0.0000   0.0000     
+  1  3 11  3   2.5000  23.9443   1.0000  -3.2267  -1.0000   0.0000   0.0000     
+  1  1  3 11   0.9114  62.5039  -0.2389  -3.2976  -1.0000   0.0000   0.0000     
+  2  1  3 11   0.5000  35.0000   0.5000  -4.0000  -1.0000   0.0000   0.0000     
+  9    ! Nr of hydrogen bonds;at1;at2;at3;Rhb;Dehb;vhb1                         
+  3  2  3   2.1200  -3.5800   1.4500  19.5000                                   
+  3  2  4   2.0000  -6.0000   1.7976   3.0000                                   
+  4  2  3   1.2000  -2.0000   1.7976   3.0000                                   
+  4  2  4   1.2979  -6.0000   1.7976   3.0000                                   
+  3  2  5   1.5000  -2.0000   1.7976   3.0000                                   
+  4  2  5   1.5000  -2.0000   1.7976   3.0000                                   
+  5  2  3   1.5000  -2.0000   1.7976   3.0000                                   
+  5  2  4   1.5000  -2.0000   1.7976   3.0000                                   
+  5  2  5   1.5000  -2.0000   1.7976   3.0000                                   
diff --git a/data/benchmarks/water/ffield_acks2_300.water b/data/benchmarks/water/ffield_acks2_300.water
new file mode 100644
index 0000000000000000000000000000000000000000..d4d7b7b94777b6c456acb5bc1ad1e40dcfaa4da0
--- /dev/null
+++ b/data/benchmarks/water/ffield_acks2_300.water
@@ -0,0 +1,364 @@
+Reactive MD-force field: Water                                                  
+ 39       ! Number of general parameters                                        
+   50.0000 !Overcoordination parameter                                          
+    9.5469 !Overcoordination parameter                                          
+   26.5405 !Valency angle conjugation parameter                                 
+    1.7224 !Triple bond stabilisation parameter                                 
+    6.8702 !Triple bond stabilisation parameter                                 
+   60.4850 !C2-correction                                                       
+    1.0588 !Undercoordination parameter                                         
+    4.6000 !Triple bond stabilisation parameter                                 
+   12.1176 !Undercoordination parameter                                         
+   13.3056 !Undercoordination parameter                                         
+  -70.5044 !Triple bond stabilization energy                                    
+    0.0000 !Lower Taper-radius                                                  
+   10.0000 !Upper Taper-radius                                                  
+    2.8793 !Not used                                                            
+   33.8667 !Valency undercoordination                                           
+    6.0891 !Valency angle/lone pair parameter                                   
+    1.0563 !Valency angle                                                       
+    2.0384 !Valency angle parameter                                             
+    6.1431 !Not used                                                            
+    6.9290 !Double bond/angle parameter                                         
+    0.3989 !Double bond/angle parameter: overcoord                              
+    3.9954 !Double bond/angle parameter: overcoord                              
+   -2.4837 !Not used                                                            
+    5.7796 !Torsion/BO parameter                                                
+   10.0000 !Torsion overcoordination                                            
+    1.9487 !Torsion overcoordination                                            
+   -1.2327 !Conjugation 0 (not used)                                            
+    2.1645 !Conjugation                                                         
+    1.5591 !vdWaals shielding                                                   
+    0.1000 !Cutoff for bond order (*100)                                        
+    2.1365 !Valency angle conjugation parameter                                 
+    0.6991 !Overcoordination parameter                                          
+   50.0000 !Overcoordination parameter                                          
+    1.8512 !Valency/lone pair parameter                                         
+  548.6451 !Softness                                                            
+   20.0000 !Not used                                                            
+    5.0000 !Molecular energy (not used)                                         
+    0.0000 !Molecular energy (not used)                                         
+    2.6962 !Valency angle conjugation parameter                                 
+ 15    ! Nr of atoms; cov.r; valency;a.m;Rvdw;Evdw;gammaEEM;cov.r2;#            
+            alfa;gammavdW;valency;Eunder;Eover;chiEEM;etaEEM;n.u.               
+            cov r3;Elp;Heat inc.;n.u.;n.u.;n.u.;n.u.                            
+            ov/un;val1;n.u.;val3,vval4                                          
+ C    1.3817   4.0000  12.0000   1.8903   0.1838   0.9000   1.1341   4.0000     
+      9.7559   2.1346   4.0000  34.9350  79.5548   5.9666   7.0000   0.0000     
+      1.2114   0.0000 202.5551   8.9539  34.9289  13.5366   0.8563   0.0000     
+     -2.8983   2.5000   1.0564   4.0000   2.9663   0.0000   0.0000   0.0000     
+ H    0.8873   1.0000   1.0080   1.5420   0.0598   0.6883  -0.1000   1.0000     
+      8.1910  30.9706   1.0000   0.0000 121.1250   3.5768  10.5896   1.0000     
+     -0.1000   0.0000  61.6606   1.3986   2.1457   0.0003   3.4114   0.0000     
+    -15.7683   2.1488   1.0338   1.0000   2.8793   0.0000   0.0000   0.0000     
+ O    1.2450   2.0000  15.9990   2.3878   0.1023   1.0903   1.0548   6.0000     
+     10.5750  32.3923   4.0000  37.5000 116.0768   8.5000   7.5600   2.0000     
+      0.9049  -1.0100  59.0626   2.7162   3.2532   0.0021   0.9745   0.0000     
+     -3.6141   2.7025   1.0493   4.0000   2.9225   0.0000   5.4479   0.0000     
+ N    1.2333   3.0000  14.0000   1.9324   0.1376   0.8596   1.1748   5.0000     
+     10.0667   7.8431   4.0000  32.2482 100.0000   6.8418   6.3404   2.0000     
+      1.0433  13.7673 119.9837   2.1961   3.0696   2.7683   0.9745   0.0000     
+     -4.3875   2.6192   1.0183   4.0000   2.8793   0.0000   0.0000   0.0000     
+ S    1.9405   2.0000  32.0600   2.0677   0.2099   1.0336   1.5479   6.0000     
+      9.9575   4.9055   4.0000  52.9998 112.1416   6.5000   8.2545   2.0000     
+      1.4601   9.7177  71.1843   5.7487  23.2859  12.7147   0.9745   0.0000     
+    -11.0000   2.7466   1.0338   6.2998   2.8793   0.0000   0.0000   0.0000     
+ Si   2.0276   4.0000  28.0600   2.2042   0.1322   0.8218   1.5758   4.0000     
+     11.9413   2.0618   4.0000  11.8211 136.4845   1.8038   7.3852   0.0000     
+     -1.0000   0.0000 126.5182   6.4918   8.5961   0.2368   0.8563   0.0000     
+     -3.8112   3.1873   1.0338   6.2998   2.5791   0.0000   0.0000   0.0000     
+ Pt   1.9907   3.0000 195.0800   1.9980   0.2452   0.8218  -1.0000   3.0000     
+     12.8669   3.2118   3.0000   0.0000   0.0000   1.8038   7.3852   0.0000     
+     -1.0000   0.0000 142.6300   6.2293   5.2294   0.1542   0.8563   0.0000     
+     -6.7740   2.9867   1.0338   6.2998   2.5791   0.0000   0.0000   0.0000     
+ Zr   2.1000   4.0000  91.2240   2.1970   0.2542   0.8218  -1.0000   4.0000     
+     12.8545   3.5938   4.0000   0.0000   0.0000   1.8038   7.3852   0.0000     
+     -1.0000   0.0000 107.6300   6.2293   5.2294   0.1542   0.8563   0.0000     
+     -3.2224   2.9867   1.0338   6.2998   2.5791   0.0000   0.0000   0.0000     
+ Ni   1.8503   2.0000  58.6900   1.9219   0.1582   0.8218  -1.0000   2.0000     
+     12.1238   4.0351   2.0000   0.0000   0.0000   1.8038   7.3852   0.0000     
+     -1.0000   0.0000  95.6300   6.2293   5.2294   0.1542   0.8563   0.0000     
+     -3.2224   2.9867   1.0338   6.2998   2.5791   0.0000   0.0000   0.0000     
+ Au   1.8503   1.0000 196.9665   1.9219   0.1582   0.8218  -1.0000   1.0000     
+     12.1238   4.0351   1.0000   0.0000   0.0000   1.8038   7.3852   0.0000     
+     -1.0000   0.0000  72.6300   6.2293   5.2294   0.1542   0.8563   0.0000     
+     -3.2224   2.9867   1.0338   6.2998   2.5791   0.0000   0.0000   0.0000     
+ V    2.2657   3.0000  50.9415   1.7992   0.3005   0.6743   0.1000   5.0000     
+     12.3879   5.2243   3.0000   0.0000   0.0000  -0.3628   6.6023   0.0000     
+     -1.0000   0.0000 117.6300  23.1946   6.5795   0.0000   0.8563   0.0000     
+     -3.5389   1.5012   1.0338   3.0000   3.6411   0.0000   0.0000   0.0000     
+ Bi   2.1949   3.0000 208.9804   2.4429   0.1607   0.4960   0.0535   5.0000     
+     12.9571  35.5167   3.0000   0.0000   0.0000  -0.1926   6.4153   0.0000     
+     -1.0000   0.5785  52.6300   3.8978   0.9856   0.0314   0.8563   0.0000     
+     -2.5000   5.0597   1.0338   6.0000   2.5791   0.0000   0.0000   0.0000     
+ Ti   0.1000   4.0000  47.8800   2.0000   0.1659   0.6037   0.1000   4.0000     
+     13.2535   4.0063   4.0000  -5.0000   0.0000  -0.1864   5.9304   0.0000     
+     -1.0000   0.0000 129.6300  22.8461   1.8515   0.0064   0.8563   0.0000     
+     -3.4122   3.2711   1.0338   6.2998   2.2632   0.0000   0.0000   0.0000     
+ Mo   2.4710   5.6504  95.9400   1.8000   0.3285   1.0000   0.1000   6.0000     
+     13.0000  45.0000   4.0000   0.0000   0.0000   0.6062   6.1484   0.0000     
+      0.1000   0.0000 152.6300   3.7659   0.0689   2.9902   0.8563   0.0000     
+    -16.7660   3.1072   1.0338   8.0000   3.4590   0.0000   0.0000   0.0000     
+ X   -0.1000   2.0000   1.0080   2.0000   0.0000   1.0000  -0.1000   6.0000     
+     10.0000   2.5000   4.0000   0.0000   0.0000   8.5000   1.5000   0.0000     
+     -0.1000   0.0000  -2.3700   8.7410  13.3640   0.6690   0.9745   0.0000     
+    -11.0000   2.7466   1.0338   2.0000   2.8793   0.0000   0.0000   0.0000     
+ 40      ! Nr of bonds; Edis1;LPpen;n.u.;pbe1;pbo5;13corr;pbo6                  
+                         pbe2;pbo3;pbo4;n.u.;pbo1;pbo2;ovcorr                   
+  1  1 158.2004  99.1897  78.0000  -0.7738  -0.4550   1.0000  37.6117   0.4147  
+         0.4590  -0.1000   9.1628   1.0000  -0.0777   6.7268   1.0000   0.0000  
+  1  2 169.4760   0.0000   0.0000  -0.6083   0.0000   1.0000   6.0000   0.7652  
+         5.2290   1.0000   0.0000   1.0000  -0.0500   6.9136   0.0000   0.0000  
+  2  2 166.8880   0.0000   0.0000  -0.2191   0.0000   1.0000   6.0000   1.0000  
+         6.1152   1.0000   0.0000   1.0000  -0.0889   6.0000   0.0000   0.0000  
+  1  3 158.6946 107.4583  23.3136  -0.4240  -0.1743   1.0000  10.8209   1.0000  
+         0.5322  -0.3113   7.0000   1.0000  -0.1447   5.2450   0.0000   0.0000  
+  3  3 200.0000 230.7321  50.8293   0.2506  -0.1000   1.0000  29.7503   0.6051  
+         0.3451  -0.1055   9.0000   1.0000  -0.1225   5.5000   1.0000   0.0000  
+  1  4 134.1215 140.2179  79.9745   0.0163  -0.1428   1.0000  27.0617   0.2000  
+         0.1387  -0.3681   7.1611   1.0000  -0.1000   5.0825   1.0000   0.0000  
+  3  4 130.8596 169.4551  40.0000   0.3837  -0.1639   1.0000  35.0000   0.2000  
+         1.0000  -0.3579   7.0004   1.0000  -0.1193   6.8773   1.0000   0.0000  
+  4  4 157.9384  82.5526 152.5336   0.4010  -0.1034   1.0000  12.4261   0.5828  
+         0.1578  -0.1509  11.9186   1.0000  -0.0861   5.4271   1.0000   0.0000  
+  2  3 163.1043   0.0000   0.0000  -0.4155   0.0000   1.0000   6.0000   0.3607  
+         1.9380   1.0000   0.0000   0.0000  -0.0778   4.3082   0.0000   0.0000  
+  2  4 231.8173   0.0000   0.0000  -0.3364   0.0000   1.0000   6.0000   0.4402  
+         8.8910   1.0000   0.0000   1.0000  -0.0327   6.5754   0.0000   0.0000  
+  1  5 128.9942  74.5848  55.2528   0.1035  -0.5211   1.0000  18.9617   0.6000  
+         0.2949  -0.2398   8.1175   1.0000  -0.1029   5.6731   1.0000   0.0000  
+  2  5 151.5159   0.0000   0.0000  -0.4721   0.0000   1.0000   6.0000   0.6000  
+         9.4366   1.0000   0.0000   1.0000  -0.0290   7.0050   1.0000   0.0000  
+  3  5   0.0000   0.0000   0.0000   0.5563  -0.4038   1.0000  49.5611   0.6000  
+         0.4259  -0.4577  12.7569   1.0000  -0.1100   7.1145   1.0000   0.0000  
+  4  5   0.0000   0.0000   0.0000   0.4438  -0.2034   1.0000  40.3399   0.6000  
+         0.3296  -0.3153   9.1227   1.0000  -0.1805   5.6864   1.0000   0.0000  
+  5  5  96.1871  93.7006  68.6860   0.0955  -0.4781   1.0000  17.8574   0.6000  
+         0.2723  -0.2373   9.7875   1.0000  -0.0950   6.4757   1.0000   0.0000  
+  6  6 109.1904  70.8314  30.0000   0.2765  -0.3000   1.0000  16.0000   0.1583  
+         0.2804  -0.1994   8.1117   1.0000  -0.0675   8.2993   0.0000   0.0000  
+  2  6 137.1002   0.0000   0.0000  -0.1902   0.0000   1.0000   6.0000   0.4256  
+        17.7186   1.0000   0.0000   1.0000  -0.0377   6.4281   0.0000   0.0000  
+  3  6 191.1743  52.0733  43.3991  -0.2584  -0.3000   1.0000  36.0000   0.8764  
+         1.0248  -0.3658   4.2151   1.0000  -0.5004   4.2605   1.0000   0.0000  
+  4  6 185.4488  39.2832  43.3991  -0.1922  -0.3000   1.0000  36.0000   0.8217  
+         0.8538  -0.3887   4.4334   1.0000  -0.5241   4.4529   1.0000   0.0000  
+  7  7  90.1462   0.0000   0.0000   0.0004  -0.2000   0.0000  16.0000   0.3484  
+         1.0000  -0.2000  15.0000   1.0000  -0.1014   5.7631   0.0000   0.0000  
+  8  8  85.2900   0.0000   0.0000   0.0004  -0.2000   0.0000  16.0000   0.5438  
+         1.0000  -0.2000  15.0000   1.0000  -0.1001   5.5699   0.0000   0.0000  
+  9  9  73.6182   0.0000   0.0000   0.0004  -0.2000   0.0000  16.0000   0.3418  
+         1.0000  -0.2000  15.0000   1.0000  -0.1015   5.7850   0.0000   0.0000  
+ 10 10  73.6182   0.0000   0.0000   0.0004  -0.2000   0.0000  16.0000   0.3418  
+         1.0000  -0.2000  15.0000   1.0000  -0.1015   5.7850   0.0000   0.0000  
+ 11 11  36.2751   0.0000   0.0000   0.8059  -0.3000   0.0000  16.0000   0.1826  
+         0.3414  -0.3000  16.0000   1.0000  -0.0717   7.9108   0.0000   0.0000  
+  3 11 106.8008  67.5543   0.0000   0.0323  -0.3000   1.0000  36.0000   0.1000  
+         0.2670  -0.3402  16.0000   1.0000  -0.1761   4.6698   1.0000   0.0000  
+  2 11   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.2500  20.0000   1.0000  -0.2578   6.5219   1.0000   0.0000  
+  1 11   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.2500  20.0000   1.0000  -0.2578   6.5219   1.0000   0.0000  
+ 12 12  66.0677   0.0000   0.0000  -0.9557  -0.2000   0.0000  16.0000   0.2865  
+         0.5847  -0.2000  15.0000   1.0000  -0.0856   5.2857   0.0000   0.0000  
+  3 12 152.2407  57.6204   0.0000  -0.8033  -0.3000   1.0000  36.0000   0.0498  
+         1.8097  -0.3800  16.0000   1.0000  -0.2379   8.0000   1.0000   0.0000  
+  2 12  95.9209   0.0000   0.0000  -0.0153  -0.3000   1.0000  36.0000   0.0100  
+         1.0000  -0.2062   8.6647   1.0000  -0.1911   4.0000   1.0000   0.0000  
+  1 12  78.9091  40.6322   0.0000   0.0040  -0.3000   1.0000  36.0000   0.0384  
+         0.0904  -0.1209  12.3682   1.0000  -0.1613   4.3849   1.0000   0.0000  
+ 13 13  71.3016  10.0000   0.0000  -0.1571  -0.2000   0.0000  16.0000   0.3311  
+         0.1822  -0.2000  15.0000   1.0000  -0.1860   6.5172   0.0000   0.0000  
+  3 13 112.7130  29.8084   0.0000  -0.9010  -0.3000   1.0000  36.0000   0.5508  
+         0.1006  -0.2492  16.9476   1.0000  -0.1919   5.4797   1.0000   0.0000  
+  1 13   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.2500  20.0000   1.0000  -0.2578   6.5219   1.0000   0.0000  
+  2 13   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.2500  20.0000   1.0000  -0.2578   6.5219   1.0000   0.0000  
+  1 14   0.5356   0.9614   0.0000   0.3817  -0.3000   1.0000  36.0000   0.2142  
+         0.6116  -0.2579   6.1366   1.0000  -0.0913   6.6008   1.0000   0.0000  
+  2 14   0.0000   0.0000   0.0000  -0.2872  -0.3000   1.0000  36.0000   0.0082  
+         1.7973  -0.3027   4.6243   1.0000  -0.4578   3.5219   1.0000   0.0000  
+  3 14 112.7070  10.0000 135.5011   0.9277  -0.2354   1.0000  19.1731   1.2334  
+         0.9822  -0.1837   7.2216   1.0000  -0.1264   6.1257   1.0000   0.0000  
+ 14 14  44.6382   0.0000   0.0000   1.0000  -0.3000   0.0000  16.0000   0.2890  
+         0.3384  -0.3000  16.0000   1.0000  -0.1862   7.4588   0.0000   0.0000  
+ 12 14  50.0000   0.0000   0.0000   0.1000  -0.3000   0.0000  16.0000   0.3000  
+         1.0000  -0.3000  16.0000   1.0000  -0.2000   8.0000   0.0000   0.0000  
+ 20    ! Nr of off-diagonal terms; Ediss;Ro;gamma;rsigma;rpi;rpi2               
+  1  2   0.1239   1.4004   9.8467   1.1210  -1.0000  -1.0000                    
+  2  3   0.0299   1.3153  10.9102   0.9093  -1.0000  -1.0000                    
+  2  4   0.1059   1.8290   9.7818   0.9598  -1.0000  -1.0000                    
+  1  3   0.1156   1.8520   9.8317   1.2854   1.1352   1.0706                    
+  1  4   0.1447   1.8766   9.7990   1.3436   1.1885   1.1363                    
+  3  4   0.1048   2.0003  10.1220   1.3173   1.1096   1.0206                    
+  2  6   0.0470   1.6738  11.6877   1.1931  -1.0000  -1.0000                    
+  3  6   0.1263   1.8163  10.6833   1.6266   1.2052  -1.0000                    
+  1 11   0.1995   2.2133  13.0000   0.0102   1.4868  -1.0000                    
+  2 11   0.1319   1.5855  12.5457   0.0099   1.5065  -1.0000                    
+  3 11   0.0813   1.8649  10.8791   1.6498   1.6445  -1.0000                    
+  1 12   0.4235   1.7716  11.3664   1.8000   1.7212  -1.0000                    
+  2 12   0.0754   1.6033  12.4204   1.6896  -1.5000  -1.0000                    
+  3 12   0.1648   2.1260  11.2425   2.0692   1.6939  -1.0000                    
+  2 13   0.1340   1.8546  11.5784   1.0000  -1.0000  -1.0000                    
+  3 13   0.1280   1.8000  10.5743   1.7358   1.5296  -1.0000                    
+  1 13   0.1301   1.9382  11.1255   0.0100  -1.0000  -1.0000                    
+  1 14   0.1495   2.0794  12.2376   0.0100   1.4060  -1.0000                    
+  2 14   0.0795   1.6794  11.2376   0.0100   1.2060  -1.0000                    
+  3 14   0.2101   2.0342  10.4729   1.6019   1.4781   1.6548                    
+ 97    ! Nr of angles;at1;at2;at3;Thetao,o;ka;kb;pv1;pv2                        
+  1  1  1  59.0573  30.7029   0.7606   0.0000   0.7180   6.2933   1.1244        
+  1  1  2  65.7758  14.5234   6.2481   0.0000   0.5665   0.0000   1.6255        
+  2  1  2  70.2607  25.2202   3.7312   0.0000   0.0050   0.0000   2.7500        
+  1  2  2   0.0000   0.0000   6.0000   0.0000   0.0000   0.0000   1.0400        
+  1  2  1   0.0000   3.4110   7.7350   0.0000   0.0000   0.0000   1.0400        
+  2  2  2   0.0000  27.9213   5.8635   0.0000   0.0000   0.0000   1.0400        
+  1  1  3  49.6811   7.1713   4.3889   0.0000   0.7171  10.2661   1.0463        
+  3  1  3  77.7473  40.1718   2.9802 -25.3063   1.6170 -46.1315   2.2503        
+  1  1  4  66.1305  12.4661   7.0000   0.0000   3.0000  50.0000   1.1880        
+  3  1  4  73.9544  12.4661   7.0000   0.0000   3.0000   0.0000   1.1880        
+  4  1  4  64.1581  12.4661   7.0000   0.0000   3.0000   0.0000   1.1880        
+  2  1  3  65.0000  13.8815   5.0583   0.0000   0.4985   0.0000   1.4900        
+  2  1  4  74.2929  31.0883   2.6184   0.0000   0.0755   0.0000   1.0500        
+  1  2  4   0.0000   0.0019   6.3000   0.0000   0.0000   0.0000   1.0400        
+  1  3  1  73.5312  44.7275   0.7354   0.0000   3.0000   0.0000   1.0684        
+  1  3  3  79.4761  36.3701   1.8943   0.0000   0.7351  67.6777   3.0000        
+  1  3  4  82.4890  31.4554   0.9953   0.0000   1.6310   0.0000   1.0783        
+  3  3  3  80.7324  30.4554   0.9953   0.0000   1.6310  50.0000   1.0783        
+  3  3  4  84.3637  31.4554   0.9953   0.0000   1.6310   0.0000   1.0783        
+  4  3  4  89.7071  31.4554   0.9953   0.0000   1.6310   0.0000   1.1519        
+  1  3  2  70.1880  20.9562   0.3864   0.0000   0.0050   0.0000   1.6924        
+  2  3  3  75.6935  25.0000   2.0000   0.0000   1.0000   0.0000   1.1680        
+  2  3  4  75.6201  18.7919   0.9833   0.0000   0.1218   0.0000   1.0500        
+  2  3  2  77.3619   4.8342   7.1628   0.0000   2.9933   0.0000   1.5948        
+  1  4  1  66.0330  22.0295   1.4442   0.0000   1.6777   0.0000   1.0500        
+  1  4  3 103.3204  33.0381   0.5787   0.0000   1.6777   0.0000   1.0500        
+  1  4  4 104.1335   8.6043   1.6495   0.0000   1.6777   0.0000   1.0500        
+  3  4  3  74.1978  42.1786   1.7845 -18.0069   1.6777   0.0000   1.0500        
+  3  4  4  74.8600  43.7354   1.1572  -0.9193   1.6777   0.0000   1.0500        
+  4  4  4  75.0538  14.8267   5.2794   0.0000   1.6777   0.0000   1.0500        
+  1  4  2  69.1106  25.5067   1.1003   0.0000   0.0222   0.0000   1.0369        
+  2  4  3  81.3686  40.0712   2.2396   0.0000   0.0222   0.0000   1.0369        
+  2  4  4  83.0104  43.4766   1.5328   0.0000   0.0222   0.0000   1.0500        
+  2  4  2  70.8687  12.0168   5.0132   0.0000   0.0222   0.0000   1.1243        
+  1  2  3   0.0000  25.0000   3.0000   0.0000   1.0000   0.0000   1.0400        
+  1  2  4   0.0000   0.0019   6.0000   0.0000   0.0000   0.0000   1.0400        
+  1  2  5   0.0000   0.0019   6.0000   0.0000   0.0000   0.0000   1.0400        
+  3  2  3   0.0000   4.4124   2.5758   0.0000   0.0000   0.0000   2.9884        
+  3  2  4   0.0000   0.0019   6.0000   0.0000   0.0000   0.0000   1.0400        
+  4  2  4   0.0000   0.0019   6.0000   0.0000   0.0000   0.0000   1.0400        
+  2  2  3   0.0000  15.0000   2.2970   0.0000   0.0000   0.0000   1.3268        
+  2  2  4   0.0000   0.0019   6.0000   0.0000   0.0000   0.0000   1.0400        
+  1  1  5  74.9397  25.0560   1.8787   0.1463   0.0559   0.0000   1.0400        
+  1  5  1  86.9521  36.9951   2.0903   0.1463   0.0559   0.0000   1.0400        
+  2  1  5  74.9397  25.0560   1.8787   0.0000   0.0000   0.0000   1.0400        
+  1  5  2  86.1791  36.9951   2.0903   0.0000   0.0000   0.0000   1.0400        
+  1  5  5  85.3644  36.9951   2.0903   0.1463   0.0559   0.0000   1.0400        
+  2  5  2  93.1959  36.9951   2.0903   0.0000   0.0000   0.0000   1.0400        
+  2  5  5  84.3331  36.9951   2.0903   0.0000   0.0000   0.0000   1.0400        
+  6  6  6  69.3456  21.7361   1.4283   0.0000  -0.2101   0.0000   1.3241        
+  2  6  6  75.6168  21.5317   1.0435   0.0000   2.5179   0.0000   1.0400        
+  2  6  2  78.3939  20.9772   0.8630   0.0000   2.8421   0.0000   1.0400        
+  3  6  6  70.3016  15.4081   1.3267   0.0000   2.1459   0.0000   1.0400        
+  2  6  3  73.8232  16.6592   3.7425   0.0000   0.8613   0.0000   1.0400        
+  3  6  3  90.0344   7.7656   1.7264   0.0000   0.7689   0.0000   1.0400        
+  6  3  6  22.1715   3.6615   0.3160   0.0000   4.1125   0.0000   1.0400        
+  2  3  6  83.7634   5.6693   2.7780   0.0000   1.6982   0.0000   1.0400        
+  3  3  6  73.4663  25.0761   0.9143   0.0000   2.2466   0.0000   1.0400        
+  2  2  6   0.0000  47.1300   6.0000   0.0000   1.6371   0.0000   1.0400        
+  6  2  6   0.0000  31.5209   6.0000   0.0000   1.6371   0.0000   1.0400        
+  3  2  6   0.0000  31.0427   4.5625   0.0000   1.6371   0.0000   1.0400        
+  2  2  5   0.0000   0.0019   6.0000   0.0000   0.0000   0.0000   1.0400        
+  3 11  3  62.4906  31.5023   1.3328   0.0000   2.8731   0.0000   1.0794        
+ 11  3 11  31.0790  19.3435   0.4919   0.0000   2.9625   0.0000   3.0000        
+  3  3 11 100.0000  14.7642   7.0000   0.0000   1.0585   0.0000   1.1599        
+  1  3 11  60.7895  13.6681   0.7546   0.0000   2.1747   0.0000   2.9508        
+  2  3 11 100.0000   5.0000   1.4335   0.0000   1.2363   0.0000   5.0000        
+  3 12 12  23.8296   8.9089   7.0000   0.0000   1.0000   0.0000   2.8891        
+  3 12  3  87.0764  19.4489   2.5080   0.0000   2.6056   0.0000   3.0000        
+ 12  3 12  72.7369  13.7522   5.0243   0.0000   2.9700   0.0000   1.5506        
+  3  3 12  68.8771  10.5000   2.5500   0.0000   2.5729   0.0000   1.5892        
+  2  3 12  99.5836   5.4142   2.2105   0.0000   1.0513   0.0000   1.1000        
+  1  3 12  90.0000  12.1772   2.2055   0.0000   1.9064   0.0000   2.6056        
+  1  1 12  71.1708  32.6379   0.4516   0.0000   2.1609   0.0000   1.1000        
+  1 12  3  90.0000  45.0000   0.9335   0.0000   0.2140   0.0000   1.4846        
+  1 12  1  87.6204  45.0000   1.2740   0.0000   1.1519   0.0000   1.1000        
+  3  1 12  54.7020   3.2967   7.0000   0.0000   2.0408   0.0000   2.4032        
+  2 12  3  90.0000  28.2099   1.8036   0.0000   1.5461   0.0000   1.2304        
+  2 12  2  90.0000  36.3001   0.6409   0.0000   3.0000   0.0000   1.7755        
+  1 12  2  89.5835  45.0000   0.8465   0.0000   1.2118   0.0000   2.2282        
+  2  1 12  68.7714  22.9669   0.4631   0.0000   2.4269   0.0000   1.4680        
+  2  2 12   0.0000  30.2898   3.9181   0.0000   0.9914   0.0000   1.3121        
+  3  2 12   0.0000   1.0000   4.1706   0.0000   1.0100   0.0000   1.1000        
+  1  2 12   0.0000   1.0000   3.9722   0.0000   1.0075   0.0000   1.2984        
+  3 13  3  73.6321  10.6453   2.7693   0.0000   0.0500   0.0000   1.9906        
+ 13  3 13 100.0000   5.0270   5.0000   0.0000   1.2768   0.0000   2.0630        
+  3  3 13  52.3127  40.0000   1.1362   0.0000   1.5100   0.0000   1.1000        
+  3 13 13  66.6695   0.0036   3.2646   0.0000   0.0581   0.0000   1.3741        
+  2  3 13 100.0000   3.8927   8.0000   0.0000   2.0000   0.0000   1.1000        
+  1  3 13  96.6040   9.4537   8.0000   0.0000   0.3285   0.0000   4.0000        
+  3 14  3  79.6765  50.0000   1.0502  -0.0016   0.1000   0.0000   1.4583        
+ 14  3 14  20.2100  37.6165   0.6059   0.0000   0.1531   0.0000   2.0586        
+  3  3 14  38.5570  11.9307   0.9911   0.0000   0.8422   0.0000   1.0500        
+  3 14 14   5.8342   0.0724   0.1000   0.0000   0.5490   0.0000   1.7839        
+  2  3 14  81.8943   7.2820   2.1490   0.0000   0.6873   0.0000   3.2184        
+  1  3 14  75.5634   8.3289   1.0236   0.0000   2.0875   0.0000   1.0500        
+ 12  3 14  30.0000   5.0000   0.5000   0.0000   0.5000   0.0000   1.2500        
+ 47    ! Nr of torsions;at1;at2;at3;at4;;V1;V2;V3;V2(BO);vconj;n.u;n            
+  1  1  1  1  -0.2500  34.7453   0.0288  -6.3507  -1.6000   0.0000   0.0000     
+  1  1  1  2  -0.2500  29.2131   0.2945  -4.9581  -2.1802   0.0000   0.0000     
+  2  1  1  2  -0.2500  31.2081   0.4539  -4.8923  -2.2677   0.0000   0.0000     
+  1  1  1  3  -0.3495  22.2142  -0.2959  -2.5000  -1.9066   0.0000   0.0000     
+  2  1  1  3   0.0646  24.3195   0.6259  -3.9603  -1.0000   0.0000   0.0000     
+  3  1  1  3  -0.5456   5.5756   0.8433  -5.1924  -1.0180   0.0000   0.0000     
+  1  1  3  1   1.7555  27.9267   0.0072  -2.6533  -1.0000   0.0000   0.0000     
+  1  1  3  2  -1.4358  36.7830  -1.0000  -8.1821  -1.0000   0.0000   0.0000     
+  2  1  3  1  -1.3959  34.5053   0.7200  -2.5714  -2.1641   0.0000   0.0000     
+  2  1  3  2  -2.5000  70.0597   1.0000  -3.5539  -2.9929   0.0000   0.0000     
+  1  1  3  3   0.6852  11.2819  -0.4784  -2.5000  -2.1085   0.0000   0.0000     
+  2  1  3  3   0.1933  80.0000   1.0000  -4.0590  -3.0000   0.0000   0.0000     
+  3  1  3  1  -1.9889  76.4820  -0.1796  -3.8301  -3.0000   0.0000   0.0000     
+  3  1  3  2   0.2160  72.7707  -0.7087  -4.2100  -3.0000   0.0000   0.0000     
+  3  1  3  3  -2.5000  71.0772   0.2542  -3.1631  -3.0000   0.0000   0.0000     
+  1  3  3  1   2.5000  -0.6002   1.0000  -3.4297  -2.8858   0.0000   0.0000     
+  1  3  3  2  -2.5000  -3.3822   0.7004  -5.4467  -2.9586   0.0000   0.0000     
+  2  3  3  2   2.5000  -4.0000   0.9000  -2.5000  -1.0000   0.0000   0.0000     
+  1  3  3  3   1.2329  -4.0000   1.0000  -2.5000  -1.7479   0.0000   0.0000     
+  2  3  3  3   0.8302  -4.0000  -0.7763  -2.5000  -1.0000   0.0000   0.0000     
+  3  3  3  3  -2.5000  -4.0000   1.0000  -2.5000  -1.0000   0.0000   0.0000     
+  0  1  2  0   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000     
+  0  2  2  0   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000     
+  0  2  3  0   0.0000   0.1000   0.0200  -2.5415   0.0000   0.0000   0.0000     
+  0  1  1  0   0.0000  50.0000   0.3000  -4.0000  -2.0000   0.0000   0.0000     
+  0  3  3  0   0.5511  25.4150   1.1330  -5.1903  -1.0000   0.0000   0.0000     
+  0  1  4  0  -2.4242 128.1636   0.3739  -6.6098  -2.0000   0.0000   0.0000     
+  0  2  4  0   0.0000   0.1000   0.0200  -2.5415   0.0000   0.0000   0.0000     
+  0  3  4  0   1.4816  55.6641   0.0004  -7.0465  -2.7203   0.0000   0.0000     
+  0  4  4  0  -0.3244  27.7086   0.0039  -2.8272  -2.0000   0.0000   0.0000     
+  4  1  4  4  -5.5181   8.9706   0.0004  -6.1782  -2.0000   0.0000   0.0000     
+  0  1  5  0   3.3423  30.3435   0.0365  -2.7171   0.0000   0.0000   0.0000     
+  0  5  5  0  -0.0555 -42.7738   0.1515  -2.2056   0.0000   0.0000   0.0000     
+  0  2  5  0   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000   0.0000     
+  0  6  6  0   0.0000   0.0000   0.1200  -2.4426   0.0000   0.0000   0.0000     
+  0  2  6  0   0.0000   0.0000   0.1200  -2.4847   0.0000   0.0000   0.0000     
+  0  3  6  0   0.0000   0.0000   0.1200  -2.4703   0.0000   0.0000   0.0000     
+  2  1  3 14   1.6297  56.8132   0.3398  -2.6912  -2.1000   0.0000   0.0000     
+  1  1  3 14  -0.0427  13.4096   0.9351  -6.5245  -2.1000   0.0000   0.0000     
+  2  3 14  3   2.5000  11.6208   1.0000  -9.0000  -1.0000   0.0000   0.0000     
+  2  1  3 12  -0.2500  45.7639   0.3000  -3.5745  -2.1565   0.0000   0.0000     
+  1  1  3 12  -0.2500  69.1094   0.3000  -3.0983  -2.1565   0.0000   0.0000     
+  2  3 12  3  -0.4306   7.5000  -0.5000  -6.9948  -1.0000   0.0000   0.0000     
+  2  3 11  3   1.8627   9.7180  -1.0000  -7.2224  -1.0000   0.0000   0.0000     
+  1  3 11  3   2.5000  23.9443   1.0000  -3.2267  -1.0000   0.0000   0.0000     
+  1  1  3 11   0.9114  62.5039  -0.2389  -3.2976  -1.0000   0.0000   0.0000     
+  2  1  3 11   0.5000  35.0000   0.5000  -4.0000  -1.0000   0.0000   0.0000     
+  9    ! Nr of hydrogen bonds;at1;at2;at3;Rhb;Dehb;vhb1                         
+  3  2  3   2.1845  -2.3549   3.0582  19.1627                                   
+  3  2  4   2.0000  -6.0000   1.7976   3.0000                                   
+  4  2  3   1.2000  -2.0000   1.7976   3.0000                                   
+  4  2  4   1.2979  -6.0000   1.7976   3.0000                                   
+  3  2  5   1.5000  -2.0000   1.7976   3.0000                                   
+  4  2  5   1.5000  -2.0000   1.7976   3.0000                                   
+  5  2  3   1.5000  -2.0000   1.7976   3.0000                                   
+  5  2  4   1.5000  -2.0000   1.7976   3.0000                                   
+  5  2  5   1.5000  -2.0000   1.7976   3.0000                                   
diff --git a/data/benchmarks/water/water_300.fgeo b/data/benchmarks/water/water_300.fgeo
new file mode 100644
index 0000000000000000000000000000000000000000..b4d7e7765cf5061519dc02d01fb9c7647fb0a2bb
--- /dev/null
+++ b/data/benchmarks/water/water_300.fgeo
@@ -0,0 +1,609 @@
+XTLGRF 200
+DESCRP Water_100                                                   
+CRYSTX    14.42850   14.42850   14.42850   90.00000   90.00000   90.00000
+FORMAT ATOM   (a6,1x,i5,1x,a5,1x,a3,1x,a1,1x,a5,3f10.5,1x,a5,i3,i2,1x,f8.5)
+HETATM     1 H                  11.89039  10.92609  13.94080    H   1 1  0.00000
+HETATM     2 O                  11.48174  10.12782  14.41875    O   1 1  0.00000
+HETATM     3 H                  10.83496   9.73341  13.78769    H   1 1  0.00000
+HETATM     4 H                   2.00369   4.19589   2.76519    H   1 1  0.00000
+HETATM     5 O                   1.16011   3.84526   2.27866    O   1 1  0.00000
+HETATM     6 H                   1.36455   3.81098   1.28883    H   1 1  0.00000
+HETATM     7 H                   2.59208   5.44424   7.09101    H   1 1  0.00000
+HETATM     8 O                   2.09827   6.13339   7.66317    O   1 1  0.00000
+HETATM     9 H                   1.62871   5.62610   8.41737    H   1 1  0.00000
+HETATM    10 H                  10.77665  13.12885   6.46024    H   1 1  0.00000
+HETATM    11 O                   9.94049  13.65955   6.69782    O   1 1  0.00000
+HETATM    12 H                   9.30917  13.61265   5.90504    H   1 1  0.00000
+HETATM    13 H                   2.77812  13.66632   0.84928    H   1 1  0.00000
+HETATM    14 O                   3.44667  12.93128   0.76392    O   1 1  0.00000
+HETATM    15 H                   2.93625  12.14380   0.38234    H   1 1  0.00000
+HETATM    16 H                  11.96547   7.16313   1.46613    H   1 1  0.00000
+HETATM    17 O                  11.07002   7.58925   1.53798    O   1 1  0.00000
+HETATM    18 H                  11.13244   8.44464   1.03346    H   1 1  0.00000
+HETATM    19 H                   0.48826   6.53943   1.34756    H   1 1  0.00000
+HETATM    20 O                  13.90379   6.37087   1.42594    O   1 1  0.00000
+HETATM    21 H                  13.74365   5.86742   2.31008    H   1 1  0.00000
+HETATM    22 H                   1.73785  11.50613   9.64343    H   1 1  0.00000
+HETATM    23 O                   1.63467  12.40948  10.06792    O   1 1  0.00000
+HETATM    24 H                   2.56767  12.80601  10.10735    H   1 1  0.00000
+HETATM    25 H                   8.26956   1.06471   5.41504    H   1 1  0.00000
+HETATM    26 O                   8.41719   2.01731   5.62797    O   1 1  0.00000
+HETATM    27 H                   9.36905   2.19380   5.39265    H   1 1  0.00000
+HETATM    28 H                   1.74090  10.02796  13.76343    H   1 1  0.00000
+HETATM    29 O                   1.66864  10.87180  14.31579    O   1 1  0.00000
+HETATM    30 H                   1.27167  10.63186   0.81033    H   1 1  0.00000
+HETATM    31 H                   5.16473  13.77242   9.92069    H   1 1  0.00000
+HETATM    32 O                   4.36013  13.49093  10.41874    O   1 1  0.00000
+HETATM    33 H                   4.68160  12.78804  11.04842    H   1 1  0.00000
+HETATM    34 H                   6.53736  11.60845  12.46428    H   1 1  0.00000
+HETATM    35 O                   5.55387  11.48598  12.25238    O   1 1  0.00000
+HETATM    36 H                   5.15747  11.04335  13.06629    H   1 1  0.00000
+HETATM    37 H                   8.14741   2.86791  11.05447    H   1 1  0.00000
+HETATM    38 O                   8.16028   2.78040  12.09376    O   1 1  0.00000
+HETATM    39 H                   8.83238   2.07204  12.33207    H   1 1  0.00000
+HETATM    40 H                   9.13243  13.28408  11.90437    H   1 1  0.00000
+HETATM    41 O                   9.17499  14.15344  11.40697    O   1 1  0.00000
+HETATM    42 H                   8.47943  14.08442  10.69137    H   1 1  0.00000
+HETATM    43 H                   0.23824   9.02352   5.11355    H   1 1  0.00000
+HETATM    44 O                   0.51369   9.32541   6.06124    O   1 1  0.00000
+HETATM    45 H                   0.84094  10.30394   5.97479    H   1 1  0.00000
+HETATM    46 H                   0.09101   3.13071  13.57319    H   1 1  0.00000
+HETATM    47 O                  13.56428   2.69387  13.61661    O   1 1  0.00000
+HETATM    48 H                  12.92379   3.40106  14.06682    H   1 1  0.00000
+HETATM    49 H                   9.63841  11.82533   2.11177    H   1 1  0.00000
+HETATM    50 O                  10.26788  11.06697   2.42009    O   1 1  0.00000
+HETATM    51 H                  10.69893  10.66171   1.58403    H   1 1  0.00000
+HETATM    52 H                  10.74626   7.44686  11.50115    H   1 1  0.00000
+HETATM    53 O                   9.79623   7.18644  11.66065    O   1 1  0.00000
+HETATM    54 H                   9.30045   7.56287  10.88320    H   1 1  0.00000
+HETATM    55 H                  13.01689   2.78112   7.45509    H   1 1  0.00000
+HETATM    56 O                  12.28307   3.26915   7.98177    O   1 1  0.00000
+HETATM    57 H                  11.56497   2.56216   8.18970    H   1 1  0.00000
+HETATM    58 H                  12.66697   4.88417   7.29077    H   1 1  0.00000
+HETATM    59 O                  13.21411   5.52272   6.73282    O   1 1  0.00000
+HETATM    60 H                  14.14902   5.44787   7.04520    H   1 1  0.00000
+HETATM    61 H                  11.38359   3.39062   4.48995    H   1 1  0.00000
+HETATM    62 O                  11.34236   2.45884   4.79607    O   1 1  0.00000
+HETATM    63 H                  12.05673   2.39111   5.47306    H   1 1  0.00000
+HETATM    64 H                   6.85249   0.56423   9.32327    H   1 1  0.00000
+HETATM    65 O                   6.34917   1.38247   8.97670    O   1 1  0.00000
+HETATM    66 H                   5.70280   1.71432   9.72323    H   1 1  0.00000
+HETATM    67 H                  11.94067   7.63769  10.15591    H   1 1  0.00000
+HETATM    68 O                  11.51696   7.19535   9.32255    O   1 1  0.00000
+HETATM    69 H                  11.77027   7.76911   8.52063    H   1 1  0.00000
+HETATM    70 H                  12.24961   5.61397  10.13643    H   1 1  0.00000
+HETATM    71 O                  12.57578   4.67787  10.34072    O   1 1  0.00000
+HETATM    72 H                  12.27019   4.09070   9.57171    H   1 1  0.00000
+HETATM    73 H                   8.66021  10.79653  13.31026    H   1 1  0.00000
+HETATM    74 O                   8.44665  11.73295  13.07620    O   1 1  0.00000
+HETATM    75 H                   8.55918  12.24801  13.92151    H   1 1  0.00000
+HETATM    76 H                  13.81202   6.42247  13.89470    H   1 1  0.00000
+HETATM    77 O                  13.75234   6.37764  12.88706    O   1 1  0.00000
+HETATM    78 H                   0.20696   6.00962  12.53920    H   1 1  0.00000
+HETATM    79 H                  14.41565   1.91356   5.22336    H   1 1  0.00000
+HETATM    80 O                  14.12787   1.99859   6.19450    O   1 1  0.00000
+HETATM    81 H                   0.50741   1.75914   6.77205    H   1 1  0.00000
+HETATM    82 H                   2.65222   7.58488   1.63764    H   1 1  0.00000
+HETATM    83 O                   2.16243   7.05951   0.95996    O   1 1  0.00000
+HETATM    84 H                   2.27479   7.53302   0.08402    H   1 1  0.00000
+HETATM    85 H                   4.22555  10.30181   1.00717    H   1 1  0.00000
+HETATM    86 O                   4.63173   9.84242   0.20578    O   1 1  0.00000
+HETATM    87 H                   5.09583   8.98645   0.55128    H   1 1  0.00000
+HETATM    88 H                   8.06792   5.31674   2.33112    H   1 1  0.00000
+HETATM    89 O                   7.67942   4.81987   3.14372    O   1 1  0.00000
+HETATM    90 H                   6.88313   4.30135   2.78128    H   1 1  0.00000
+HETATM    91 H                   4.76362   6.55180  10.61359    H   1 1  0.00000
+HETATM    92 O                   4.26951   7.35097  11.05470    O   1 1  0.00000
+HETATM    93 H                   4.80754   8.19983  10.81557    H   1 1  0.00000
+HETATM    94 H                   9.09728   5.25359  14.38060    H   1 1  0.00000
+HETATM    95 O                   8.77589   5.81488   0.74034    O   1 1  0.00000
+HETATM    96 H                   9.58872   6.29919   1.09909    H   1 1  0.00000
+HETATM    97 H                  12.62840  12.11080   7.36656    H   1 1  0.00000
+HETATM    98 O                  12.17397  11.84284   6.46343    O   1 1  0.00000
+HETATM    99 H                  12.68322  12.33234   5.72117    H   1 1  0.00000
+HETATM   100 H                   0.19738  11.57590   2.47466    H   1 1  0.00000
+HETATM   101 O                   0.88137  10.81831   2.52408    O   1 1  0.00000
+HETATM   102 H                   0.40626  10.00473   2.92615    H   1 1  0.00000
+HETATM   103 H                  13.40477   2.07917  11.85112    H   1 1  0.00000
+HETATM   104 O                  13.55904   1.96064  10.85681    O   1 1  0.00000
+HETATM   105 H                  13.14848   2.76093  10.41951    H   1 1  0.00000
+HETATM   106 H                   8.27115   8.65789   9.07103    H   1 1  0.00000
+HETATM   107 O                   8.31413   9.27161   9.89500    O   1 1  0.00000
+HETATM   108 H                   8.74814  10.13297   9.59304    H   1 1  0.00000
+HETATM   109 H                   9.85409   5.52272  12.16165    H   1 1  0.00000
+HETATM   110 O                   9.85917   4.69413  12.75730    O   1 1  0.00000
+HETATM   111 H                   9.08190   4.06564  12.45541    H   1 1  0.00000
+HETATM   112 H                  14.39397   0.55202   0.82526    H   1 1  0.00000
+HETATM   113 O                   0.91952   0.43505   1.14447    O   1 1  0.00000
+HETATM   114 H                   1.52392   0.87187   0.45884    H   1 1  0.00000
+HETATM   115 H                   2.05632  12.60340   5.79305    H   1 1  0.00000
+HETATM   116 O                   1.41753  11.95049   5.40231    O   1 1  0.00000
+HETATM   117 H                   0.58673  12.46645   5.14123    H   1 1  0.00000
+HETATM   118 H                  12.14835  13.34252  13.62805    H   1 1  0.00000
+HETATM   119 O                  12.28908  12.52844  13.06205    O   1 1  0.00000
+HETATM   120 H                  13.30036  12.53701  12.78694    H   1 1  0.00000
+HETATM   121 H                   0.81366   9.32043   8.01780    H   1 1  0.00000
+HETATM   122 O                   1.13979   9.71399   8.88977    O   1 1  0.00000
+HETATM   123 H                   2.18904   9.72618   8.84144    H   1 1  0.00000
+HETATM   124 H                   1.81360   1.40383   8.92085    H   1 1  0.00000
+HETATM   125 O                   1.94895   1.58486   7.94172    O   1 1  0.00000
+HETATM   126 H                   2.56857   0.85742   7.59855    H   1 1  0.00000
+HETATM   127 H                   0.76917  12.45379  11.57984    H   1 1  0.00000
+HETATM   128 O                   0.49224  12.70790  12.54216    O   1 1  0.00000
+HETATM   129 H                   0.89610  12.00672  13.17809    H   1 1  0.00000
+HETATM   130 H                   5.23158  13.39868   0.95434    H   1 1  0.00000
+HETATM   131 O                   6.15196  13.82988   1.02055    O   1 1  0.00000
+HETATM   132 H                   6.16237   0.02548   1.85055    H   1 1  0.00000
+HETATM   133 H                   9.91205   2.09353   1.97805    H   1 1  0.00000
+HETATM   134 O                   9.14874   1.79349   1.43329    O   1 1  0.00000
+HETATM   135 H                   9.50205   1.65131   0.48434    H   1 1  0.00000
+HETATM   136 H                   3.27030   8.07868  12.33666    H   1 1  0.00000
+HETATM   137 O                   2.73916   8.66348  12.99656    O   1 1  0.00000
+HETATM   138 H                   3.43555   9.21028  13.55851    H   1 1  0.00000
+HETATM   139 H                   4.24205   3.90024   6.47552    H   1 1  0.00000
+HETATM   140 O                   3.25526   4.03544   6.25931    O   1 1  0.00000
+HETATM   141 H                   2.76772   3.31085   6.75414    H   1 1  0.00000
+HETATM   142 H                   3.43196   5.86576   3.55292    H   1 1  0.00000
+HETATM   143 O                   3.36741   4.81754   3.58883    O   1 1  0.00000
+HETATM   144 H                   3.25822   4.54718   4.56837    H   1 1  0.00000
+HETATM   145 H                  12.47169  11.81170   1.67083    H   1 1  0.00000
+HETATM   146 O                  13.14726  12.47789   1.95018    O   1 1  0.00000
+HETATM   147 H                  12.86787  13.33368   1.52019    H   1 1  0.00000
+HETATM   148 H                   5.60289  10.33165  11.00952    H   1 1  0.00000
+HETATM   149 O                   5.66604   9.62676  10.24192    O   1 1  0.00000
+HETATM   150 H                   6.68344   9.52392   9.99908    H   1 1  0.00000
+HETATM   151 H                   1.07569   3.50115   9.04134    H   1 1  0.00000
+HETATM   152 O                   1.00933   4.39227   9.47022    O   1 1  0.00000
+HETATM   153 H                   0.06626   4.45870   9.82333    H   1 1  0.00000
+HETATM   154 H                   9.17464   4.50946   8.06792    H   1 1  0.00000
+HETATM   155 O                   9.63665   5.18629   7.50549    O   1 1  0.00000
+HETATM   156 H                  10.53807   5.25505   7.88760    H   1 1  0.00000
+HETATM   157 H                   2.42390   1.13686  12.49844    H   1 1  0.00000
+HETATM   158 O                   2.62728   1.35071  13.49995    O   1 1  0.00000
+HETATM   159 H                   3.65727   1.39497  13.62891    H   1 1  0.00000
+HETATM   160 H                   5.58543  11.06656   3.37438    H   1 1  0.00000
+HETATM   161 O                   6.52786  11.07601   3.74582    O   1 1  0.00000
+HETATM   162 H                   6.96072  10.16387   3.52888    H   1 1  0.00000
+HETATM   163 H                   5.75892   0.92744   0.02950    H   1 1  0.00000
+HETATM   164 O                   5.41032   1.74970  13.97681    O   1 1  0.00000
+HETATM   165 H                   5.57938   2.53643   0.15187    H   1 1  0.00000
+HETATM   166 H                   0.83973   1.24334  10.94338    H   1 1  0.00000
+HETATM   167 O                   1.81422   0.94864  10.86160    O   1 1  0.00000
+HETATM   168 H                   1.79631  14.38910  10.69378    H   1 1  0.00000
+HETATM   169 H                   4.03756  10.28085   7.65991    H   1 1  0.00000
+HETATM   170 O                   3.61193  10.48950   8.55659    O   1 1  0.00000
+HETATM   171 H                   4.29281  10.21883   9.27183    H   1 1  0.00000
+HETATM   172 H                   3.43672   7.87126   4.59465    H   1 1  0.00000
+HETATM   173 O                   3.80096   7.50021   3.69692    O   1 1  0.00000
+HETATM   174 H                   4.80483   7.33897   3.82232    H   1 1  0.00000
+HETATM   175 H                  12.02930   0.72051   1.33337    H   1 1  0.00000
+HETATM   176 O                  12.60821   0.55206   0.48954    O   1 1  0.00000
+HETATM   177 H                  12.86509   1.46628   0.06692    H   1 1  0.00000
+HETATM   178 H                   1.34769   9.16811  11.91401    H   1 1  0.00000
+HETATM   179 O                   0.56747   9.65758  11.47911    O   1 1  0.00000
+HETATM   180 H                   0.81936   9.87754  10.50416    H   1 1  0.00000
+HETATM   181 H                   6.03239   4.82296   6.91503    H   1 1  0.00000
+HETATM   182 O                   5.74143   5.72775   7.29721    O   1 1  0.00000
+HETATM   183 H                   6.56677   6.32681   7.38693    H   1 1  0.00000
+HETATM   184 H                   4.73160   4.15122   2.36384    H   1 1  0.00000
+HETATM   185 O                   5.36549   3.82499   1.64463    O   1 1  0.00000
+HETATM   186 H                   5.17861   4.39064   0.82089    H   1 1  0.00000
+HETATM   187 H                   3.05821   7.48069   6.77948    H   1 1  0.00000
+HETATM   188 O                   3.03363   8.33728   6.21851    O   1 1  0.00000
+HETATM   189 H                   2.03858   8.65233   6.21954    H   1 1  0.00000
+HETATM   190 H                   7.24232   4.17528   9.74710    H   1 1  0.00000
+HETATM   191 O                   7.93383   3.48013   9.47235    O   1 1  0.00000
+HETATM   192 H                   7.41742   2.68958   9.03311    H   1 1  0.00000
+HETATM   193 H                   7.48963  12.63741   4.25126    H   1 1  0.00000
+HETATM   194 O                   7.97386  13.46400   4.60074    O   1 1  0.00000
+HETATM   195 H                   7.78036  14.20300   3.94151    H   1 1  0.00000
+HETATM   196 H                  10.09388   5.39527   5.47866    H   1 1  0.00000
+HETATM   197 O                  10.23218   5.48150   4.48622    O   1 1  0.00000
+HETATM   198 H                   9.40927   5.09201   4.05966    H   1 1  0.00000
+HETATM   199 H                  11.52459   0.29082   8.85727    H   1 1  0.00000
+HETATM   200 O                  10.79256   0.93792   8.69578    O   1 1  0.00000
+HETATM   201 H                  10.21192   0.52052   7.98526    H   1 1  0.00000
+HETATM   202 H                   5.02736   6.19983  14.25401    H   1 1  0.00000
+HETATM   203 O                   4.61928   5.39559  13.82455    O   1 1  0.00000
+HETATM   204 H                   3.92541   5.73916  13.21647    H   1 1  0.00000
+HETATM   205 H                   0.66905   2.37969   3.19318    H   1 1  0.00000
+HETATM   206 O                   0.68130   1.44315   3.64583    O   1 1  0.00000
+HETATM   207 H                   0.85761   0.77186   2.87502    H   1 1  0.00000
+HETATM   208 H                   2.14331   2.88155  13.71270    H   1 1  0.00000
+HETATM   209 O                   1.55668   3.74857  13.81249    O   1 1  0.00000
+HETATM   210 H                   1.80389   4.39447  13.05488    H   1 1  0.00000
+HETATM   211 H                   3.80947  11.86524   2.35434    H   1 1  0.00000
+HETATM   212 O                   3.72881  11.02019   2.89805    O   1 1  0.00000
+HETATM   213 H                   2.73463  10.85134   2.98810    H   1 1  0.00000
+HETATM   214 H                   6.72425   7.74089   0.38322    H   1 1  0.00000
+HETATM   215 O                   5.91930   7.54689   0.93249    O   1 1  0.00000
+HETATM   216 H                   6.23405   7.66104   1.87266    H   1 1  0.00000
+HETATM   217 H                   4.67747   9.17283   6.19407    H   1 1  0.00000
+HETATM   218 O                   5.39248   9.86086   6.38993    O   1 1  0.00000
+HETATM   219 H                   5.81115  10.10066   5.51420    H   1 1  0.00000
+HETATM   220 H                   3.58159   0.18094   5.79959    H   1 1  0.00000
+HETATM   221 O                   3.63587  14.06250   6.66500    O   1 1  0.00000
+HETATM   222 H                   4.55679  13.64021   6.70135    H   1 1  0.00000
+HETATM   223 H                  11.73577  10.36876  10.56570    H   1 1  0.00000
+HETATM   224 O                  11.39605  11.30282  10.54358    O   1 1  0.00000
+HETATM   225 H                  11.54975  11.66785  11.47432    H   1 1  0.00000
+HETATM   226 H                  13.72841   7.83234   3.16944    H   1 1  0.00000
+HETATM   227 O                  13.82911   8.72514   3.61566    O   1 1  0.00000
+HETATM   228 H                  12.88214   9.04934   3.87110    H   1 1  0.00000
+HETATM   229 H                   8.46345   8.53548   3.75163    H   1 1  0.00000
+HETATM   230 O                   7.57755   8.54833   3.23416    O   1 1  0.00000
+HETATM   231 H                   6.96474   7.89503   3.71654    H   1 1  0.00000
+HETATM   232 H                   6.90928  12.84404   5.99623    H   1 1  0.00000
+HETATM   233 O                   6.24801  12.67890   6.73644    O   1 1  0.00000
+HETATM   234 H                   6.02437  11.69712   6.72862    H   1 1  0.00000
+HETATM   235 H                  11.98796   5.61338   3.80579    H   1 1  0.00000
+HETATM   236 O                  12.99256   5.54308   3.90996    O   1 1  0.00000
+HETATM   237 H                  13.17263   5.47439   4.91630    H   1 1  0.00000
+HETATM   238 H                  13.03193   9.09182   6.74126    H   1 1  0.00000
+HETATM   239 O                  12.26053   9.14095   7.37437    O   1 1  0.00000
+HETATM   240 H                  11.95723  10.09113   7.35317    H   1 1  0.00000
+HETATM   241 H                  13.90163  14.35090   4.53187    H   1 1  0.00000
+HETATM   242 O                  13.55738  13.41348   4.64314    O   1 1  0.00000
+HETATM   243 H                  13.44697  13.03567   3.70528    H   1 1  0.00000
+HETATM   244 H                   3.50501   2.25792   5.04576    H   1 1  0.00000
+HETATM   245 O                   3.39440   1.42921   4.50681    O   1 1  0.00000
+HETATM   246 H                   2.46668   1.46934   4.10714    H   1 1  0.00000
+HETATM   247 H                   8.18514  12.66063   9.29387    H   1 1  0.00000
+HETATM   248 O                   7.33201  13.25093   9.36894    O   1 1  0.00000
+HETATM   249 H                   6.85581  13.16395   8.46402    H   1 1  0.00000
+HETATM   250 H                   5.10608   4.38135  10.25702    H   1 1  0.00000
+HETATM   251 O                   5.64376   5.19259   9.97293    O   1 1  0.00000
+HETATM   252 H                   5.53995   5.29785   8.94546    H   1 1  0.00000
+HETATM   253 H                   6.73954   2.71646   6.09963    H   1 1  0.00000
+HETATM   254 O                   5.87849   2.98427   6.57622    O   1 1  0.00000
+HETATM   255 H                   5.86319   2.44790   7.43512    H   1 1  0.00000
+HETATM   256 H                  11.73884  10.41650   5.32398    H   1 1  0.00000
+HETATM   257 O                  11.48991   9.82602   4.52549    O   1 1  0.00000
+HETATM   258 H                  11.01906  10.42292   3.80910    H   1 1  0.00000
+HETATM   259 H                   9.98869   0.74828  12.70919    H   1 1  0.00000
+HETATM   260 O                  10.45096   1.32190  13.41525    O   1 1  0.00000
+HETATM   261 H                  11.23829   0.79573  13.76925    H   1 1  0.00000
+HETATM   262 H                  12.57274   5.12722   0.73865    H   1 1  0.00000
+HETATM   263 O                  11.97733   4.34276   0.52257    O   1 1  0.00000
+HETATM   264 H                  11.30969   4.65317  14.26280    H   1 1  0.00000
+HETATM   265 H                   7.91645  13.32420   1.41435    H   1 1  0.00000
+HETATM   266 O                   8.89756  13.11488   1.27891    O   1 1  0.00000
+HETATM   267 H                   9.40012  13.90279   1.63934    H   1 1  0.00000
+HETATM   268 H                   8.22909   7.97370   6.78129    H   1 1  0.00000
+HETATM   269 O                   8.03768   7.50121   7.65443    O   1 1  0.00000
+HETATM   270 H                   8.69777   6.71196   7.68665    H   1 1  0.00000
+HETATM   271 H                   9.83539   7.18529   4.88263    H   1 1  0.00000
+HETATM   272 O                   9.52308   8.09659   5.21710    O   1 1  0.00000
+HETATM   273 H                  10.30574   8.75913   5.08506    H   1 1  0.00000
+HETATM   274 H                  13.49921   8.96203  11.34116    H   1 1  0.00000
+HETATM   275 O                  12.63568   8.40513  11.53312    O   1 1  0.00000
+HETATM   276 H                  12.92814   7.62639  12.15442    H   1 1  0.00000
+HETATM   277 H                   3.82276   1.98527  10.69374    H   1 1  0.00000
+HETATM   278 O                   4.62133   2.57412  10.64115    O   1 1  0.00000
+HETATM   279 H                   5.00790   2.52878  11.54974    H   1 1  0.00000
+HETATM   280 H                   8.24503   7.73191  14.26742    H   1 1  0.00000
+HETATM   281 O                   8.48165   8.53718  13.73491    O   1 1  0.00000
+HETATM   282 H                   9.09857   8.20948  13.00581    H   1 1  0.00000
+HETATM   283 H                   6.22243   1.92792   3.01889    H   1 1  0.00000
+HETATM   284 O                   6.85380   1.17056   2.90634    O   1 1  0.00000
+HETATM   285 H                   7.64778   1.53548   2.36934    H   1 1  0.00000
+HETATM   286 H                  11.03544   1.05483   3.44376    H   1 1  0.00000
+HETATM   287 O                  11.19683   0.23796   2.88529    O   1 1  0.00000
+HETATM   288 H                  11.87807  14.14934   3.38075    H   1 1  0.00000
+HETATM   289 H                  12.41022  12.31821   9.55828    H   1 1  0.00000
+HETATM   290 O                  13.03689  12.77834   8.89232    O   1 1  0.00000
+HETATM   291 H                  13.96832  12.65154   9.23065    H   1 1  0.00000
+HETATM   292 H                   2.58384   6.43112  11.64218    H   1 1  0.00000
+HETATM   293 O                   1.86887   5.75557  11.87050    O   1 1  0.00000
+HETATM   294 H                   1.67388   5.25208  11.01503    H   1 1  0.00000
+HETATM   295 H                   9.65594  12.26476   7.83473    H   1 1  0.00000
+HETATM   296 O                   9.35766  11.64534   8.59312    O   1 1  0.00000
+HETATM   297 H                  10.13727  11.56682   9.24787    H   1 1  0.00000
+HETATM   298 H                   6.84946   5.86221   4.23984    H   1 1  0.00000
+HETATM   299 O                   6.18241   6.56040   4.62743    O   1 1  0.00000
+HETATM   300 H                   5.83272   6.21373   5.51267    H   1 1  0.00000
+FORMAT CONECT (a6,12i6)
+CONECT     1     2
+CONECT     2     1     3
+CONECT     3     2
+CONECT     4     5   143
+CONECT     5     4     6
+CONECT     6     5
+CONECT     7     8
+CONECT     8     7     9
+CONECT     9     8
+CONECT    10    11
+CONECT    11    10    12
+CONECT    12    11
+CONECT    13    14
+CONECT    14    13    15
+CONECT    15    14
+CONECT    16    17
+CONECT    17    16    18
+CONECT    18    17
+CONECT    19    20
+CONECT    20    19    21
+CONECT    21    20
+CONECT    22    23
+CONECT    23    22    24
+CONECT    24    23
+CONECT    25    26
+CONECT    26    25    27
+CONECT    27    26
+CONECT    28    29
+CONECT    29    28    30
+CONECT    30    29
+CONECT    31    32
+CONECT    32    31    33
+CONECT    33    32
+CONECT    34    35
+CONECT    35    34    36   148
+CONECT    36    35
+CONECT    37    38   191
+CONECT    38    37    39   111
+CONECT    39    38
+CONECT    40    41
+CONECT    41    40    42
+CONECT    42    41
+CONECT    43    44
+CONECT    44    43    45   189
+CONECT    45    44
+CONECT    46    47   209
+CONECT    47    46    48   177
+CONECT    48    47   263
+CONECT    49    50   266
+CONECT    50    49    51   258
+CONECT    51    50
+CONECT    52    53
+CONECT    53    52    54
+CONECT    54    53
+CONECT    55    56
+CONECT    56    55    57
+CONECT    57    56
+CONECT    58    59
+CONECT    59    58    60
+CONECT    60    59
+CONECT    61    62
+CONECT    62    61    63
+CONECT    63    62
+CONECT    64    65
+CONECT    65    64    66   192
+CONECT    66    65   278
+CONECT    67    68
+CONECT    68    67    69
+CONECT    69    68
+CONECT    70    71
+CONECT    71    70    72
+CONECT    72    71
+CONECT    73    74
+CONECT    74    73    75
+CONECT    75    74
+CONECT    76    77
+CONECT    77    76    78   276
+CONECT    78    77
+CONECT    79    80
+CONECT    80    79    81
+CONECT    81    80
+CONECT    82    83
+CONECT    83    82    84
+CONECT    84    83
+CONECT    85    86
+CONECT    86    85    87
+CONECT    87    86   215
+CONECT    88    89
+CONECT    89    88    90
+CONECT    90    89
+CONECT    91    92
+CONECT    92    91    93
+CONECT    93    92
+CONECT    94    95
+CONECT    95    94    96
+CONECT    96    95
+CONECT    97    98   290
+CONECT    98    97    99
+CONECT    99    98
+CONECT   100   101
+CONECT   101   100   102
+CONECT   102   101
+CONECT   103   104
+CONECT   104   103   105
+CONECT   105   104
+CONECT   106   107
+CONECT   107   106   108   150
+CONECT   108   107
+CONECT   109   110
+CONECT   110   109   111
+CONECT   111    38   110
+CONECT   112   113
+CONECT   113   112   114
+CONECT   114   113
+CONECT   115   116
+CONECT   116   115   117
+CONECT   117   116
+CONECT   118   119
+CONECT   119   118   120
+CONECT   120   119   128
+CONECT   121   122
+CONECT   122   121   123   180
+CONECT   123   122   170
+CONECT   124   125
+CONECT   125   124   126
+CONECT   126   125
+CONECT   127   128
+CONECT   128   120   127   129
+CONECT   129   128
+CONECT   130   131
+CONECT   131   130   132
+CONECT   132   131   284
+CONECT   133   134
+CONECT   134   133   135
+CONECT   135   134
+CONECT   136   137
+CONECT   137   136   138
+CONECT   138   137
+CONECT   139   140
+CONECT   140   139   141
+CONECT   141   140
+CONECT   142   143   173
+CONECT   143     4   142   144
+CONECT   144   143
+CONECT   145   146
+CONECT   146   145   147
+CONECT   147   146
+CONECT   148    35   149
+CONECT   149   148   150
+CONECT   150   107   149
+CONECT   151   152
+CONECT   152   151   153
+CONECT   153   152
+CONECT   154   155
+CONECT   155   154   156
+CONECT   156   155
+CONECT   157   158
+CONECT   158   157   159   208
+CONECT   159   158
+CONECT   160   161
+CONECT   161   160   162
+CONECT   162   161
+CONECT   163   164
+CONECT   164   163   165
+CONECT   165   164
+CONECT   166   167
+CONECT   167   166   168
+CONECT   168   167
+CONECT   169   170
+CONECT   170   123   169   171
+CONECT   171   170
+CONECT   172   173
+CONECT   173   142   172   174
+CONECT   174   173
+CONECT   175   176
+CONECT   176   175   177
+CONECT   177    47   176
+CONECT   178   179
+CONECT   179   178   180   274
+CONECT   180   122   179
+CONECT   181   182
+CONECT   182   181   183   252
+CONECT   183   182
+CONECT   184   185
+CONECT   185   184   186
+CONECT   186   185
+CONECT   187   188
+CONECT   188   187   189
+CONECT   189    44   188
+CONECT   190   191
+CONECT   191    37   190   192
+CONECT   192    65   191
+CONECT   193   194
+CONECT   194   193   195
+CONECT   195   194
+CONECT   196   197
+CONECT   197   196   198
+CONECT   198   197
+CONECT   199   200
+CONECT   200   199   201
+CONECT   201   200
+CONECT   202   203
+CONECT   203   202   204
+CONECT   204   203
+CONECT   205   206
+CONECT   206   205   207
+CONECT   207   206
+CONECT   208   158   209
+CONECT   209    46   208   210
+CONECT   210   209
+CONECT   211   212
+CONECT   212   211   213
+CONECT   213   212
+CONECT   214   215
+CONECT   215    87   214   216
+CONECT   216   215
+CONECT   217   218
+CONECT   218   217   219
+CONECT   219   218
+CONECT   220   221
+CONECT   221   220   222
+CONECT   222   221
+CONECT   223   224
+CONECT   224   223   225
+CONECT   225   224
+CONECT   226   227
+CONECT   227   226   228
+CONECT   228   227
+CONECT   229   230
+CONECT   230   229   231
+CONECT   231   230
+CONECT   232   233
+CONECT   233   232   234
+CONECT   234   233
+CONECT   235   236
+CONECT   236   235   237
+CONECT   237   236
+CONECT   238   239
+CONECT   239   238   240
+CONECT   240   239
+CONECT   241   242
+CONECT   242   241   243
+CONECT   243   242
+CONECT   244   245
+CONECT   245   244   246
+CONECT   246   245
+CONECT   247   248   296
+CONECT   248   247   249
+CONECT   249   248
+CONECT   250   251
+CONECT   251   250   252
+CONECT   252   182   251
+CONECT   253   254
+CONECT   254   253   255
+CONECT   255   254
+CONECT   256   257
+CONECT   257   256   258   273
+CONECT   258    50   257
+CONECT   259   260
+CONECT   260   259   261
+CONECT   261   260
+CONECT   262   263
+CONECT   263    48   262   264
+CONECT   264   263
+CONECT   265   266
+CONECT   266    49   265   267
+CONECT   267   266
+CONECT   268   269
+CONECT   269   268   270
+CONECT   270   269
+CONECT   271   272
+CONECT   272   271   273
+CONECT   273   257   272
+CONECT   274   179   275
+CONECT   275   274   276
+CONECT   276    77   275
+CONECT   277   278
+CONECT   278    66   277   279
+CONECT   279   278
+CONECT   280   281
+CONECT   281   280   282
+CONECT   282   281
+CONECT   283   284
+CONECT   284   132   283   285
+CONECT   285   284
+CONECT   286   287
+CONECT   287   286   288
+CONECT   288   287
+CONECT   289   290
+CONECT   290    97   289   291
+CONECT   291   290
+CONECT   292   293
+CONECT   293   292   294
+CONECT   294   293
+CONECT   295   296
+CONECT   296   247   295   297
+CONECT   297   296
+CONECT   298   299
+CONECT   299   298   300
+CONECT   300   299
+UNIT ENERGY   kcal
+ENERGY      -26333.776190
+END
+
diff --git a/data/benchmarks/water/water_300.pdb b/data/benchmarks/water/water_300.pdb
new file mode 100644
index 0000000000000000000000000000000000000000..bd5fc228cab21a89aebf90a19728c4d02b7dc753
--- /dev/null
+++ b/data/benchmarks/water/water_300.pdb
@@ -0,0 +1,601 @@
+CRYST1  14.4285  14.4285  14.4285  90.00  90.00  90.00               0
+ATOM      1    H REX     1    11.8903910.9260913.94080  1.00  0.00      0    H  
+ATOM      2    O REX     1    11.4817410.1278214.41875  1.00  0.00      0    O  
+ATOM      3    H REX     1    10.83496 9.7334113.78769  1.00  0.00      0    H  
+ATOM      4    H REX     1     2.00369 4.19589 2.76519  1.00  0.00      0    H  
+ATOM      5    O REX     1     1.16011 3.84526 2.27866  1.00  0.00      0    O  
+ATOM      6    H REX     1     1.36455 3.81098 1.28883  1.00  0.00      0    H  
+ATOM      7    H REX     1     2.59208 5.44424 7.09101  1.00  0.00      0    H  
+ATOM      8    O REX     1     2.09827 6.13339 7.66317  1.00  0.00      0    O  
+ATOM      9    H REX     1     1.62871 5.62610 8.41737  1.00  0.00      0    H  
+ATOM     10    H REX     1    10.7766513.12885 6.46024  1.00  0.00      0    H  
+ATOM     11    O REX     1     9.9404913.65955 6.69782  1.00  0.00      0    O  
+ATOM     12    H REX     1     9.3091713.61265 5.90504  1.00  0.00      0    H  
+ATOM     13    H REX     1     2.7781213.66632 0.84928  1.00  0.00      0    H  
+ATOM     14    O REX     1     3.4466712.93128 0.76392  1.00  0.00      0    O  
+ATOM     15    H REX     1     2.9362512.14380 0.38234  1.00  0.00      0    H  
+ATOM     16    H REX     1    11.96547 7.16313 1.46613  1.00  0.00      0    H  
+ATOM     17    O REX     1    11.07002 7.58925 1.53798  1.00  0.00      0    O  
+ATOM     18    H REX     1    11.13244 8.44464 1.03346  1.00  0.00      0    H  
+ATOM     19    H REX     1     0.48826 6.53943 1.34756  1.00  0.00      0    H  
+ATOM     20    O REX     1    13.90379 6.37087 1.42594  1.00  0.00      0    O  
+ATOM     21    H REX     1    13.74365 5.86742 2.31008  1.00  0.00      0    H  
+ATOM     22    H REX     1     1.7378511.50613 9.64343  1.00  0.00      0    H  
+ATOM     23    O REX     1     1.6346712.4094810.06792  1.00  0.00      0    O  
+ATOM     24    H REX     1     2.5676712.8060110.10735  1.00  0.00      0    H  
+ATOM     25    H REX     1     8.26956 1.06471 5.41504  1.00  0.00      0    H  
+ATOM     26    O REX     1     8.41719 2.01731 5.62797  1.00  0.00      0    O  
+ATOM     27    H REX     1     9.36905 2.19380 5.39265  1.00  0.00      0    H  
+ATOM     28    H REX     1     1.7409010.0279613.76343  1.00  0.00      0    H  
+ATOM     29    O REX     1     1.6686410.8718014.31579  1.00  0.00      0    O  
+ATOM     30    H REX     1     1.2716710.63186 0.81033  1.00  0.00      0    H  
+ATOM     31    H REX     1     5.1647313.77242 9.92069  1.00  0.00      0    H  
+ATOM     32    O REX     1     4.3601313.4909310.41874  1.00  0.00      0    O  
+ATOM     33    H REX     1     4.6816012.7880411.04842  1.00  0.00      0    H  
+ATOM     34    H REX     1     6.5373611.6084512.46428  1.00  0.00      0    H  
+ATOM     35    O REX     1     5.5538711.4859812.25238  1.00  0.00      0    O  
+ATOM     36    H REX     1     5.1574711.0433513.06629  1.00  0.00      0    H  
+ATOM     37    H REX     1     8.14741 2.8679111.05447  1.00  0.00      0    H  
+ATOM     38    O REX     1     8.16028 2.7804012.09376  1.00  0.00      0    O  
+ATOM     39    H REX     1     8.83238 2.0720412.33207  1.00  0.00      0    H  
+ATOM     40    H REX     1     9.1324313.2840811.90437  1.00  0.00      0    H  
+ATOM     41    O REX     1     9.1749914.1534411.40697  1.00  0.00      0    O  
+ATOM     42    H REX     1     8.4794314.0844210.69137  1.00  0.00      0    H  
+ATOM     43    H REX     1     0.23824 9.02352 5.11355  1.00  0.00      0    H  
+ATOM     44    O REX     1     0.51369 9.32541 6.06124  1.00  0.00      0    O  
+ATOM     45    H REX     1     0.8409410.30394 5.97479  1.00  0.00      0    H  
+ATOM     46    H REX     1     0.09101 3.1307113.57319  1.00  0.00      0    H  
+ATOM     47    O REX     1    13.56428 2.6938713.61661  1.00  0.00      0    O  
+ATOM     48    H REX     1    12.92379 3.4010614.06682  1.00  0.00      0    H  
+ATOM     49    H REX     1     9.6384111.82533 2.11177  1.00  0.00      0    H  
+ATOM     50    O REX     1    10.2678811.06697 2.42009  1.00  0.00      0    O  
+ATOM     51    H REX     1    10.6989310.66171 1.58403  1.00  0.00      0    H  
+ATOM     52    H REX     1    10.74626 7.4468611.50115  1.00  0.00      0    H  
+ATOM     53    O REX     1     9.79623 7.1864411.66065  1.00  0.00      0    O  
+ATOM     54    H REX     1     9.30045 7.5628710.88320  1.00  0.00      0    H  
+ATOM     55    H REX     1    13.01689 2.78112 7.45509  1.00  0.00      0    H  
+ATOM     56    O REX     1    12.28307 3.26915 7.98177  1.00  0.00      0    O  
+ATOM     57    H REX     1    11.56497 2.56216 8.18970  1.00  0.00      0    H  
+ATOM     58    H REX     1    12.66697 4.88417 7.29077  1.00  0.00      0    H  
+ATOM     59    O REX     1    13.21411 5.52272 6.73282  1.00  0.00      0    O  
+ATOM     60    H REX     1    14.14902 5.44787 7.04520  1.00  0.00      0    H  
+ATOM     61    H REX     1    11.38359 3.39062 4.48995  1.00  0.00      0    H  
+ATOM     62    O REX     1    11.34236 2.45884 4.79607  1.00  0.00      0    O  
+ATOM     63    H REX     1    12.05673 2.39111 5.47306  1.00  0.00      0    H  
+ATOM     64    H REX     1     6.85249 0.56423 9.32327  1.00  0.00      0    H  
+ATOM     65    O REX     1     6.34917 1.38247 8.97670  1.00  0.00      0    O  
+ATOM     66    H REX     1     5.70280 1.71432 9.72323  1.00  0.00      0    H  
+ATOM     67    H REX     1    11.94067 7.6376910.15591  1.00  0.00      0    H  
+ATOM     68    O REX     1    11.51696 7.19535 9.32255  1.00  0.00      0    O  
+ATOM     69    H REX     1    11.77027 7.76911 8.52063  1.00  0.00      0    H  
+ATOM     70    H REX     1    12.24961 5.6139710.13643  1.00  0.00      0    H  
+ATOM     71    O REX     1    12.57578 4.6778710.34072  1.00  0.00      0    O  
+ATOM     72    H REX     1    12.27019 4.09070 9.57171  1.00  0.00      0    H  
+ATOM     73    H REX     1     8.6602110.7965313.31026  1.00  0.00      0    H  
+ATOM     74    O REX     1     8.4466511.7329513.07620  1.00  0.00      0    O  
+ATOM     75    H REX     1     8.5591812.2480113.92151  1.00  0.00      0    H  
+ATOM     76    H REX     1    13.81202 6.4224713.89470  1.00  0.00      0    H  
+ATOM     77    O REX     1    13.75234 6.3776412.88706  1.00  0.00      0    O  
+ATOM     78    H REX     1     0.20696 6.0096212.53920  1.00  0.00      0    H  
+ATOM     79    H REX     1    14.41565 1.91356 5.22336  1.00  0.00      0    H  
+ATOM     80    O REX     1    14.12787 1.99859 6.19450  1.00  0.00      0    O  
+ATOM     81    H REX     1     0.50741 1.75914 6.77205  1.00  0.00      0    H  
+ATOM     82    H REX     1     2.65222 7.58488 1.63764  1.00  0.00      0    H  
+ATOM     83    O REX     1     2.16243 7.05951 0.95996  1.00  0.00      0    O  
+ATOM     84    H REX     1     2.27479 7.53302 0.08402  1.00  0.00      0    H  
+ATOM     85    H REX     1     4.2255510.30181 1.00717  1.00  0.00      0    H  
+ATOM     86    O REX     1     4.63173 9.84242 0.20578  1.00  0.00      0    O  
+ATOM     87    H REX     1     5.09583 8.98645 0.55128  1.00  0.00      0    H  
+ATOM     88    H REX     1     8.06792 5.31674 2.33112  1.00  0.00      0    H  
+ATOM     89    O REX     1     7.67942 4.81987 3.14372  1.00  0.00      0    O  
+ATOM     90    H REX     1     6.88313 4.30135 2.78128  1.00  0.00      0    H  
+ATOM     91    H REX     1     4.76362 6.5518010.61359  1.00  0.00      0    H  
+ATOM     92    O REX     1     4.26951 7.3509711.05470  1.00  0.00      0    O  
+ATOM     93    H REX     1     4.80754 8.1998310.81557  1.00  0.00      0    H  
+ATOM     94    H REX     1     9.09728 5.2535914.38060  1.00  0.00      0    H  
+ATOM     95    O REX     1     8.77589 5.81488 0.74034  1.00  0.00      0    O  
+ATOM     96    H REX     1     9.58872 6.29919 1.09909  1.00  0.00      0    H  
+ATOM     97    H REX     1    12.6284012.11080 7.36656  1.00  0.00      0    H  
+ATOM     98    O REX     1    12.1739711.84284 6.46343  1.00  0.00      0    O  
+ATOM     99    H REX     1    12.6832212.33234 5.72117  1.00  0.00      0    H  
+ATOM    100    H REX     1     0.1973811.57590 2.47466  1.00  0.00      0    H  
+ATOM    101    O REX     1     0.8813710.81831 2.52408  1.00  0.00      0    O  
+ATOM    102    H REX     1     0.4062610.00473 2.92615  1.00  0.00      0    H  
+ATOM    103    H REX     1    13.40477 2.0791711.85112  1.00  0.00      0    H  
+ATOM    104    O REX     1    13.55904 1.9606410.85681  1.00  0.00      0    O  
+ATOM    105    H REX     1    13.14848 2.7609310.41951  1.00  0.00      0    H  
+ATOM    106    H REX     1     8.27115 8.65789 9.07103  1.00  0.00      0    H  
+ATOM    107    O REX     1     8.31413 9.27161 9.89500  1.00  0.00      0    O  
+ATOM    108    H REX     1     8.7481410.13297 9.59304  1.00  0.00      0    H  
+ATOM    109    H REX     1     9.85409 5.5227212.16165  1.00  0.00      0    H  
+ATOM    110    O REX     1     9.85917 4.6941312.75730  1.00  0.00      0    O  
+ATOM    111    H REX     1     9.08190 4.0656412.45541  1.00  0.00      0    H  
+ATOM    112    H REX     1    14.39397 0.55202 0.82526  1.00  0.00      0    H  
+ATOM    113    O REX     1     0.91952 0.43505 1.14447  1.00  0.00      0    O  
+ATOM    114    H REX     1     1.52392 0.87187 0.45884  1.00  0.00      0    H  
+ATOM    115    H REX     1     2.0563212.60340 5.79305  1.00  0.00      0    H  
+ATOM    116    O REX     1     1.4175311.95049 5.40231  1.00  0.00      0    O  
+ATOM    117    H REX     1     0.5867312.46645 5.14123  1.00  0.00      0    H  
+ATOM    118    H REX     1    12.1483513.3425213.62805  1.00  0.00      0    H  
+ATOM    119    O REX     1    12.2890812.5284413.06205  1.00  0.00      0    O  
+ATOM    120    H REX     1    13.3003612.5370112.78694  1.00  0.00      0    H  
+ATOM    121    H REX     1     0.81366 9.32043 8.01780  1.00  0.00      0    H  
+ATOM    122    O REX     1     1.13979 9.71399 8.88977  1.00  0.00      0    O  
+ATOM    123    H REX     1     2.18904 9.72618 8.84144  1.00  0.00      0    H  
+ATOM    124    H REX     1     1.81360 1.40383 8.92085  1.00  0.00      0    H  
+ATOM    125    O REX     1     1.94895 1.58486 7.94172  1.00  0.00      0    O  
+ATOM    126    H REX     1     2.56857 0.85742 7.59855  1.00  0.00      0    H  
+ATOM    127    H REX     1     0.7691712.4537911.57984  1.00  0.00      0    H  
+ATOM    128    O REX     1     0.4922412.7079012.54216  1.00  0.00      0    O  
+ATOM    129    H REX     1     0.8961012.0067213.17809  1.00  0.00      0    H  
+ATOM    130    H REX     1     5.2315813.39868 0.95434  1.00  0.00      0    H  
+ATOM    131    O REX     1     6.1519613.82988 1.02055  1.00  0.00      0    O  
+ATOM    132    H REX     1     6.16237 0.02548 1.85055  1.00  0.00      0    H  
+ATOM    133    H REX     1     9.91205 2.09353 1.97805  1.00  0.00      0    H  
+ATOM    134    O REX     1     9.14874 1.79349 1.43329  1.00  0.00      0    O  
+ATOM    135    H REX     1     9.50205 1.65131 0.48434  1.00  0.00      0    H  
+ATOM    136    H REX     1     3.27030 8.0786812.33666  1.00  0.00      0    H  
+ATOM    137    O REX     1     2.73916 8.6634812.99656  1.00  0.00      0    O  
+ATOM    138    H REX     1     3.43555 9.2102813.55851  1.00  0.00      0    H  
+ATOM    139    H REX     1     4.24205 3.90024 6.47552  1.00  0.00      0    H  
+ATOM    140    O REX     1     3.25526 4.03544 6.25931  1.00  0.00      0    O  
+ATOM    141    H REX     1     2.76772 3.31085 6.75414  1.00  0.00      0    H  
+ATOM    142    H REX     1     3.43196 5.86576 3.55292  1.00  0.00      0    H  
+ATOM    143    O REX     1     3.36741 4.81754 3.58883  1.00  0.00      0    O  
+ATOM    144    H REX     1     3.25822 4.54718 4.56837  1.00  0.00      0    H  
+ATOM    145    H REX     1    12.4716911.81170 1.67083  1.00  0.00      0    H  
+ATOM    146    O REX     1    13.1472612.47789 1.95018  1.00  0.00      0    O  
+ATOM    147    H REX     1    12.8678713.33368 1.52019  1.00  0.00      0    H  
+ATOM    148    H REX     1     5.6028910.3316511.00952  1.00  0.00      0    H  
+ATOM    149    O REX     1     5.66604 9.6267610.24192  1.00  0.00      0    O  
+ATOM    150    H REX     1     6.68344 9.52392 9.99908  1.00  0.00      0    H  
+ATOM    151    H REX     1     1.07569 3.50115 9.04134  1.00  0.00      0    H  
+ATOM    152    O REX     1     1.00933 4.39227 9.47022  1.00  0.00      0    O  
+ATOM    153    H REX     1     0.06626 4.45870 9.82333  1.00  0.00      0    H  
+ATOM    154    H REX     1     9.17464 4.50946 8.06792  1.00  0.00      0    H  
+ATOM    155    O REX     1     9.63665 5.18629 7.50549  1.00  0.00      0    O  
+ATOM    156    H REX     1    10.53807 5.25505 7.88760  1.00  0.00      0    H  
+ATOM    157    H REX     1     2.42390 1.1368612.49844  1.00  0.00      0    H  
+ATOM    158    O REX     1     2.62728 1.3507113.49995  1.00  0.00      0    O  
+ATOM    159    H REX     1     3.65727 1.3949713.62891  1.00  0.00      0    H  
+ATOM    160    H REX     1     5.5854311.06656 3.37438  1.00  0.00      0    H  
+ATOM    161    O REX     1     6.5278611.07601 3.74582  1.00  0.00      0    O  
+ATOM    162    H REX     1     6.9607210.16387 3.52888  1.00  0.00      0    H  
+ATOM    163    H REX     1     5.75892 0.92744 0.02950  1.00  0.00      0    H  
+ATOM    164    O REX     1     5.41032 1.7497013.97681  1.00  0.00      0    O  
+ATOM    165    H REX     1     5.57938 2.53643 0.15187  1.00  0.00      0    H  
+ATOM    166    H REX     1     0.83973 1.2433410.94338  1.00  0.00      0    H  
+ATOM    167    O REX     1     1.81422 0.9486410.86160  1.00  0.00      0    O  
+ATOM    168    H REX     1     1.7963114.3891010.69378  1.00  0.00      0    H  
+ATOM    169    H REX     1     4.0375610.28085 7.65991  1.00  0.00      0    H  
+ATOM    170    O REX     1     3.6119310.48950 8.55659  1.00  0.00      0    O  
+ATOM    171    H REX     1     4.2928110.21883 9.27183  1.00  0.00      0    H  
+ATOM    172    H REX     1     3.43672 7.87126 4.59465  1.00  0.00      0    H  
+ATOM    173    O REX     1     3.80096 7.50021 3.69692  1.00  0.00      0    O  
+ATOM    174    H REX     1     4.80483 7.33897 3.82232  1.00  0.00      0    H  
+ATOM    175    H REX     1    12.02930 0.72051 1.33337  1.00  0.00      0    H  
+ATOM    176    O REX     1    12.60821 0.55206 0.48954  1.00  0.00      0    O  
+ATOM    177    H REX     1    12.86509 1.46628 0.06692  1.00  0.00      0    H  
+ATOM    178    H REX     1     1.34769 9.1681111.91401  1.00  0.00      0    H  
+ATOM    179    O REX     1     0.56747 9.6575811.47911  1.00  0.00      0    O  
+ATOM    180    H REX     1     0.81936 9.8775410.50416  1.00  0.00      0    H  
+ATOM    181    H REX     1     6.03239 4.82296 6.91503  1.00  0.00      0    H  
+ATOM    182    O REX     1     5.74143 5.72775 7.29721  1.00  0.00      0    O  
+ATOM    183    H REX     1     6.56677 6.32681 7.38693  1.00  0.00      0    H  
+ATOM    184    H REX     1     4.73160 4.15122 2.36384  1.00  0.00      0    H  
+ATOM    185    O REX     1     5.36549 3.82499 1.64463  1.00  0.00      0    O  
+ATOM    186    H REX     1     5.17861 4.39064 0.82089  1.00  0.00      0    H  
+ATOM    187    H REX     1     3.05821 7.48069 6.77948  1.00  0.00      0    H  
+ATOM    188    O REX     1     3.03363 8.33728 6.21851  1.00  0.00      0    O  
+ATOM    189    H REX     1     2.03858 8.65233 6.21954  1.00  0.00      0    H  
+ATOM    190    H REX     1     7.24232 4.17528 9.74710  1.00  0.00      0    H  
+ATOM    191    O REX     1     7.93383 3.48013 9.47235  1.00  0.00      0    O  
+ATOM    192    H REX     1     7.41742 2.68958 9.03311  1.00  0.00      0    H  
+ATOM    193    H REX     1     7.4896312.63741 4.25126  1.00  0.00      0    H  
+ATOM    194    O REX     1     7.9738613.46400 4.60074  1.00  0.00      0    O  
+ATOM    195    H REX     1     7.7803614.20300 3.94151  1.00  0.00      0    H  
+ATOM    196    H REX     1    10.09388 5.39527 5.47866  1.00  0.00      0    H  
+ATOM    197    O REX     1    10.23218 5.48150 4.48622  1.00  0.00      0    O  
+ATOM    198    H REX     1     9.40927 5.09201 4.05966  1.00  0.00      0    H  
+ATOM    199    H REX     1    11.52459 0.29082 8.85727  1.00  0.00      0    H  
+ATOM    200    O REX     1    10.79256 0.93792 8.69578  1.00  0.00      0    O  
+ATOM    201    H REX     1    10.21192 0.52052 7.98526  1.00  0.00      0    H  
+ATOM    202    H REX     1     5.02736 6.1998314.25401  1.00  0.00      0    H  
+ATOM    203    O REX     1     4.61928 5.3955913.82455  1.00  0.00      0    O  
+ATOM    204    H REX     1     3.92541 5.7391613.21647  1.00  0.00      0    H  
+ATOM    205    H REX     1     0.66905 2.37969 3.19318  1.00  0.00      0    H  
+ATOM    206    O REX     1     0.68130 1.44315 3.64583  1.00  0.00      0    O  
+ATOM    207    H REX     1     0.85761 0.77186 2.87502  1.00  0.00      0    H  
+ATOM    208    H REX     1     2.14331 2.8815513.71270  1.00  0.00      0    H  
+ATOM    209    O REX     1     1.55668 3.7485713.81249  1.00  0.00      0    O  
+ATOM    210    H REX     1     1.80389 4.3944713.05488  1.00  0.00      0    H  
+ATOM    211    H REX     1     3.8094711.86524 2.35434  1.00  0.00      0    H  
+ATOM    212    O REX     1     3.7288111.02019 2.89805  1.00  0.00      0    O  
+ATOM    213    H REX     1     2.7346310.85134 2.98810  1.00  0.00      0    H  
+ATOM    214    H REX     1     6.72425 7.74089 0.38322  1.00  0.00      0    H  
+ATOM    215    O REX     1     5.91930 7.54689 0.93249  1.00  0.00      0    O  
+ATOM    216    H REX     1     6.23405 7.66104 1.87266  1.00  0.00      0    H  
+ATOM    217    H REX     1     4.67747 9.17283 6.19407  1.00  0.00      0    H  
+ATOM    218    O REX     1     5.39248 9.86086 6.38993  1.00  0.00      0    O  
+ATOM    219    H REX     1     5.8111510.10066 5.51420  1.00  0.00      0    H  
+ATOM    220    H REX     1     3.58159 0.18094 5.79959  1.00  0.00      0    H  
+ATOM    221    O REX     1     3.6358714.06250 6.66500  1.00  0.00      0    O  
+ATOM    222    H REX     1     4.5567913.64021 6.70135  1.00  0.00      0    H  
+ATOM    223    H REX     1    11.7357710.3687610.56570  1.00  0.00      0    H  
+ATOM    224    O REX     1    11.3960511.3028210.54358  1.00  0.00      0    O  
+ATOM    225    H REX     1    11.5497511.6678511.47432  1.00  0.00      0    H  
+ATOM    226    H REX     1    13.72841 7.83234 3.16944  1.00  0.00      0    H  
+ATOM    227    O REX     1    13.82911 8.72514 3.61566  1.00  0.00      0    O  
+ATOM    228    H REX     1    12.88214 9.04934 3.87110  1.00  0.00      0    H  
+ATOM    229    H REX     1     8.46345 8.53548 3.75163  1.00  0.00      0    H  
+ATOM    230    O REX     1     7.57755 8.54833 3.23416  1.00  0.00      0    O  
+ATOM    231    H REX     1     6.96474 7.89503 3.71654  1.00  0.00      0    H  
+ATOM    232    H REX     1     6.9092812.84404 5.99623  1.00  0.00      0    H  
+ATOM    233    O REX     1     6.2480112.67890 6.73644  1.00  0.00      0    O  
+ATOM    234    H REX     1     6.0243711.69712 6.72862  1.00  0.00      0    H  
+ATOM    235    H REX     1    11.98796 5.61338 3.80579  1.00  0.00      0    H  
+ATOM    236    O REX     1    12.99256 5.54308 3.90996  1.00  0.00      0    O  
+ATOM    237    H REX     1    13.17263 5.47439 4.91630  1.00  0.00      0    H  
+ATOM    238    H REX     1    13.03193 9.09182 6.74126  1.00  0.00      0    H  
+ATOM    239    O REX     1    12.26053 9.14095 7.37437  1.00  0.00      0    O  
+ATOM    240    H REX     1    11.9572310.09113 7.35317  1.00  0.00      0    H  
+ATOM    241    H REX     1    13.9016314.35090 4.53187  1.00  0.00      0    H  
+ATOM    242    O REX     1    13.5573813.41348 4.64314  1.00  0.00      0    O  
+ATOM    243    H REX     1    13.4469713.03567 3.70528  1.00  0.00      0    H  
+ATOM    244    H REX     1     3.50501 2.25792 5.04576  1.00  0.00      0    H  
+ATOM    245    O REX     1     3.39440 1.42921 4.50681  1.00  0.00      0    O  
+ATOM    246    H REX     1     2.46668 1.46934 4.10714  1.00  0.00      0    H  
+ATOM    247    H REX     1     8.1851412.66063 9.29387  1.00  0.00      0    H  
+ATOM    248    O REX     1     7.3320113.25093 9.36894  1.00  0.00      0    O  
+ATOM    249    H REX     1     6.8558113.16395 8.46402  1.00  0.00      0    H  
+ATOM    250    H REX     1     5.10608 4.3813510.25702  1.00  0.00      0    H  
+ATOM    251    O REX     1     5.64376 5.19259 9.97293  1.00  0.00      0    O  
+ATOM    252    H REX     1     5.53995 5.29785 8.94546  1.00  0.00      0    H  
+ATOM    253    H REX     1     6.73954 2.71646 6.09963  1.00  0.00      0    H  
+ATOM    254    O REX     1     5.87849 2.98427 6.57622  1.00  0.00      0    O  
+ATOM    255    H REX     1     5.86319 2.44790 7.43512  1.00  0.00      0    H  
+ATOM    256    H REX     1    11.7388410.41650 5.32398  1.00  0.00      0    H  
+ATOM    257    O REX     1    11.48991 9.82602 4.52549  1.00  0.00      0    O  
+ATOM    258    H REX     1    11.0190610.42292 3.80910  1.00  0.00      0    H  
+ATOM    259    H REX     1     9.98869 0.7482812.70919  1.00  0.00      0    H  
+ATOM    260    O REX     1    10.45096 1.3219013.41525  1.00  0.00      0    O  
+ATOM    261    H REX     1    11.23829 0.7957313.76925  1.00  0.00      0    H  
+ATOM    262    H REX     1    12.57274 5.12722 0.73865  1.00  0.00      0    H  
+ATOM    263    O REX     1    11.97733 4.34276 0.52257  1.00  0.00      0    O  
+ATOM    264    H REX     1    11.30969 4.6531714.26280  1.00  0.00      0    H  
+ATOM    265    H REX     1     7.9164513.32420 1.41435  1.00  0.00      0    H  
+ATOM    266    O REX     1     8.8975613.11488 1.27891  1.00  0.00      0    O  
+ATOM    267    H REX     1     9.4001213.90279 1.63934  1.00  0.00      0    H  
+ATOM    268    H REX     1     8.22909 7.97370 6.78129  1.00  0.00      0    H  
+ATOM    269    O REX     1     8.03768 7.50121 7.65443  1.00  0.00      0    O  
+ATOM    270    H REX     1     8.69777 6.71196 7.68665  1.00  0.00      0    H  
+ATOM    271    H REX     1     9.83539 7.18529 4.88263  1.00  0.00      0    H  
+ATOM    272    O REX     1     9.52308 8.09659 5.21710  1.00  0.00      0    O  
+ATOM    273    H REX     1    10.30574 8.75913 5.08506  1.00  0.00      0    H  
+ATOM    274    H REX     1    13.49921 8.9620311.34116  1.00  0.00      0    H  
+ATOM    275    O REX     1    12.63568 8.4051311.53312  1.00  0.00      0    O  
+ATOM    276    H REX     1    12.92814 7.6263912.15442  1.00  0.00      0    H  
+ATOM    277    H REX     1     3.82276 1.9852710.69374  1.00  0.00      0    H  
+ATOM    278    O REX     1     4.62133 2.5741210.64115  1.00  0.00      0    O  
+ATOM    279    H REX     1     5.00790 2.5287811.54974  1.00  0.00      0    H  
+ATOM    280    H REX     1     8.24503 7.7319114.26742  1.00  0.00      0    H  
+ATOM    281    O REX     1     8.48165 8.5371813.73491  1.00  0.00      0    O  
+ATOM    282    H REX     1     9.09857 8.2094813.00581  1.00  0.00      0    H  
+ATOM    283    H REX     1     6.22243 1.92792 3.01889  1.00  0.00      0    H  
+ATOM    284    O REX     1     6.85380 1.17056 2.90634  1.00  0.00      0    O  
+ATOM    285    H REX     1     7.64778 1.53548 2.36934  1.00  0.00      0    H  
+ATOM    286    H REX     1    11.03544 1.05483 3.44376  1.00  0.00      0    H  
+ATOM    287    O REX     1    11.19683 0.23796 2.88529  1.00  0.00      0    O  
+ATOM    288    H REX     1    11.8780714.14934 3.38075  1.00  0.00      0    H  
+ATOM    289    H REX     1    12.4102212.31821 9.55828  1.00  0.00      0    H  
+ATOM    290    O REX     1    13.0368912.77834 8.89232  1.00  0.00      0    O  
+ATOM    291    H REX     1    13.9683212.65154 9.23065  1.00  0.00      0    H  
+ATOM    292    H REX     1     2.58384 6.4311211.64218  1.00  0.00      0    H  
+ATOM    293    O REX     1     1.86887 5.7555711.87050  1.00  0.00      0    O  
+ATOM    294    H REX     1     1.67388 5.2520811.01503  1.00  0.00      0    H  
+ATOM    295    H REX     1     9.6559412.26476 7.83473  1.00  0.00      0    H  
+ATOM    296    O REX     1     9.3576611.64534 8.59312  1.00  0.00      0    O  
+ATOM    297    H REX     1    10.1372711.56682 9.24787  1.00  0.00      0    H  
+ATOM    298    H REX     1     6.84946 5.86221 4.23984  1.00  0.00      0    H  
+ATOM    299    O REX     1     6.18241 6.56040 4.62743  1.00  0.00      0    O  
+ATOM    300    H REX     1     5.83272 6.21373 5.51267  1.00  0.00      0    H  
+CONECT    1    2
+CONECT    2    1    3
+CONECT    3    2
+CONECT    4    5  143
+CONECT    5    4    6
+CONECT    6    5
+CONECT    7    8
+CONECT    8    7    9
+CONECT    9    8
+CONECT   10   11
+CONECT   11   10   12
+CONECT   12   11
+CONECT   13   14
+CONECT   14   13   15
+CONECT   15   14
+CONECT   16   17
+CONECT   17   16   18
+CONECT   18   17
+CONECT   19   20
+CONECT   20   19   21
+CONECT   21   20
+CONECT   22   23
+CONECT   23   22   24
+CONECT   24   23
+CONECT   25   26
+CONECT   26   25   27
+CONECT   27   26
+CONECT   28   29
+CONECT   29   28   30
+CONECT   30   29
+CONECT   31   32
+CONECT   32   31   33
+CONECT   33   32
+CONECT   34   35
+CONECT   35   34   36  148
+CONECT   36   35
+CONECT   37   38  191
+CONECT   38   37   39  111
+CONECT   39   38
+CONECT   40   41
+CONECT   41   40   42
+CONECT   42   41
+CONECT   43   44
+CONECT   44   43   45  189
+CONECT   45   44
+CONECT   46   47  209
+CONECT   47   46   48  177
+CONECT   48   47  263
+CONECT   49   50  266
+CONECT   50   49   51  258
+CONECT   51   50
+CONECT   52   53
+CONECT   53   52   54
+CONECT   54   53
+CONECT   55   56
+CONECT   56   55   57
+CONECT   57   56
+CONECT   58   59
+CONECT   59   58   60
+CONECT   60   59
+CONECT   61   62
+CONECT   62   61   63
+CONECT   63   62
+CONECT   64   65
+CONECT   65   64   66  192
+CONECT   66   65  278
+CONECT   67   68
+CONECT   68   67   69
+CONECT   69   68
+CONECT   70   71
+CONECT   71   70   72
+CONECT   72   71
+CONECT   73   74
+CONECT   74   73   75
+CONECT   75   74
+CONECT   76   77
+CONECT   77   76   78  276
+CONECT   78   77
+CONECT   79   80
+CONECT   80   79   81
+CONECT   81   80
+CONECT   82   83
+CONECT   83   82   84
+CONECT   84   83
+CONECT   85   86
+CONECT   86   85   87
+CONECT   87   86  215
+CONECT   88   89
+CONECT   89   88   90
+CONECT   90   89
+CONECT   91   92
+CONECT   92   91   93
+CONECT   93   92
+CONECT   94   95
+CONECT   95   94   96
+CONECT   96   95
+CONECT   97   98  290
+CONECT   98   97   99
+CONECT   99   98
+CONECT  100  101
+CONECT  101  100  102
+CONECT  102  101
+CONECT  103  104
+CONECT  104  103  105
+CONECT  105  104
+CONECT  106  107
+CONECT  107  106  108  150
+CONECT  108  107
+CONECT  109  110
+CONECT  110  109  111
+CONECT  111   38  110
+CONECT  112  113
+CONECT  113  112  114
+CONECT  114  113
+CONECT  115  116
+CONECT  116  115  117
+CONECT  117  116
+CONECT  118  119
+CONECT  119  118  120
+CONECT  120  119  128
+CONECT  121  122
+CONECT  122  121  123  180
+CONECT  123  122  170
+CONECT  124  125
+CONECT  125  124  126
+CONECT  126  125
+CONECT  127  128
+CONECT  128  120  127  129
+CONECT  129  128
+CONECT  130  131
+CONECT  131  130  132
+CONECT  132  131  284
+CONECT  133  134
+CONECT  134  133  135
+CONECT  135  134
+CONECT  136  137
+CONECT  137  136  138
+CONECT  138  137
+CONECT  139  140
+CONECT  140  139  141
+CONECT  141  140
+CONECT  142  143  173
+CONECT  143    4  142  144
+CONECT  144  143
+CONECT  145  146
+CONECT  146  145  147
+CONECT  147  146
+CONECT  148   35  149
+CONECT  149  148  150
+CONECT  150  107  149
+CONECT  151  152
+CONECT  152  151  153
+CONECT  153  152
+CONECT  154  155
+CONECT  155  154  156
+CONECT  156  155
+CONECT  157  158
+CONECT  158  157  159  208
+CONECT  159  158
+CONECT  160  161
+CONECT  161  160  162
+CONECT  162  161
+CONECT  163  164
+CONECT  164  163  165
+CONECT  165  164
+CONECT  166  167
+CONECT  167  166  168
+CONECT  168  167
+CONECT  169  170
+CONECT  170  123  169  171
+CONECT  171  170
+CONECT  172  173
+CONECT  173  142  172  174
+CONECT  174  173
+CONECT  175  176
+CONECT  176  175  177
+CONECT  177   47  176
+CONECT  178  179
+CONECT  179  178  180  274
+CONECT  180  122  179
+CONECT  181  182
+CONECT  182  181  183  252
+CONECT  183  182
+CONECT  184  185
+CONECT  185  184  186
+CONECT  186  185
+CONECT  187  188
+CONECT  188  187  189
+CONECT  189   44  188
+CONECT  190  191
+CONECT  191   37  190  192
+CONECT  192   65  191
+CONECT  193  194
+CONECT  194  193  195
+CONECT  195  194
+CONECT  196  197
+CONECT  197  196  198
+CONECT  198  197
+CONECT  199  200
+CONECT  200  199  201
+CONECT  201  200
+CONECT  202  203
+CONECT  203  202  204
+CONECT  204  203
+CONECT  205  206
+CONECT  206  205  207
+CONECT  207  206
+CONECT  208  158  209
+CONECT  209   46  208  210
+CONECT  210  209
+CONECT  211  212
+CONECT  212  211  213
+CONECT  213  212
+CONECT  214  215
+CONECT  215   87  214  216
+CONECT  216  215
+CONECT  217  218
+CONECT  218  217  219
+CONECT  219  218
+CONECT  220  221
+CONECT  221  220  222
+CONECT  222  221
+CONECT  223  224
+CONECT  224  223  225
+CONECT  225  224
+CONECT  226  227
+CONECT  227  226  228
+CONECT  228  227
+CONECT  229  230
+CONECT  230  229  231
+CONECT  231  230
+CONECT  232  233
+CONECT  233  232  234
+CONECT  234  233
+CONECT  235  236
+CONECT  236  235  237
+CONECT  237  236
+CONECT  238  239
+CONECT  239  238  240
+CONECT  240  239
+CONECT  241  242
+CONECT  242  241  243
+CONECT  243  242
+CONECT  244  245
+CONECT  245  244  246
+CONECT  246  245
+CONECT  247  248  296
+CONECT  248  247  249
+CONECT  249  248
+CONECT  250  251
+CONECT  251  250  252
+CONECT  252  182  251
+CONECT  253  254
+CONECT  254  253  255
+CONECT  255  254
+CONECT  256  257
+CONECT  257  256  258  273
+CONECT  258   50  257
+CONECT  259  260
+CONECT  260  259  261
+CONECT  261  260
+CONECT  262  263
+CONECT  263   48  262  264
+CONECT  264  263
+CONECT  265  266
+CONECT  266   49  265  267
+CONECT  267  266
+CONECT  268  269
+CONECT  269  268  270
+CONECT  270  269
+CONECT  271  272
+CONECT  272  271  273
+CONECT  273  257  272
+CONECT  274  179  275
+CONECT  275  274  276
+CONECT  276   77  275
+CONECT  277  278
+CONECT  278   66  277  279
+CONECT  279  278
+CONECT  280  281
+CONECT  281  280  282
+CONECT  282  281
+CONECT  283  284
+CONECT  284  132  283  285
+CONECT  285  284
+CONECT  286  287
+CONECT  287  286  288
+CONECT  288  287
+CONECT  289  290
+CONECT  290   97  289  291
+CONECT  291  290
+CONECT  292  293
+CONECT  293  292  294
+CONECT  294  293
+CONECT  295  296
+CONECT  296  247  295  297
+CONECT  297  296
+CONECT  298  299
+CONECT  299  298  300
+CONECT  300  299
diff --git a/doc/README.txt b/doc/README.txt
index 18e6a73d8daedbd4232cce54e91f6ca004557cd8..50a7eb96b80e70e112f4c6a69abab32dcf57b3f5 100644
--- a/doc/README.txt
+++ b/doc/README.txt
@@ -21,7 +21,7 @@
 -------------------------------------------------------------------------
 -------------------------------------------------------------------------
 -------------   Instructions to get started with the codebase   ---------
--------------				PuReMD Package Release 1.0.0.0			 ---------
+-------------         PuReMD Package Release 1.0.0.0            ---------
 -------------------------------------------------------------------------
 -------------------------------------------------------------------------
 
@@ -29,10 +29,10 @@ VERSION - 1003
 
 1. Following package for Purdue Reactive Molecular Dynamics (PuReMD) 
 consists of the following implementations: 
-	a) serial (single cpu) implementation  -	sPuReMD
-	b) GPU (single GPU) implementation		 -	PuReMD-GPU
-	c) Parallel CPU (cluster of CPUs) implementation		- PuReMD
-	d) Parallel GPU (cluster of GPUs) implementation		- PG-PuReMD
+	a) serial (single cpu) implementation - sPuReMD
+	b) GPU (single GPU) implementation - PuReMD-GPU
+	c) Parallel CPU (cluster of CPUs) implementation - PuReMD
+	d) Parallel GPU (cluster of GPUs) implementation - PG-PuReMD
 
 2. In the current implemtations only limited ensembles are supported
 in the GPU implementations (CPU implementations supports a wide array of 
diff --git a/doc/src/manual.tex b/doc/src/manual.tex
index 257aaee3af94222f0593ab8989a504afcd69681e..069adabcd1e90252d3d035804bfe37edd6571602 100644
--- a/doc/src/manual.tex
+++ b/doc/src/manual.tex
@@ -1,21 +1,33 @@
-%%
-%% This is the PuReMD manual.
-%%
+%% PuReMD manual
+
 \documentclass{article}
 
 \usepackage{hyperref}
+\usepackage{minted}
+\usepackage{listings}
+
+\lstset{
+  basicstyle=\ttfamily,
+  mathescape
+}
 
 
-\title{PuReMD Manual \\
-  (Purdue Reactive Molecular Dynamics Program)}
+\title{Manual for PuReMD: \\
+  {\bf Pu}rdue {\bf Re}active {\bf M}olecular {\bf D}ynamics Program}
 
-\author{Hasan Metin Aktulga}
+\author{
+  H. Metin Aktulga \\
+  \texttt{hma@cse.msu.edu} \\
+  \and
+  Kurt A. O'Hearn \\
+  \texttt{ohearnku@msu.edu}
+}
 
 \begin{document}
 
 \maketitle
 
-This manual is for the two simulation programs which have
+This manual is for the PuReMD software which has
 come to existence as a result of our ReaxFF realization efforts. 
 Our initial efforts have led to the SerialReax program, which is a 
 sequential implementation for ReaxFF. SerialReax has helped us in verifying 
@@ -38,7 +50,7 @@ manual, we take PuReMD as our basis and describe it first. In a following
 section, we describe the extras that come with SerialReax which we hope 
 to incorporate into PuReMD in the near future.
 
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
 \section{Input Files}
 \label{sec:puremd_inp}
 
@@ -46,6 +58,7 @@ PuReMD expects 3 input files: a geometry file describing the system to be
 simulated, a force field file containing ReaxFF parameters and a control 
 file to manage simulation variables.
 
+
 \subsection{Geometry File}
 \label{sec:puremd_geo}
 
@@ -59,9 +72,9 @@ restart from an earlier simulation check-point using a restart file
 \label{sec:puremd_pdb}
 
 For detailed and up-to-date information on the PDB format, please visit 
-\url{http://www.wwpdb.org/docs.html}. Input files of various other formats 
+\href{http://www.wwpdb.org/docs.html}{here}. Input files of various other formats 
 can easily be converted to the pdb format using the freely available 
-OpenBabel software: (\url{http://openbabel.sourceforge.net/wiki/Main_Page}).
+\href{http://openbabel.sourceforge.net/wiki/Main_Page}{OpenBabel software}.
 
 In the geometry file, each atom is assigned a unique serial id to be
 able to identify atoms easily during the simulation. PDB format limits the 
@@ -79,16 +92,14 @@ follows: The first line describes the simulation box and the second line
 gives the total number of atoms in the system. These initial two lines need 
 to be followed by a single line for each atom describing it in detail. 
 Here is what a custom geo file looks like:
-\begin{verbatim}
+\begin{lstlisting}
 BOXGEO x_len y_len z_len alpha beta gamma
 N
 1 ele1  name1  x1 y1 z1
 2 ele2  name2  x2 y2 z2
-.
-.
-.
+$\vdots$
 N eleN  nameN  xN yN zN 
-\end{verbatim}
+\end{lstlisting}
 
 First three floating point numbers on the first line give the length of 
 the simulation box in x, y, z dimensions, the remaining ones are for the 
@@ -114,8 +125,9 @@ of 6 fields:
 
 Force field file contains the ReaxFF parameters to be used during 
 the simulation. Adri van Duin is the main developer and distributor 
-for Reax force fields, you can see his contact info at 
-\url{http://www.mne.psu.edu/vanduin/}.
+for Reax force fields, you can see his contact info
+\href{http://www.mne.psu.edu/vanduin}{here}.
+
 
 \subsection{Control File} 
 \label{sec:puremd_control}
@@ -123,21 +135,62 @@ for Reax force fields, you can see his contact info at
 Parameters in the control file allow the user to tune various simulation 
 options. Parameter names are case-sensitive but their order is not important 
 (except that {\tt ensemble\_type} needs to precede both {\tt p\_mass} and 
-{\tt pressure}). Described below are the fields that you might use in a 
-control file. If a parameter is missing from the control file, its default 
+{\tt pressure}). The table below lists all parameter names which the
+software recognizes, and each parameter is described in further detail
+below. If a parameter is missing from the control file, its default 
 value (as given in each parameter's description below) will be assumed.
 Each parameter must be specified in a single line, first token should be
 the parameter and the second token should be an appropriate value. 
 Comments regarding a parameter can be included after the value field 
 on the same line.
 
+\begin{center}
+\begin{tabular}{|c|c|} \hline
+  \hyperref[sec:simulation_name]{simulation\_name} & \hyperref[sec:ensemble_type]{ensemble\_type} \\ \hline
+  \hyperref[sec:nsteps]{nsteps} & \hyperref[sec:dt]{dt} \\ \hline
+  \hyperref[sec:reposition_atoms]{reposition\_atoms} & \hyperref[sec:restrict_bonds]{restrict\_bonds} \\ \hline
+  \hyperref[sec:tabulate_long_range]{tabulate\_long\_range} & \hyperref[sec:energy_update_freq]{energy\_update\_freq} \\ \hline
+  \hyperref[sec:remove_CoM_vel]{remove\_CoM\_vel} & \hyperref[sec:nbrhood_cutoff]{nbrhood\_cutoff} \\ \hline
+  \hyperref[sec:bond_graph_cutoff]{bond\_graph\_cutoff} & \hyperref[sec:thb_cutoff]{thb\_cutoff} \\ \hline
+  \hyperref[sec:hbond_cutoff]{hbond\_cutoff} & \hyperref[sec:charge_method]{charge\_method} \\ \hline
+  \hyperref[sec:cm_q_net]{cm\_q\_net} & \hyperref[sec:cm_solver_type]{cm\_solver\_type} \\ \hline
+  \hyperref[sec:cm_solver_max_iters]{cm\_solver\_max\_iters} & \hyperref[sec:cm_solver_restart]{cm\_solver\_restart} \\ \hline
+  \hyperref[sec:cm_solver_q_err]{cm\_solver\_q\_err} & \hyperref[sec:cm_domain_sparsity]{cm\_domain\_sparsity} \\ \hline
+  \hyperref[sec:cm_solver_pre_comp_type]{cm\_solver\_pre\_comp\_type} & \hyperref[sec:cm_solver_pre_comp_refactor]{cm\_solver\_pre\_comp\_refactor} \\ \hline
+  \hyperref[sec:cm_solver_pre_comp_droptol]{cm\_solver\_pre\_comp\_droptol} & \hyperref[sec:cm_solver_pre_comp_sweeps]{cm\_solver\_pre\_comp\_sweeps} \\ \hline
+  \hyperref[sec:cm_solver_pre_app_type]{cm\_solver\_pre\_app\_type} & \hyperref[sec:cm_solver_pre_app_jacobi_iters]{cm\_solver\_pre\_app\_jacobi\_iters} \\ \hline
+  \hyperref[sec:temp_init]{temp\_init} & \hyperref[sec:temp_final]{temp\_final} \\ \hline
+  \hyperref[sec:t_mass]{t\_mass} & \hyperref[sec:t_mode]{t\_mode} \\ \hline
+  \hyperref[sec:t_rate]{t\_rate} & \hyperref[sec:t_freq]{t\_freq} \\ \hline
+  \hyperref[sec:pressure]{pressure} & \hyperref[sec:p_mass]{p\_mass} \\ \hline
+  \hyperref[sec:compress]{compress} & \hyperref[sec:press_mode]{press\_mode} \\ \hline
+  \hyperref[sec:geo_format]{geo\_format} & \hyperref[sec:write_freq]{write\_freq} \\ \hline
+  \hyperref[sec:traj_compress]{traj\_compress} & \hyperref[sec:traj_format]{traj\_format} \\ \hline
+  \hyperref[sec:traj_title]{traj\_title} & \hyperref[sec:atom_info]{atom\_info} \\ \hline
+  \hyperref[sec:atom_forces]{atom\_forces} & \hyperref[sec:atom_velocities]{atom\_velocities} \\ \hline
+  \hyperref[sec:bond_info]{bond\_info} & \hyperref[sec:angle_info]{angle\_info} \\ \hline
+  \hyperref[sec:test_forces]{test\_forces} & \hyperref[sec:molec_anal]{molec\_anal} \\ \hline
+  \hyperref[sec:freq_molec_anal]{freq\_molec\_anal} & \hyperref[sec:dipole_anal]{dipole\_anal} \\ \hline
+  \hyperref[sec:freq_dipole_anal]{freq\_dipole\_anal} & \hyperref[sec:diffusion_coef]{diffusion\_coef} \\ \hline
+  \hyperref[sec:freq_diffusion_coef]{freq\_diffusion\_coef} & \hyperref[sec:restrict_type]{restrict\_type} \\ \hline
+  \hyperref[sec:restart_format]{restart\_format} & \hyperref[sec:restart_freq]{restart\_freq} \\ \hline
+\end{tabular}
+\end{center}
+
+\subsubsection{simulation\_name}
+\label{sec:simulation_name}
+
 \begin{verbatim}
   simulation_name    test_puremd
 \end{verbatim}
 Output files produced by PuReMD will be in 
 {\tt simulation\_name.some\_extension} format. Output files will be 
-discussed in more detail in Section~\ref{sec:puremd_output}. Default value 
-is {\tt simulate}.
+discussed in more detail in Section~\ref{sec:puremd_output}.
+
+Default: {\tt default.sim}
+
+\subsubsection{ensemble\_type}
+\label{sec:ensemble_type}
 
 \begin{verbatim}
   ensemble_type    1
@@ -153,28 +206,51 @@ PuReMD. Supported ensembles are as follows:
   \item 5: NPT: anisotropic NPT with Parrinello-Rehman coupling 
     (under development)
 \end{itemize}
-{\tt ensemble\_type} is NVE by default.
+
+Default: 0 (NVE)
+
+\subsubsection{nsteps}
+\label{sec:nsteps}
 
 \begin{verbatim}
   nsteps     1000
+\end{verbatim}
+{\tt nsteps} controls the total number of steps for the simulation.
+
+Default: 0
+
+\subsubsection{dt}
+\label{sec:dt}
+
+\begin{verbatim}
   dt         0.25
 \end{verbatim}
-{\tt nsteps} controls the total number of steps for the simulation and 
-{\tt dt} controls the length of each time step (measured in femtoseconds). 
-Number of steps is 0 by default and time step length is 0.25~fs.
+{\tt dt} controls the length of each time step (in femtoseconds). 
+
+Default: 0.25
+
+\subsubsection{proc\_by\_dim}
+\label{sec:proc_by_dim}
 
 \begin{verbatim}
   proc_by_dim     1 1 3
 \end{verbatim}
-PuReMD uses the domain decomposition technique to distribute the load
-among processors, it currently does not have dynamic load balancing.
+The distributed memory version of PuReMD uses the
+domain decomposition technique to distribute the load
+among processors. It currently does not have dynamic load balancing.
 {\tt proc\_by\_dim} denotes the desired decomposition of the simulation 
 box into subdomains (first integer is the number of equal-length 
 partitions in x dimension, second integer is for y dimension and 
 the last one is for z dimension). Each subdomain is subsequently assigned 
 to a single processor. PuReMD constructs a 3D torus based on the 
-{\tt proc\_by\_dim} parameter. The default is to use a single processor. 
-SerialReax does not accept the {\tt proc\_by\_dim} parameter.
+{\tt proc\_by\_dim} parameter.
+
+Default: 1 1 1 (single processor)
+
+Note: shared memory versions of PuReMD do not accept this parameter.
+
+\subsubsection{geo\_format}
+\label{sec:geo_format}
 
 \begin{verbatim}
   geo_format     0
@@ -193,7 +269,13 @@ to 2 (for ASCII restarts) or 3 (for binary restarts) and providing the name
 of the restart file as an argument to PuReMD (instead of the GEO file name).
 Then PuReMD will read the box geometry, positions and velocities for all 
 atoms in the system from the restart file and continue execution from thereon. 
-Default is the custom geometry format.
+
+Default: 0 (custom format)
+
+\subsubsection{restart\_format}
+\label{sec:restart_format}
+\subsubsection{restart\_freq}
+\label{sec:restart_freq}
 
 \begin{verbatim}
   restart_format   1
@@ -210,6 +292,11 @@ parameter is set to a positive integer. A restart file is named as follows:
 {\tt simulation\_name.resS} where {\tt S} denotes the step that the restart 
 file is written.
 
+Defaults: 0 (ASCII), 0
+
+\subsubsection{tabulate\_long\_range}
+\label{sec:tabulate_long_range}
+
 \begin{verbatim}
   tabulate_long_range    10000
 \end{verbatim}
@@ -225,7 +312,12 @@ the appropriate interpolation function is located and energy and forces
 between the atom pair is approximated by means of cubic spline interpolation.
 This method gives significant speed-up compared to computing everything from 
 scratch each time and with only 10000 sample points it is able to provide 
-results with an accuracy at machine precision level. Default is no tabulation.
+results with an accuracy at machine precision level.
+
+Default: 0 (no tabulation)
+
+\subsubsection{energy\_update\_freq}
+\label{sec:energy_update_freq}
 
 \begin{verbatim}
   energy_update_freq     10
@@ -233,8 +325,12 @@ results with an accuracy at machine precision level. Default is no tabulation.
 This option controls the frequency of writes into output files described 
 in detail in Section~\ref{sec:puremd_output} (except for the trajectory 
 and restart files which are controlled by other parameters explained
-separately). The default value for this parameter is 0, meaning there will 
-not be any energies and performance logs output.
+separately).
+
+Default: 0 (no energy or performance logs are output)
+
+\subsubsection{remove\_CoM\_vel}
+\label{sec:remove_CoM_vel}
 
 \begin{verbatim}
   remove_CoM_vel     500
@@ -243,8 +339,15 @@ Removal of translational and rotational velocities around the center of
 mass needs to be done for NVT and NPT type ensembles to remove the 
 nonphysical effects of scaling velocities. In case of NVE, this is  
 unnecessary and is not done regardless of the value of {\tt remove\_CoM\_vel}.
-The default is to remove translational and rotational velocities at 
-every 250 steps.
+
+Default: 25
+
+\subsubsection{nbrhood\_cutoff}
+\label{sec:nbrhood_cutoff}
+\subsubsection{thb\_cutoff}
+\label{sec:thb_cutoff}
+\subsubsection{hbond\_cutoff}
+\label{sec:hbond_cutoff}
 
 \begin{verbatim}
   nbrhood_cutoff     5.0     
@@ -253,20 +356,25 @@ every 250 steps.
 \end{verbatim}
 These cutoff parameters are crucial for the correctness and efficiency
 of PuReMD. Normally, bonded interactions are truncated after 4-5~\AA\ in 
-ReaxFF and this is controlled by the {\tt nbrhood\_cutoff} parameter 
-whose default value is 4~\AA.
+ReaxFF and this is controlled by the {\tt nbrhood\_cutoff} parameter.
 
 {\tt thb\_cutoff} sets the bond strength threshold for valence angle 
 interactions. Bonds which are weaker than {\tt thb\_cutoff} will not 
-be included in valence angle interactions. Default for {\tt thb\_cutoff} 
-is 0.001.
+be included in valence angle interactions.
 
 {\tt hbond\_cutoff} controls the distance between the donor and acceptor 
 atoms in a hydrogen bond interaction. Its typical value is from 6\AA\ to 
 7.5~\AA. If {\tt hbond\_cutoff} is set to 0, hydrogen bond interactions 
 will be turned off completely (could be useful for improved
 performance in simulations where it is known apriori that there are no 
-hydrogen bonding interactions). Default is to set {\tt hbond\_cutoff} to 0.
+hydrogen bonding interactions).
+
+Defaults: 4.0, 0.001, 0.0
+
+\subsubsection{reneighbor}
+\label{sec:reneighbor}
+\subsubsection{vlist\_buffer}
+\label{sec:vlist_buffer}
 
 \begin{verbatim}
   reneighbor     10
@@ -275,26 +383,134 @@ hydrogen bonding interactions). Default is to set {\tt hbond\_cutoff} to 0.
 PuReMD features delayed neighbor generation by using Verlet lists. 
 {\tt reneighbor} controls the reneighboring frequency and {\tt vlist\_buffer} 
 controls the buffer space beyond the maximum ReaxFF interaction cutoff. 
-By default, {\tt vlist\_buffer} is set to 0 and reneighboring is done at 
-every step.
+
+Defaults: 1 (reneighbor every step), 0
+
+\subsubsection{charge\_method}
+\label{sec:charge_method}
+\subsubsection{cm\_q\_net}
+\label{sec:cm_q_net}
+\subsubsection{cm\_solver\_type}
+\label{sec:cm_solver_type}
+\subsubsection{cm\_solver\_max\_iters}
+\label{sec:cm_solver_max_iters}
+\subsubsection{cm\_solver\_restart}
+\label{sec:cm_solver_restart}
+\subsubsection{cm\_solver\_q\_err}
+\label{sec:cm_solver_q_err}
+\subsubsection{cm\_solver\_sparsity\_enabled}
+\label{sec:cm_solver_sparsity_enabled}
+\subsubsection{cm\_domain\_sparsity}
+\label{sec:cm_domain_sparsity}
+\subsubsection{cm\_solver\_pre\_comp\_type}
+\label{sec:cm_solver_pre_comp_type}
+\subsubsection{cm\_solver\_pre\_comp\_sweeps}
+\label{sec:cm_solver_pre_comp_sweeps}
+\subsubsection{cm\_solver\_pre\_comp\_refactor}
+\label{sec:cm_solver_pre_comp_refactor}
+\subsubsection{cm\_solver\_pre\_comp\_droptol}
+\label{sec:cm_solver_pre_comp_droptol}
+\subsubsection{cm\_solver\_pre\_app\_type}
+\label{sec:cm_solver_pre_app_type}
+\subsubsection{cm\_solver\_pre\_app\_jacobi\_iters}
+\label{sec:cm_solver_pre_app_jacobi_iters}
 
 \begin{verbatim}
-  q_err        1e-6
-  qeq_freq     1
+  charge_method                     0
+  cm_q_net                          0.0
+  cm_solver_type                    0
+  cm_solver_max_iters               20
+  cm_solver_restart                 100
+  cm_solver_q_err                   1e-6
+  cm_domain_sparsity                1.0
+  cm_solver_pre_comp_type           1
+  cm_solver_pre_comp_refactor       1000
+  cm_solver_pre_comp_droptol        0.0
+  cm_solver_pre_comp_sweeps         3
+  cm_solver_pre_app_type            0
+  cm_solver_pre_app_jacobi_iters    50
 \end{verbatim}
-PuReMD uses a preconditioned conjugate gradients (PCG) solver with a 
-diagonal preconditioner for the QEq problem. {\tt q\_err} denotes the 
-stopping criteria for the PCG solver, the norm of the relative residual. 
-A lower threshold would yield more accurate equilibration of charges at 
-the expense of an increase in computation time. A threshold of $10^{-6}$ 
-should be good enough for most cases and this is the default value.
-
-{\tt qeq\_freq} can be used to perform charge equilibration at every 
+PuReMD uses one of several charge methods for dynamically
+determining atomic charges. Underpinning these charge methods
+are iterative linear solvers with optional preconditioning.
+
+{\tt charge\_method} controls which charge method is used.
+The options are charge equilibration
+(0), electronegativity equilibration (1), or atom-condensed Kohn-Sham
+approximated to second order (2).
+
+{\tt cm\_q\_net} controls the net system charge.
+
+{\tt cm\_solver\_type} controls which linear solver is used. Options
+are GMRES with restarts (0), Householder GMRES with restarts (1),
+conjugate gradient (2), and steepest descent (3).
+
+{\tt cm\_solver\_max\_iters} controls the maximum number of iterations
+the solver is allowed to perform in order to achieve convergence of
+the solution to within the required tolerance.
+
+{\tt cm\_solver\_restart} controls the maximum number of inner iterations
+that GMRES-based solvers are allowed before performing a restart.
+
+{\tt cm\_solver\_q\_err} sets the solution tolerance for the solver.
+
+{\tt cm\_domain\_sparsity} sets the sparsification ratio of the charge matrix
+used to compute the preconditioner. Specifically, an additional distance-based
+cutoff is applied to compute elements of the (sparsified) charge matrix, and
+this matrix is in turn used to compute a preconditioner. The value of this
+parameter is multiplied by the current neighbor cutoff value to obtain the new
+cutoff; hence, a value between 0.0 and 1.0, exclusive, is expected.
+
+{\tt cm\_solver\_pre\_comp\_type} sets the type of preconditioner to be
+computed. Options are none (0), Jacobi/diagonal inverse (1), incomplete
+Cholesky with dual thresholding (2), incomplete LU computed in an iterative
+fashion (3), and iterative incomplete LU single thresholding (4).
+
+{\tt cm\_solver\_pre\_comp\_refactor} sets the number of simulation steps
+after which to recompute the preconditioner.
+
+{\tt cm\_solver\_pre\_comp\_droptol} sets the dropping tolerance for
+computing incomplete Cholesky or LU preconditioners.
+
+{\tt cm\_solver\_pre\_comp\_sweeps} sets the number of sweeps (iterations)
+to perform when computing incomplete LU iteratively.
+
+{\tt cm\_solver\_pre\_app\_type} determines the type of method used
+to apply the preconditioner in the case of two-sided preconditioning
+(incomplete Cholesky and LU). Specifically, the application of the
+approximate triangular factors requires solving triangular linear systems,
+and this parameter controls how this is performed. Options are serial solve
+forward/backward substitution (0), solve via level scheduling (1), solve
+via graph coloring (3), or approximate solve via Jacobi iteration (4).
+
+{\tt cm\_solver\_pre\_app\_jacobi\_iters} controls how many iterations
+are performed for each approximate triangular solve via Jacobi iteration.
+
+{\tt cm\_solver\_freq} can be used to compute charges at every 
 few steps instead of the default behaviour of performing it at every 
-step. Although doing QEq less frequently would save important 
+step. Although doing this less frequently would save important 
 computational time, it is not recommended. Because this might cause wild 
 fluctuations in energies and forces.
 
+Defaults: 
+
+Notes:
+\begin{enumerate}
+  \item Only the shared memory versions of PuReMD contain all the solvers,
+    while only CG with diagonal inverse preconditioning is contained in the
+    distributed memory versions.
+  \item The full set of preconditioning options is implemented in the shared
+    memory non-GPU version of PuReMD. The other versions contain only diagonal
+    inverse preconditioning.
+\end{enumerate}
+
+\subsubsection{temp\_init}
+\label{sec:temp_init}
+\subsubsection{temp\_final}
+\label{sec:temp_final}
+\subsubsection{t\_mass}
+\label{sec:t_mass}
+
 \begin{verbatim}
   temp_init    0.0
   temp_final   300.0
@@ -307,11 +523,19 @@ is controlled via the {\tt temp\_init} parameter including the NVE ensemble.
 for {\tt temp\_final}. PuReMD features both Berendsen~\cite{ref:berendsen} 
 and Nose-Hoover~\cite{ref:klein} type thermostats as was mentioned while 
 explaining the {\tt ensemble\_type} parameter.
-\emph{Important note: Nose-Hoover thermostat in PuReMD is still under testing.}
 
-{\tt t\_mass} is the thermal inertia given in femtoseconds. Suggested (and 
-the default) value of {\tt t\_mass} is 500.0, and 0.166 for the Berendsen 
-thermostat, and for the Nose-Hoover thermostat, respectively.
+{\tt t\_mass} is the thermal inertia given in femtoseconds. Suggested
+values of {\tt t\_mass} are 500.0 and 0.166 for the Berendsen 
+and Nose-Hoover thermostats, respectively.
+
+Defaults: 0.0, 300.0, 0.16666
+
+Note: The Nose-Hoover thermostat is still under testing.
+
+\subsubsection{pressure}
+\label{sec:pressure}
+\subsubsection{p\_mass}
+\label{sec:p_mass}
 
 \begin{verbatim}
   pressure      0.000101 0.000101 0.000101
@@ -332,23 +556,46 @@ to control pressure. For the sNPT ensemble, {\tt pressure} parameter
 expects 3 floating point numbers to control pressure on each dimension.
 Same things apply for {\tt p\_mass} as well.
 
+Defaults: 0.000101325, 5000.0
+
+\subsubsection{write\_freq}
+\label{sec:write_freq}
+\subsubsection{traj\_format}
+\label{sec:traj_format}
+
 \begin{verbatim}
   write_freq     100
-  traj_method      1
+  traj_format      1
 \end{verbatim}
 Trajectory of the simulation will be output to the trajectory file 
 (which will automatically be named as {\tt simulation\_name.trj}) at 
 every {\tt write\_freq} steps. For making analysis easier, the trajectory 
-file is written as an ASCII file. By default, no trajectory file
-is written.
+file is written as an ASCII file.
 
-PuReMD can output trajectories either using simple MPI send/receives 
+The distributed memory version of PuReMD
+can output trajectories either using simple MPI send/receives 
 (option 0 which is the default) or using MPI I/O calls (option 1) which 
 are part of the MPI-2 standard. The latter option is supposed to be more 
 efficient (not verified by tests though) but may not be available in some 
 MPI implementations. {\tt traj\_method} option is not applicable to 
 SerialReax simulations.
 
+Defaults: 0 (no trajectory file written), 0 (simple I/O)
+
+
+\subsubsection{traj\_title}
+\label{sec:traj_title}
+\subsubsection{atom\_info}
+\label{sec:atom_info}
+\subsubsection{atom\_forces}
+\label{sec:atom_forces}
+\subsubsection{atom\_velocities}
+\label{sec:atom_velocities}
+\subsubsection{bond\_info}
+\label{sec:bond_info}
+\subsubsection{angle\_info}
+\label{sec:angle_info}
+
 \begin{verbatim}
   traj_title          TEST
   atom_info           1
@@ -370,8 +617,7 @@ box geometry is standard. However, the latter parts of the frame can be
 customized using {\tt atom\_info}, {\tt atom\_forces}, {\tt atom\_velocities}, 
 {\tt bond\_info} and {\tt angle\_info} parameters which are already
 self-explanatory. The ordering is atoms section, bonds section and angles 
-section assuming that they are all present. By default, all atom, bond and 
-angle information outputting is turned off.
+section assuming that they are all present.
 
 One nice property of the custom trajectory format is that each part of 
 the trajectory is prepended by a number that can be used to skip that part.
@@ -384,7 +630,7 @@ within a trajectory frame as well, making it easy to skip parts which are
 not of interest to a particular trajectory analysis procedure. So the 
 general layout of our custom trajectory format is as follows (assuming 
 all trajectory options are turned on):
-\begin{verbatim}
+\begin{lstlisting}
 CHARS_TO_SKIP_SECTION
 trajectory header
 CHARS_TO_SKIP_ATOM_DESCS NUM_LINES
@@ -397,9 +643,7 @@ CHARS_TO_SKIP_BOND_LINES NUM_BOND_LINES
 frame1 bond info
 CHARS_TO_SKIP_ANGLE_LINES NUM_ANGLE_LINES
 frame1 angle info
-.
-.
-.
+$\vdots$
 CHARS_TO_SKIP_FRAME_HEADER
 frameN header
 CHARS_TO_SKIP_ATOM_LINES NUM_ATOM_LINES
@@ -408,11 +652,11 @@ CHARS_TO_SKIP_BOND_LINES NUM_BOND_LINES
 frameN bond info
 CHARS_TO_SKIP_ANGLE_LINES NUM_ANGLE_LINES
 frameN angle info
-\end{verbatim}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\end{lstlisting}
+
+Defaults: default\_title, 0, 0, 0, 0, 0 (all off)
 
 
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{SerialReax Extras}
 \label{sec:serialreax_extras}
 
@@ -420,27 +664,27 @@ In this section, we explain the parameters found in SerialReax but not in
 PuReMD. Our work towards adding the same functionalities into PuReMD is 
 underway.
 
-In addition to the PCG solver, SerialReax features a preconditioned GMRES 
-(PGMRES) solver and an incomplete LU factorization (ILU) based  
-preconditioning scheme. An ILU factorization essentially does the 
-same thing as an LU factorization but small terms in the matrix are dropped
-to expedite the factorization and to prevent a huge number of fill-ins in the
-factor matrices. Following are the extra control parameters found in 
-SerialReax regarding the QEq solver:
-\begin{verbatim}
-  ilu_refactor        100
-  ilu_droptol         0.01
-\end{verbatim}
-{\tt ilu\_droptol} sets the threshold for dropping small terms in the 
-resulting ILU factors. Suggested (and the default) value for 
-{\tt ilu\_droptol} is $10^{-2}$. Despite the drop rules, ILU factorization 
-is still a costly operation. So a user can choose to perform it at 
-every {\tt ilu\_refactor} steps. The fact that atoms move very slowly in an 
-MD simulation allows the use of same ILU factors as preconditioners in the 
-subsequent steps with little performance loss. For liquids, this frequency 
-can be on the order of 100-200 steps, for solids it can go up to thousands 
-of steps depending on how fast atoms are moving. The default for 
-{\tt ilu\_refactor} is 100.
+%In addition to the PCG solver, SerialReax features a preconditioned GMRES 
+%(PGMRES) solver and an incomplete LU factorization (ILU) based  
+%preconditioning scheme. An ILU factorization essentially does the 
+%same thing as an LU factorization but small terms in the matrix are dropped
+%to expedite the factorization and to prevent a huge number of fill-ins in the
+%factor matrices. Following are the extra control parameters found in 
+%SerialReax regarding the QEq solver:
+%\begin{verbatim}
+%  ilu_refactor        100
+%  ilu_droptol         0.01
+%\end{verbatim}
+%{\tt ilu\_droptol} sets the threshold for dropping small terms in the 
+%resulting ILU factors. Suggested (and the default) value for 
+%{\tt ilu\_droptol} is $10^{-2}$. Despite the drop rules, ILU factorization 
+%is still a costly operation. So a user can choose to perform it at 
+%every {\tt ilu\_refactor} steps. The fact that atoms move very slowly in an 
+%MD simulation allows the use of same ILU factors as preconditioners in the 
+%subsequent steps with little performance loss. For liquids, this frequency 
+%can be on the order of 100-200 steps, for solids it can go up to thousands 
+%of steps depending on how fast atoms are moving. The default for 
+%{\tt ilu\_refactor} is 100.
 
 \begin{verbatim}
   t_mode        0
@@ -542,28 +786,56 @@ in molecular analysis.
 \section{Compilation and Execution}
 \label{sec:puremd_execute}
 
-PuReMD is distributed in the {\tt tar.gz} compression format which can 
+PuReMD is distributed in the \mintinline{bash}{tar.gz} compression format which can 
 be extracted under a Unix system with the following command:
-\begin{verbatim}
-  gtar xvzf PuReMD.tar.gz
-\end{verbatim}
-
-This results in the creation of a new directory, named {\tt PuReMD}, will appear in the working 
-directory. It contains the source code directory ({\tt src}) 
-along with a directory for sample systems ({\tt examples}).
-
-PuReMD can be compiled by switching to the {\tt src} directory and 
-running {\tt make}. The executable, {\tt puremd}, will be created inside 
-the source directory. The Makefile that comes in the distribution assumes 
-OpenMPI as the default MPI implementation and {\tt mpicc} as the default 
-MPI compiler. In case you have a different MPI implementation, 
-please set your MPI compiler in the Makefile appropriately. 
+\mint{bash}{tar -xvf PuReMD.tar.gz}
+
+This results in the creation of a new directory, named \mintinline{bash}{PuReMD}, which will
+appear in the working directory. The base directory contains a configure script
+along with the necessary makefiles to compile a particular version of the
+software; these steps are elaborated further below.  (For developers only: see
+the documentation on the Gitlab server for how to generate these files using
+the GNU Autotools.) In addition, there is also a directory with several sample
+systems and force field files (\mintinline{bash}{data/benchmarks}), and a
+directory with relevant documentation including this user manual
+(\mintinline{bash}{doc}).
+
+To build PuReMD, you must run the following sequence of commands:
+\begin{minted}{bash}
+  ./configure
+  make
+  make install # optional
+\end{minted}
+
+Upon successful compilation, a binary executable file will be located
+in \mintinline{bash}{*/bin}, where \mintinline{bash}{*} is one of the
+following directories:
+\begin{minted}{bash}
+  sPuReMD    # serial/OpenMP shared memory code
+  PuReMD-GPU # GPU shared memory code
+  PuReMD     # MPI distributed memory code
+  PG-PuReMD  # MPI+GPU distributed memory code
+\end{minted}
+
+There are several different versions of PuReMD which can be compiled,
+and these versions can be enabled or disabled via the following options
+to the configure script:
+\begin{minted}{bash}
+  --enable-serial=yes  # build serial shared memory version
+  --enable-openmp=yes  # build OpenMP shared memory version
+  --enable-gpu=yes     # build GPU shared memory version
+  --enable-mpi=yes     # build MPI distributed memory version
+  --enable-mpi-gpu=yes # build MPI+GPU distributed memory version
+\end{minted}
+Furthermore, run \mintinline{bash}{./configure --help} to see the full list
+of variables and options which can be set.
 
 PuReMD requires 3 input files as mentioned in section~\ref{sec:puremd_inp}. 
-For example, the command to run {\tt puremd} with OpenMPI is as follows:
-\begin{verbatim}
-  mpirun -np #p -machinefile m.txt puremd geo ffield control
-\end{verbatim}
+For example, the command to run \mintinline{bash}{puremd} with OpenMPI is as follows:
+\begin{minted}{bash}
+  mpirun -np num_procs -machinefile m.txt PuReMD/bin/puremd
+    path/to/geo path/to/ffield path/to/control
+\end{minted}
 
 SerialReax comes in a similar distribution format and Makefile,
 so instructions for compiling and running PuReMD is applicable for 
@@ -623,7 +895,7 @@ by its unique extension:
 Apart from these, there might be some text printed to \emph{stderr} 
 for debugging purposes. If you encounter some problems with the code
 (like a segmentation fault or unexpected termination of the code),
-please contact \href{mailto:haktulga@cs.purdue.edu}{haktulga@cs.purdue.edu} with the error message 
+please contact \href{mailto:hma@cse.msu.edu}{hma@cse.msu.edu} with the error message 
 printed to \emph{stderr} and your input files.
 
 In addition to the output files above, SerialReax can output another
diff --git a/environ/parallel_control b/environ/parallel_control
index d69d3eae3610ae57df68a0a26fafaf4f71bbec2e..cfe9fa5a59d51d4988d71569a19ba5fa52e2958b 100644
--- a/environ/parallel_control
+++ b/environ/parallel_control
@@ -3,41 +3,63 @@ ensemble_type            1                      ! 0: NVE, 1: Berendsen NVT, 2: N
 nsteps                   100                    ! number of simulation steps
 dt                       0.25                   ! time step in fs
 proc_by_dim              1 1 1                  ! distribution of processors by dimensions
+gpus_per_node            1                      ! GPUs per node
 
-reposition_atoms         1                      ! 0: just fit to periodic boundaries, 1: CoM to the center of box, 3: CoM to the origin
+reposition_atoms         0                      ! 0: just fit to periodic boundaries, 1: CoM to the center of box, 3: CoM to the origin
+restrict_bonds           0                      ! enforce the bonds given in CONECT lines of pdb file for this many steps
 tabulate_long_range      0                      ! number of sampling points for cubic spline interpolation, 0 no interpolation
 energy_update_freq       1
 remove_CoM_vel           500                    ! remove the translational and rotational vel around the center of mass at every 'this many' steps
 
 reneighbor               1
 vlist_buffer             0
-nbrhood_cutoff           4.5                    ! near neighbors cutoff for bond calculations (Angstroms)
+nbrhood_cutoff           5.0                    ! near neighbors cutoff for bond calculations (Angstroms)
 bond_graph_cutoff        0.3                    ! bond strength cutoff for bond graphs (Angstroms)
 thb_cutoff               0.001                  ! cutoff value for three body interactions (Angstroms)
-hbond_cutoff             7.5                    ! cutoff distance for hydrogen bond interactions (Angstroms)
+hbond_cutoff             7.50                   ! cutoff distance for hydrogen bond interactions (Angstroms)
 
-qeq_freq                 1                      ! frequency to update charges with QEq
-q_err                    1e-6                   ! norm of the relative residual in QEq solve
+charge_method                     0             ! charge method: 0 = QEq, 1 = EEM, 2 = ACKS2
+charge_freq                       1             ! frequency (sim step) at which atomic charges are computed
+cm_q_net                          0.0           ! net system charge
+cm_solver_type                    0             ! iterative linear solver for charge method: 0 = GMRES, 1 = GMRES_H, 2 = CG, 3 = SDM
+cm_solver_max_iters               1000          ! max solver iterations
+cm_solver_restart                 100           ! inner iterations of GMRES before restarting
+cm_solver_q_err                   1e-6          ! relative residual norm threshold used in solver
+cm_domain_sparsity                1.0           ! scalar for scaling cut-off distance, used to sparsify charge matrix (between 0.0 and 1.0)
+cm_solver_pre_comp_type           1             ! method used to compute preconditioner, if applicable
+cm_solver_pre_comp_refactor       1000          ! number of steps before recomputing preconditioner
+cm_solver_pre_comp_droptol        0.0           ! threshold tolerance for dropping values in preconditioner computation, if applicable
+cm_solver_pre_comp_sweeps         3             ! number of sweeps used to compute preconditioner (ILU_PAR)
+cm_solver_pre_app_type            1             ! method used to apply preconditioner
+cm_solver_pre_app_jacobi_iters    50            ! number of Jacobi iterations used for applying preconditioner, if applicable
 
 temp_init                0.01                   ! desired initial temperature of the simulated system
 temp_final               300.0                  ! desired final temperature of the simulated system
-t_mass                   500.0                  ! 0.16666 for nhNVT ! 500.0 for bNVT, iNPT, sNPT ! in fs, thermal inertia
+t_mass                   500.0                  ! thermal inertia parameter (fs): Nose-Hoover-NVT: 0.16666, NVP: 100.0, bNVT/iNPT/sNPT: 500.0
 t_mode                   2                      ! 0: T-coupling only, 1: step-wise, 2: constant slope
 t_rate                   5.0                    ! in K
 t_freq                   1.0                    ! in ps
 
 pressure                 0.000101325 0.000101325 0.000101325  ! desired pressure of the simulated system in GPa, 1atm = 0.000101325 GPa
 p_mass                   10000.00 10000.00 10000.00           ! in fs, pressure inertia parameter
+compress                 0.008134               ! in ps^2 * A / amu ( 4.5X10^(-5) bar^(-1) )
+press_mode               0                      ! 0: internal + external pressure, 1: ext only, 2: int only
 
 geo_format               1                      ! 0: custom  1: pdb (only if natoms < 100000) 2: ASCII restart 3: binary restart
-write_freq               500                    ! write trajectory after so many steps
+write_freq               0                      ! write trajectory after so many steps
 traj_method              1                      ! 0: simple parallel I/O, 1: MPI I/O
 traj_title               WATER_NVE              ! (no white spaces)
 atom_info                1                      ! 0: no atom info, 1: print basic atom info in the trajectory file
 atom_forces              0                      ! 0: basic atom format, 1: print force on each atom in the trajectory file
-atom_velocities          1                      ! 0: basic atom format, 1: print the velocity of each atom in the trajectory file
+atom_velocities          0                      ! 0: basic atom format, 1: print the velocity of each atom in the trajectory file
 bond_info                1                      ! 0: do not print bonds, 1: print bonds in the trajectory file
 angle_info               1                      ! 0: do not print angles, 1: print angles in the trajectory file 
 
+dipole_anal              0                      ! 1: calculate an electric dipole moment of the system
+freq_dipole_anal         1                      ! calculate electric dipole moment at every 'this many' steps
+diffusion_coef           0                      ! 1: calculate diffusion coefficient of the system
+freq_diffusion_coef      1                      ! calculate diffusion coefficient at every 'this many' steps
+restrict_type            2                      ! -1: all types of atoms, 0 and up: only this type of atoms
+
 restart_format           1                      ! 0: restarts in ASCII  1: restarts in binary
-restart_freq             10000                  ! 0: do not output any restart files. >0: output a restart file at every 'this many' steps
+restart_freq             0                      ! 0: do not output any restart files. >0: output a restart file at every 'this many' steps
diff --git a/environ/param.gpu.water b/environ/param.gpu.water
index 6712d0b605bfc1e3b4817297068e4ddf83cd772b..eed406e7e9e7eaad12d4c1f22e9db0aa166a5b9d 100644
--- a/environ/param.gpu.water
+++ b/environ/param.gpu.water
@@ -14,15 +14,19 @@ bond_graph_cutoff       0.3                     ! bond strength cutoff for bond
 thb_cutoff              0.001                   ! cutoff value for three body interactions (Angstroms)
 hbond_cutoff            7.50                    ! cutoff distance for hydrogen bond interactions (Angstroms)
 
-qeq_solver_type         0                       ! iterative linear solver used for equilibration kernel (QEq)
-qeq_solver_q_err        1e-6                    ! relative residual norm threshold used in solver
-qeq_domain_sparsity     1.0                     ! scalar for scaling cut-off distance, used to sparsify QEq matrix (between 0.0 and 1.0)
-pre_comp_type           1                       ! method used to compute QEq preconditioner, if applicable
-pre_comp_refactor       100                     ! nsteps to recompute preconditioner
-pre_comp_droptol        0.0                     ! threshold tolerance for dropping values in preconditioner computation, if applicable
-pre_comp_sweeps         3                       ! sweeps to compute preconditioner (ILU_PAR)
-pre_app_type            1                       ! method used to apply QEq preconditioner
-pre_app_jacobi_iters    50                      ! number of Jacobi iterations used for applying QEq precondition, if applicable
+charge_method         		  0             ! charge method: 0 = QEq, 1 = EEM, 2 = ACKS2
+cm_q_net              		  0.0           ! net system charge
+cm_solver_type        		  0             ! iterative linear solver for charge method: 0 = GMRES, 1 = GMRES_H, 2 = CG, 3 = SDM
+cm_solver_max_iters   		  20            ! max solver iterations
+cm_solver_restart     		  100           ! inner iterations of GMRES before restarting
+cm_solver_q_err       		  1e-6          ! relative residual norm threshold used in solver
+cm_domain_sparsity     		  1.0           ! scalar for scaling cut-off distance, used to sparsify charge matrix (between 0.0 and 1.0)
+cm_solver_pre_comp_type           1             ! method used to compute preconditioner, if applicable
+cm_solver_pre_comp_refactor       1000          ! number of steps before recomputing preconditioner
+cm_solver_pre_comp_droptol        0.0           ! threshold tolerance for dropping values in preconditioner computation, if applicable
+cm_solver_pre_comp_sweeps         3             ! number of sweeps used to compute preconditioner (ILU_PAR)
+cm_solver_pre_app_type            1             ! method used to apply preconditioner
+cm_solver_pre_app_jacobi_iters    50            ! number of Jacobi iterations used for applying preconditioner, if applicable
 
 temp_init               0.0                     ! desired initial temperature of the simulated system
 temp_final              300.0                   ! desired final temperature of the simulated system
@@ -55,3 +59,6 @@ freq_dipole_anal        1                       ! calculate electric dipole mome
 diffusion_coef          0                       ! 1: calculate diffusion coefficient of the system
 freq_diffusion_coef     1                       ! calculate diffusion coefficient at every 'this many' steps
 restrict_type           2                       ! -1: all types of atoms, 0 and up: only this type of atoms
+
+restart_format          1                       ! 0: restarts in ASCII  1: restarts in binary
+restart_freq            0                       ! 0: do not output any restart files. >0: output a restart file at every 'this many' steps
diff --git a/ltmain.sh b/ltmain.sh
index 63ae69dc6fecaf83c52fba2ad334f4b1369fb1cd..0f0a2da3f9dd10627626bf9725b332d95f314393 100644
--- a/ltmain.sh
+++ b/ltmain.sh
@@ -1,9 +1,12 @@
+#! /bin/sh
+## DO NOT EDIT - This file generated from ./build-aux/ltmain.in
+##               by inline-source v2014-01-03.01
 
-# libtool (GNU libtool) 2.4.2
+# libtool (GNU libtool) 2.4.6
+# Provide generalized library-building support services.
 # Written by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
 
-# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005, 2006,
-# 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+# Copyright (C) 1996-2015 Free Software Foundation, Inc.
 # This is free software; see the source for copying conditions.  There is NO
 # warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 
@@ -23,881 +26,2112 @@
 # General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
-# along with GNU Libtool; see the file COPYING.  If not, a copy
-# can be downloaded from http://www.gnu.org/licenses/gpl.html,
-# or obtained by writing to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-# Usage: $progname [OPTION]... [MODE-ARG]...
-#
-# Provide generalized library-building support services.
-#
-#       --config             show all configuration variables
-#       --debug              enable verbose shell tracing
-#   -n, --dry-run            display commands without modifying any files
-#       --features           display basic configuration information and exit
-#       --mode=MODE          use operation mode MODE
-#       --preserve-dup-deps  don't remove duplicate dependency libraries
-#       --quiet, --silent    don't print informational messages
-#       --no-quiet, --no-silent
-#                            print informational messages (default)
-#       --no-warn            don't display warning messages
-#       --tag=TAG            use configuration variables from tag TAG
-#   -v, --verbose            print more informational messages than default
-#       --no-verbose         don't print the extra informational messages
-#       --version            print version information
-#   -h, --help, --help-all   print short, long, or detailed help message
-#
-# MODE must be one of the following:
-#
-#         clean              remove files from the build directory
-#         compile            compile a source file into a libtool object
-#         execute            automatically set library path, then run a program
-#         finish             complete the installation of libtool libraries
-#         install            install libraries or executables
-#         link               create a library or an executable
-#         uninstall          remove libraries from an installed directory
-#
-# MODE-ARGS vary depending on the MODE.  When passed as first option,
-# `--mode=MODE' may be abbreviated as `MODE' or a unique abbreviation of that.
-# Try `$progname --help --mode=MODE' for a more detailed description of MODE.
-#
-# When reporting a bug, please describe a test case to reproduce it and
-# include the following information:
-#
-#         host-triplet:	$host
-#         shell:		$SHELL
-#         compiler:		$LTCC
-#         compiler flags:		$LTCFLAGS
-#         linker:		$LD (gnu? $with_gnu_ld)
-#         $progname:	(GNU libtool) 2.4.2
-#         automake:	$automake_version
-#         autoconf:	$autoconf_version
-#
-# Report bugs to <bug-libtool@gnu.org>.
-# GNU libtool home page: <http://www.gnu.org/software/libtool/>.
-# General help using GNU software: <http://www.gnu.org/gethelp/>.
 
 PROGRAM=libtool
 PACKAGE=libtool
-VERSION=2.4.2
-TIMESTAMP=""
-package_revision=1.3337
+VERSION=2.4.6
+package_revision=2.4.6
 
-# Be Bourne compatible
-if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+
+## ------ ##
+## Usage. ##
+## ------ ##
+
+# Run './libtool --help' for help with using this script from the
+# command line.
+
+
+## ------------------------------- ##
+## User overridable command paths. ##
+## ------------------------------- ##
+
+# After configure completes, it has a better idea of some of the
+# shell tools we need than the defaults used by the functions shared
+# with bootstrap, so set those here where they can still be over-
+# ridden by the user, but otherwise take precedence.
+
+: ${AUTOCONF="autoconf"}
+: ${AUTOMAKE="automake"}
+
+
+## -------------------------- ##
+## Source external libraries. ##
+## -------------------------- ##
+
+# Much of our low-level functionality needs to be sourced from external
+# libraries, which are installed to $pkgauxdir.
+
+# Set a version string for this script.
+scriptversion=2015-01-20.17; # UTC
+
+# General shell script boiler plate, and helper functions.
+# Written by Gary V. Vaughan, 2004
+
+# Copyright (C) 2004-2015 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions.  There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+
+# As a special exception to the GNU General Public License, if you distribute
+# this file as part of a program or library that is built using GNU Libtool,
+# you may include this file under the same distribution terms that you use
+# for the rest of that program.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNES FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# Please report bugs or propose patches to gary@gnu.org.
+
+
+## ------ ##
+## Usage. ##
+## ------ ##
+
+# Evaluate this file near the top of your script to gain access to
+# the functions and variables defined here:
+#
+#   . `echo "$0" | ${SED-sed} 's|[^/]*$||'`/build-aux/funclib.sh
+#
+# If you need to override any of the default environment variable
+# settings, do that before evaluating this file.
+
+
+## -------------------- ##
+## Shell normalisation. ##
+## -------------------- ##
+
+# Some shells need a little help to be as Bourne compatible as possible.
+# Before doing anything else, make sure all that help has been provided!
+
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
   emulate sh
   NULLCMD=:
-  # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
+  # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
   # is contrary to our usage.  Disable this feature.
   alias -g '${1+"$@"}'='"$@"'
   setopt NO_GLOB_SUBST
 else
-  case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac
+  case `(set -o) 2>/dev/null` in *posix*) set -o posix ;; esac
 fi
-BIN_SH=xpg4; export BIN_SH # for Tru64
-DUALCASE=1; export DUALCASE # for MKS sh
-
-# A function that is used when there is no print builtin or printf.
-func_fallback_echo ()
-{
-  eval 'cat <<_LTECHO_EOF
-$1
-_LTECHO_EOF'
-}
 
-# NLS nuisances: We save the old values to restore during execute mode.
-lt_user_locale=
-lt_safe_locale=
-for lt_var in LANG LANGUAGE LC_ALL LC_CTYPE LC_COLLATE LC_MESSAGES
+# NLS nuisances: We save the old values in case they are required later.
+_G_user_locale=
+_G_safe_locale=
+for _G_var in LANG LANGUAGE LC_ALL LC_CTYPE LC_COLLATE LC_MESSAGES
 do
-  eval "if test \"\${$lt_var+set}\" = set; then
-          save_$lt_var=\$$lt_var
-          $lt_var=C
-	  export $lt_var
-	  lt_user_locale=\"$lt_var=\\\$save_\$lt_var; \$lt_user_locale\"
-	  lt_safe_locale=\"$lt_var=C; \$lt_safe_locale\"
+  eval "if test set = \"\${$_G_var+set}\"; then
+          save_$_G_var=\$$_G_var
+          $_G_var=C
+	  export $_G_var
+	  _G_user_locale=\"$_G_var=\\\$save_\$_G_var; \$_G_user_locale\"
+	  _G_safe_locale=\"$_G_var=C; \$_G_safe_locale\"
 	fi"
 done
-LC_ALL=C
-LANGUAGE=C
-export LANGUAGE LC_ALL
 
-$lt_unset CDPATH
+# CDPATH.
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
 
+# Make sure IFS has a sensible default
+sp=' '
+nl='
+'
+IFS="$sp	$nl"
+
+# There are apparently some retarded systems that use ';' as a PATH separator!
+if test "${PATH_SEPARATOR+set}" != set; then
+  PATH_SEPARATOR=:
+  (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+    (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+      PATH_SEPARATOR=';'
+  }
+fi
 
-# Work around backward compatibility issue on IRIX 6.5. On IRIX 6.4+, sh
-# is ksh but when the shell is invoked as "sh" and the current value of
-# the _XPG environment variable is not equal to 1 (one), the special
-# positional parameter $0, within a function call, is the name of the
-# function.
-progpath="$0"
 
 
+## ------------------------- ##
+## Locate command utilities. ##
+## ------------------------- ##
+
+
+# func_executable_p FILE
+# ----------------------
+# Check that FILE is an executable regular file.
+func_executable_p ()
+{
+    test -f "$1" && test -x "$1"
+}
+
+
+# func_path_progs PROGS_LIST CHECK_FUNC [PATH]
+# --------------------------------------------
+# Search every directory in PATH (default: $PATH) for each program named
+# in PROGS_LIST, and leave the winner in $func_path_progs_result.  The
+# first candidate whose --version output contains "GNU" wins outright;
+# otherwise the value CHECK_FUNC last left in $func_check_prog_result is
+# used.  Exits with status 1 when no acceptable program was found.
+#
+# CHECK_FUNC is called with the path to a candidate program, and should
+# set $func_check_prog_result when it beats $_G_path_prog_max.
+func_path_progs ()
+{
+    _G_progs_list=$1
+    _G_check_func=$2
+    _G_PATH=${3-"$PATH"}
+    # Forget results of any earlier search so they cannot leak into this one.
+    func_path_progs_result= func_check_prog_result=
+    _G_path_prog_max=0 _G_path_prog_found=false
+    _G_save_IFS=$IFS; IFS=${PATH_SEPARATOR-:}
+    for _G_dir in $_G_PATH; do
+      IFS=$_G_save_IFS
+      test -z "$_G_dir" && _G_dir=.
+      for _G_prog_name in $_G_progs_list; do
+        for _exeext in '' .EXE; do
+          _G_path_prog=$_G_dir/$_G_prog_name$_exeext
+          func_executable_p "$_G_path_prog" || continue
+          case `"$_G_path_prog" --version 2>&1` in
+            *GNU*) func_path_progs_result=$_G_path_prog _G_path_prog_found=: ;;
+            *)     $_G_check_func "$_G_path_prog"
+                   func_path_progs_result=$func_check_prog_result
+                   ;;
+          esac
+          $_G_path_prog_found && break 3
+        done
+      done
+    done
+    IFS=$_G_save_IFS
+    test -z "$func_path_progs_result" && {
+      echo "no acceptable program among '$_G_progs_list' could be found in \$PATH" >&2
+      exit 1
+    }
+}
+
+
+# We want to be able to use the functions in this file before configure
+# has figured out where the best binaries are kept, which means we have
+# to search for them ourselves - except when the results are already set
+# where we skip the searches.
+
+# Unless the user overrides by setting SED, search the path for either GNU
+# sed, or the sed that truncates its output the least.
+test -z "$SED" && {
+  _G_sed_script=s/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/
+  for _G_i in 1 2 3 4 5 6 7; do
+    _G_sed_script=$_G_sed_script$nl$_G_sed_script
+  done
+  echo "$_G_sed_script" 2>/dev/null | sed 99q >conftest.sed
+  _G_sed_script=
+  # func_check_prog_sed PROG: feed PROG ever longer lines until it fails.
+  func_check_prog_sed ()
+  {
+    _G_path_prog=$1
+
+    _G_count=0
+    printf 0123456789 >conftest.in
+    while :
+    do
+      cat conftest.in conftest.in >conftest.tmp
+      mv conftest.tmp conftest.in
+      cp conftest.in conftest.nl
+      echo '' >> conftest.nl
+      "$_G_path_prog" -f conftest.sed <conftest.nl >conftest.out 2>/dev/null || break
+      diff conftest.out conftest.nl >/dev/null 2>&1 || break
+      _G_count=`expr $_G_count + 1`
+      if test "$_G_count" -gt "$_G_path_prog_max"; then
+        # Best one so far, save it but keep looking for a better one
+        func_check_prog_result=$_G_path_prog
+        _G_path_prog_max=$_G_count
+      fi
+      # 10*(2^10) chars as input seems more than enough
+      test 10 -lt "$_G_count" && break
+    done
+    rm -f conftest.in conftest.tmp conftest.nl conftest.out
+  }
+  # Quote the search path: directories in $PATH may contain whitespace.
+  func_path_progs "sed gsed" func_check_prog_sed "$PATH:/usr/xpg4/bin"
+  rm -f conftest.sed
+  SED=$func_path_progs_result
+}
+
+
+# Unless the user overrides by setting GREP, search the path for either GNU
+# grep, or the grep that truncates its output the least.
+test -z "$GREP" && {
+  func_check_prog_grep ()
+  {
+    _G_path_prog=$1
+
+    _G_count=0
+    # Keep $_G_path_prog_max from func_path_progs: it is the best score so far.
+    printf 0123456789 >conftest.in
+    while :
+    do
+      cat conftest.in conftest.in >conftest.tmp
+      mv conftest.tmp conftest.in
+      cp conftest.in conftest.nl
+      echo 'GREP' >> conftest.nl
+      "$_G_path_prog" -e 'GREP$' -e '-(cannot match)-' <conftest.nl >conftest.out 2>/dev/null || break
+      diff conftest.out conftest.nl >/dev/null 2>&1 || break
+      _G_count=`expr $_G_count + 1`
+      if test "$_G_count" -gt "$_G_path_prog_max"; then
+        # Best one so far, save it but keep looking for a better one
+        func_check_prog_result=$_G_path_prog
+        _G_path_prog_max=$_G_count
+      fi
+      # 10*(2^10) chars as input seems more than enough
+      test 10 -lt "$_G_count" && break
+    done
+    rm -f conftest.in conftest.tmp conftest.nl conftest.out
+  }
+  # Quote the search path: directories in $PATH may contain whitespace.
+  func_path_progs "grep ggrep" func_check_prog_grep "$PATH:/usr/xpg4/bin"
+  GREP=$func_path_progs_result
+}
+
+
+## ------------------------------- ##
+## User overridable command paths. ##
+## ------------------------------- ##
+
+# All uppercase variable names are used for environment variables.  These
+# variables can be overridden by the user before calling a script that
+# uses them if a suitable command of that name is not already available
+# in the command search PATH.
 
 : ${CP="cp -f"}
-test "${ECHO+set}" = set || ECHO=${as_echo-'printf %s\n'}
+: ${ECHO="printf %s\n"}
+: ${EGREP="$GREP -E"}
+: ${FGREP="$GREP -F"}
+: ${LN_S="ln -s"}
 : ${MAKE="make"}
 : ${MKDIR="mkdir"}
 : ${MV="mv -f"}
 : ${RM="rm -f"}
 : ${SHELL="${CONFIG_SHELL-/bin/sh}"}
-: ${Xsed="$SED -e 1s/^X//"}
-
-# Global variables:
-EXIT_SUCCESS=0
-EXIT_FAILURE=1
-EXIT_MISMATCH=63  # $? = 63 is used to indicate version mismatch to missing.
-EXIT_SKIP=77	  # $? = 77 is used to indicate a skipped test to automake.
-
-exit_status=$EXIT_SUCCESS
-
-# Make sure IFS has a sensible default
-lt_nl='
-'
-IFS=" 	$lt_nl"
 
-dirname="s,/[^/]*$,,"
-basename="s,^.*/,,"
 
-# func_dirname file append nondir_replacement
-# Compute the dirname of FILE.  If nonempty, add APPEND to the result,
-# otherwise set result to NONDIR_REPLACEMENT.
-func_dirname ()
-{
-    func_dirname_result=`$ECHO "${1}" | $SED "$dirname"`
-    if test "X$func_dirname_result" = "X${1}"; then
-      func_dirname_result="${3}"
-    else
-      func_dirname_result="$func_dirname_result${2}"
-    fi
-} # func_dirname may be replaced by extended shell implementation
+## -------------------- ##
+## Useful sed snippets. ##
+## -------------------- ##
 
+sed_dirname='s|/[^/]*$||'
+sed_basename='s|^.*/||'
 
-# func_basename file
-func_basename ()
-{
-    func_basename_result=`$ECHO "${1}" | $SED "$basename"`
-} # func_basename may be replaced by extended shell implementation
+# Sed substitution that helps us do robust quoting.  It backslashifies
+# metacharacters that are still active within double-quoted strings.
+sed_quote_subst='s|\([`"$\\]\)|\\\1|g'
 
+# Same as above, but do not quote variable references.
+sed_double_quote_subst='s/\(["`\\]\)/\\\1/g'
 
-# func_dirname_and_basename file append nondir_replacement
-# perform func_basename and func_dirname in a single function
-# call:
-#   dirname:  Compute the dirname of FILE.  If nonempty,
-#             add APPEND to the result, otherwise set result
-#             to NONDIR_REPLACEMENT.
-#             value returned in "$func_dirname_result"
-#   basename: Compute filename of FILE.
-#             value retuned in "$func_basename_result"
-# Implementation must be kept synchronized with func_dirname
-# and func_basename. For efficiency, we do not delegate to
-# those functions but instead duplicate the functionality here.
-func_dirname_and_basename ()
-{
-    # Extract subdirectory from the argument.
-    func_dirname_result=`$ECHO "${1}" | $SED -e "$dirname"`
-    if test "X$func_dirname_result" = "X${1}"; then
-      func_dirname_result="${3}"
-    else
-      func_dirname_result="$func_dirname_result${2}"
-    fi
-    func_basename_result=`$ECHO "${1}" | $SED -e "$basename"`
-} # func_dirname_and_basename may be replaced by extended shell implementation
+# Sed substitution that turns a string into a regex matching for the
+# string literally.
+sed_make_literal_regex='s|[].[^$\\*\/]|\\&|g'
 
+# Sed substitution that converts a w32 file name or path
+# that contains forward slashes, into one that contains
+# (escaped) backslashes.  A very naive implementation.
+sed_naive_backslashify='s|\\\\*|\\|g;s|/|\\|g;s|\\|\\\\|g'
+
+# Re-'\' parameter expansions in output of sed_double_quote_subst that
+# were '\'-ed in input to the same.  If an odd number of '\' preceded a
+# '$' in input to sed_double_quote_subst, that '$' was protected from
+# expansion.  Since each input '\' is now two '\'s, look for any number
+# of runs of four '\'s followed by two '\'s and then a '$'.  '\' that '$'.
+_G_bs='\\'
+_G_bs2='\\\\'
+_G_bs4='\\\\\\\\'
+_G_dollar='\$'
+sed_double_backslash="\
+  s/$_G_bs4/&\\
+/g
+  s/^$_G_bs2$_G_dollar/$_G_bs&/
+  s/\\([^$_G_bs]\\)$_G_bs2$_G_dollar/\\1$_G_bs2$_G_bs$_G_dollar/g
+  s/\n//g"
 
-# func_stripname prefix suffix name
-# strip PREFIX and SUFFIX off of NAME.
-# PREFIX and SUFFIX must not contain globbing or regex special
-# characters, hashes, percent signs, but SUFFIX may contain a leading
-# dot (in which case that matches only a dot).
-# func_strip_suffix prefix name
-func_stripname ()
-{
-    case ${2} in
-      .*) func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%\\\\${2}\$%%"`;;
-      *)  func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%${2}\$%%"`;;
-    esac
-} # func_stripname may be replaced by extended shell implementation
 
+## ----------------- ##
+## Global variables. ##
+## ----------------- ##
 
-# These SED scripts presuppose an absolute path with a trailing slash.
-pathcar='s,^/\([^/]*\).*$,\1,'
-pathcdr='s,^/[^/]*,,'
-removedotparts=':dotsl
-		s@/\./@/@g
-		t dotsl
-		s,/\.$,/,'
-collapseslashes='s@/\{1,\}@/@g'
-finalslash='s,/*$,/,'
+# Except for the global variables explicitly listed below, the following
+# functions in the '^func_' namespace, and the '^require_' namespace
+# variables initialised in the 'Resource management' section, sourcing
+# this file will not pollute your global namespace with anything
+# else. There's no portable way to scope variables in Bourne shell
+# though, so actually running these functions will sometimes place
+# results into a variable named after the function, and often use
+# temporary variables in the '^_G_' namespace. If you are careful to
+# avoid using those namespaces casually in your sourcing script, things
+# should continue to work as you expect. And, of course, you can freely
+# overwrite any of the functions or variables defined here before
+# calling anything to customize them.
 
-# func_normal_abspath PATH
-# Remove doubled-up and trailing slashes, "." path components,
-# and cancel out any ".." path components in PATH after making
-# it an absolute path.
-#             value returned in "$func_normal_abspath_result"
-func_normal_abspath ()
-{
-  # Start from root dir and reassemble the path.
-  func_normal_abspath_result=
-  func_normal_abspath_tpath=$1
-  func_normal_abspath_altnamespace=
-  case $func_normal_abspath_tpath in
-    "")
-      # Empty path, that just means $cwd.
-      func_stripname '' '/' "`pwd`"
-      func_normal_abspath_result=$func_stripname_result
-      return
-    ;;
-    # The next three entries are used to spot a run of precisely
-    # two leading slashes without using negated character classes;
-    # we take advantage of case's first-match behaviour.
-    ///*)
-      # Unusual form of absolute path, do nothing.
-    ;;
-    //*)
-      # Not necessarily an ordinary path; POSIX reserves leading '//'
-      # and for example Cygwin uses it to access remote file shares
-      # over CIFS/SMB, so we conserve a leading double slash if found.
-      func_normal_abspath_altnamespace=/
-    ;;
-    /*)
-      # Absolute path, do nothing.
-    ;;
-    *)
-      # Relative path, prepend $cwd.
-      func_normal_abspath_tpath=`pwd`/$func_normal_abspath_tpath
-    ;;
-  esac
-  # Cancel out all the simple stuff to save iterations.  We also want
-  # the path to end with a slash for ease of parsing, so make sure
-  # there is one (and only one) here.
-  func_normal_abspath_tpath=`$ECHO "$func_normal_abspath_tpath" | $SED \
-        -e "$removedotparts" -e "$collapseslashes" -e "$finalslash"`
-  while :; do
-    # Processed it all yet?
-    if test "$func_normal_abspath_tpath" = / ; then
-      # If we ascended to the root using ".." the result may be empty now.
-      if test -z "$func_normal_abspath_result" ; then
-        func_normal_abspath_result=/
-      fi
-      break
-    fi
-    func_normal_abspath_tcomponent=`$ECHO "$func_normal_abspath_tpath" | $SED \
-        -e "$pathcar"`
-    func_normal_abspath_tpath=`$ECHO "$func_normal_abspath_tpath" | $SED \
-        -e "$pathcdr"`
-    # Figure out what to do with it
-    case $func_normal_abspath_tcomponent in
-      "")
-        # Trailing empty path component, ignore it.
-      ;;
-      ..)
-        # Parent dir; strip last assembled component from result.
-        func_dirname "$func_normal_abspath_result"
-        func_normal_abspath_result=$func_dirname_result
-      ;;
-      *)
-        # Actual path component, append it.
-        func_normal_abspath_result=$func_normal_abspath_result/$func_normal_abspath_tcomponent
-      ;;
-    esac
-  done
-  # Restore leading double-slash if one was found on entry.
-  func_normal_abspath_result=$func_normal_abspath_altnamespace$func_normal_abspath_result
-}
+EXIT_SUCCESS=0
+EXIT_FAILURE=1
+EXIT_MISMATCH=63  # $? = 63 is used to indicate version mismatch to missing.
+EXIT_SKIP=77	  # $? = 77 is used to indicate a skipped test to automake.
 
-# func_relative_path SRCDIR DSTDIR
-# generates a relative path from SRCDIR to DSTDIR, with a trailing
-# slash if non-empty, suitable for immediately appending a filename
-# without needing to append a separator.
-#             value returned in "$func_relative_path_result"
-func_relative_path ()
-{
-  func_relative_path_result=
-  func_normal_abspath "$1"
-  func_relative_path_tlibdir=$func_normal_abspath_result
-  func_normal_abspath "$2"
-  func_relative_path_tbindir=$func_normal_abspath_result
-
-  # Ascend the tree starting from libdir
-  while :; do
-    # check if we have found a prefix of bindir
-    case $func_relative_path_tbindir in
-      $func_relative_path_tlibdir)
-        # found an exact match
-        func_relative_path_tcancelled=
-        break
-        ;;
-      $func_relative_path_tlibdir*)
-        # found a matching prefix
-        func_stripname "$func_relative_path_tlibdir" '' "$func_relative_path_tbindir"
-        func_relative_path_tcancelled=$func_stripname_result
-        if test -z "$func_relative_path_result"; then
-          func_relative_path_result=.
-        fi
-        break
-        ;;
-      *)
-        func_dirname $func_relative_path_tlibdir
-        func_relative_path_tlibdir=${func_dirname_result}
-        if test "x$func_relative_path_tlibdir" = x ; then
-          # Have to descend all the way to the root!
-          func_relative_path_result=../$func_relative_path_result
-          func_relative_path_tcancelled=$func_relative_path_tbindir
-          break
-        fi
-        func_relative_path_result=../$func_relative_path_result
-        ;;
-    esac
-  done
+# Allow overriding, eg assuming that you follow the convention of
+# putting '$debug_cmd' at the start of all your functions, you can get
+# bash to show function call trace with:
+#
+#    debug_cmd='eval echo "${FUNCNAME[0]} $*" >&2' bash your-script-name
+debug_cmd=${debug_cmd-":"}
+exit_cmd=:
 
-  # Now calculate path; take care to avoid doubling-up slashes.
-  func_stripname '' '/' "$func_relative_path_result"
-  func_relative_path_result=$func_stripname_result
-  func_stripname '/' '/' "$func_relative_path_tcancelled"
-  if test "x$func_stripname_result" != x ; then
-    func_relative_path_result=${func_relative_path_result}/${func_stripname_result}
-  fi
+# By convention, finish your script with:
+#
+#    exit $exit_status
+#
+# so that you can set exit_status to non-zero if you want to indicate
+# something went wrong during execution without actually bailing out at
+# the point of failure.
+exit_status=$EXIT_SUCCESS
 
-  # Normalisation. If bindir is libdir, return empty string,
-  # else relative path ending with a slash; either way, target
-  # file name can be directly appended.
-  if test ! -z "$func_relative_path_result"; then
-    func_stripname './' '' "$func_relative_path_result/"
-    func_relative_path_result=$func_stripname_result
-  fi
-}
+# Work around backward compatibility issue on IRIX 6.5. On IRIX 6.4+, sh
+# is ksh but when the shell is invoked as "sh" and the current value of
+# the _XPG environment variable is not equal to 1 (one), the special
+# positional parameter $0, within a function call, is the name of the
+# function.
+progpath=$0
 
-# The name of this program:
-func_dirname_and_basename "$progpath"
-progname=$func_basename_result
+# The name of this program.
+progname=`$ECHO "$progpath" |$SED "$sed_basename"`
 
-# Make sure we have an absolute path for reexecution:
+# Make sure we have an absolute progpath for reexecution:
 case $progpath in
   [\\/]*|[A-Za-z]:\\*) ;;
   *[\\/]*)
-     progdir=$func_dirname_result
+     progdir=`$ECHO "$progpath" |$SED "$sed_dirname"`
      progdir=`cd "$progdir" && pwd`
-     progpath="$progdir/$progname"
+     progpath=$progdir/$progname
      ;;
   *)
-     save_IFS="$IFS"
+     _G_IFS=$IFS
      IFS=${PATH_SEPARATOR-:}
      for progdir in $PATH; do
-       IFS="$save_IFS"
+       IFS=$_G_IFS
        test -x "$progdir/$progname" && break
      done
-     IFS="$save_IFS"
+     IFS=$_G_IFS
      test -n "$progdir" || progdir=`pwd`
-     progpath="$progdir/$progname"
+     progpath=$progdir/$progname
      ;;
 esac
 
-# Sed substitution that helps us do robust quoting.  It backslashifies
-# metacharacters that are still active within double-quoted strings.
-Xsed="${SED}"' -e 1s/^X//'
-sed_quote_subst='s/\([`"$\\]\)/\\\1/g'
-
-# Same as above, but do not quote variable references.
-double_quote_subst='s/\(["`\\]\)/\\\1/g'
 
-# Sed substitution that turns a string into a regex matching for the
-# string literally.
-sed_make_literal_regex='s,[].[^$\\*\/],\\&,g'
+## ----------------- ##
+## Standard options. ##
+## ----------------- ##
 
-# Sed substitution that converts a w32 file name or path
-# which contains forward slashes, into one that contains
-# (escaped) backslashes.  A very naive implementation.
-lt_sed_naive_backslashify='s|\\\\*|\\|g;s|/|\\|g;s|\\|\\\\|g'
-
-# Re-`\' parameter expansions in output of double_quote_subst that were
-# `\'-ed in input to the same.  If an odd number of `\' preceded a '$'
-# in input to double_quote_subst, that '$' was protected from expansion.
-# Since each input `\' is now two `\'s, look for any number of runs of
-# four `\'s followed by two `\'s and then a '$'.  `\' that '$'.
-bs='\\'
-bs2='\\\\'
-bs4='\\\\\\\\'
-dollar='\$'
-sed_double_backslash="\
-  s/$bs4/&\\
-/g
-  s/^$bs2$dollar/$bs&/
-  s/\\([^$bs]\\)$bs2$dollar/\\1$bs2$bs$dollar/g
-  s/\n//g"
+# The following options affect the operation of the functions defined
+# below, and should be set appropriately depending on run-time
+# parameters passed on the command line.
 
-# Standard options:
 opt_dry_run=false
-opt_help=false
 opt_quiet=false
 opt_verbose=false
-opt_warning=:
 
-# func_echo arg...
-# Echo program name prefixed message, along with the current mode
-# name if it has been set yet.
-func_echo ()
-{
-    $ECHO "$progname: ${opt_mode+$opt_mode: }$*"
-}
+# Categories 'all' and 'none' are always available.  Append any others
+# you will pass as the first argument to func_warning from your own
+# code.
+warning_categories=
 
-# func_verbose arg...
-# Echo program name prefixed message in verbose mode only.
-func_verbose ()
-{
-    $opt_verbose && func_echo ${1+"$@"}
+# By default, display warnings according to 'opt_warning_types'.  Set
+# 'warning_func'  to ':' to elide all warnings, or func_fatal_error to
+# treat the next displayed warning as a fatal error.
+warning_func=func_warn_and_continue
 
-    # A bug in bash halts the script if the last line of a function
-    # fails when set -e is in force, so we need another command to
-    # work around that:
-    :
-}
+# Set to 'all' to display all warnings, 'none' to suppress all
+# warnings, or a space delimited list of some subset of
+# 'warning_categories' to display only the listed warnings.
+opt_warning_types=all
 
-# func_echo_all arg...
-# Invoke $ECHO with all args, space-separated.
-func_echo_all ()
-{
-    $ECHO "$*"
-}
 
-# func_error arg...
-# Echo program name prefixed message to standard error.
-func_error ()
-{
-    $ECHO "$progname: ${opt_mode+$opt_mode: }"${1+"$@"} 1>&2
-}
+## -------------------- ##
+## Resource management. ##
+## -------------------- ##
 
-# func_warning arg...
-# Echo program name prefixed warning message to standard error.
-func_warning ()
-{
-    $opt_warning && $ECHO "$progname: ${opt_mode+$opt_mode: }warning: "${1+"$@"} 1>&2
+# This section contains definitions for functions that each ensure a
+# particular resource (a file, or a non-empty configuration variable for
+# example) is available, and if appropriate to extract default values
+# from pertinent package files. Call them using their associated
+# 'require_*' variable to ensure that they are executed, at most, once.
+#
+# It's entirely deliberate that calling these functions can set
+# variables that don't obey the namespace limitations obeyed by the rest
+# of this file, in order that they be as useful as possible to
+# callers.
 
-    # bash bug again:
-    :
-}
 
-# func_fatal_error arg...
-# Echo program name prefixed message to standard error, and exit.
-func_fatal_error ()
+# require_term_colors
+# -------------------
+# Allow display of bold text on terminals that support it.
+require_term_colors=func_require_term_colors
+func_require_term_colors ()
 {
-    func_error ${1+"$@"}
-    exit $EXIT_FAILURE
-}
+    $debug_cmd
+
+    test -t 1 && {
+      # COLORTERM and USE_ANSI_COLORS environment variables take
+      # precedence, because most terminfo databases neglect to describe
+      # whether color sequences are supported.
+      test -n "${COLORTERM+set}" && : ${USE_ANSI_COLORS="1"}
+
+      if test 1 = "$USE_ANSI_COLORS"; then
+        # Standard ANSI escape sequences
+        tc_reset='[0m'
+        tc_bold='[1m';   tc_standout='[7m'
+        tc_red='[31m';   tc_green='[32m'
+        tc_blue='[34m';  tc_cyan='[36m'
+      else
+        # Otherwise trust the terminfo database (setaf 1/2/4/6 = red/green/blue/cyan).
+        test -n "`tput sgr0 2>/dev/null`" && {
+          tc_reset=`tput sgr0`
+          test -n "`tput bold 2>/dev/null`" && tc_bold=`tput bold`
+          tc_standout=$tc_bold
+          test -n "`tput smso 2>/dev/null`" && tc_standout=`tput smso`
+          test -n "`tput setaf 1 2>/dev/null`" && tc_red=`tput setaf 1`
+          test -n "`tput setaf 2 2>/dev/null`" && tc_green=`tput setaf 2`
+          test -n "`tput setaf 4 2>/dev/null`" && tc_blue=`tput setaf 4`
+          test -n "`tput setaf 6 2>/dev/null`" && tc_cyan=`tput setaf 6`
+        }
+      fi
+    }
 
-# func_fatal_help arg...
-# Echo program name prefixed message to standard error, followed by
-# a help hint, and exit.
-func_fatal_help ()
-{
-    func_error ${1+"$@"}
-    func_fatal_error "$help"
+    require_term_colors=:
 }
-help="Try \`$progname --help' for more information."  ## default
 
 
-# func_grep expression filename
+## ----------------- ##
+## Function library. ##
+## ----------------- ##
+
+# This section contains a variety of useful functions to call in your
+# scripts. Take note of the portable wrappers for features provided by
+# some modern shells, which will fall back to slower equivalents on
+# less featureful shells.
+
+
+# func_append VAR VALUE
+# ---------------------
+# Append VALUE onto the existing contents of VAR.
+
+  # We should try to minimise forks, especially on Windows where they are
+  # unreasonably slow, so skip the feature probes when bash or zsh are
+  # being used:
+  if test set = "${BASH_VERSION+set}${ZSH_VERSION+set}"; then
+    : ${_G_HAVE_ARITH_OP="yes"}
+    : ${_G_HAVE_XSI_OPS="yes"}
+    # The += operator was introduced in bash 3.1
+    case $BASH_VERSION in
+      [12].* | 3.0 | 3.0*) ;;
+      *)
+        : ${_G_HAVE_PLUSEQ_OP="yes"}
+        ;;
+    esac
+  fi
+
+  # _G_HAVE_PLUSEQ_OP
+  # Can be empty, in which case the shell is probed, "yes" if += is
+  # useable or anything else if it does not work.
+  test -z "$_G_HAVE_PLUSEQ_OP" \
+    && (eval 'x=a; x+=" b"; test "a b" = "$x"') 2>/dev/null \
+    && _G_HAVE_PLUSEQ_OP=yes
+
+if test yes = "$_G_HAVE_PLUSEQ_OP"
+then
+  # This shell supports the '+=' operator, allowing a faster implementation...
+  eval 'func_append ()
+  {
+    $debug_cmd
+
+    eval "$1+=\$2"
+  }'
+else
+  # ...otherwise fall back to re-assigning the concatenated value.
+  func_append ()
+  {
+    $debug_cmd
+
+    eval "$1=\$$1\$2"
+  }
+fi
+
+
+# func_append_quoted VAR VALUE
+# ----------------------------
+# Quote VALUE and append to the end of shell variable VAR, separated
+# by a space.
+if test yes = "$_G_HAVE_PLUSEQ_OP"; then
+  eval 'func_append_quoted ()
+  {
+    $debug_cmd
+
+    func_quote_for_eval "$2"
+    eval "$1+=\\ \$func_quote_for_eval_result"
+  }'
+else
+  func_append_quoted ()
+  {
+    $debug_cmd
+
+    func_quote_for_eval "$2"
+    eval "$1=\$$1\\ \$func_quote_for_eval_result"
+  }
+fi
+
+
+# func_append_uniq VAR VALUE
+# --------------------------
+# Append unique VALUE onto the existing contents of VAR, assuming
+# entries are delimited by the first character of VALUE.  For example:
+#
+#   func_append_uniq options " --another-option option-argument"
+#
+# will only append to $options if " --another-option option-argument "
+# is not already present somewhere in $options already (note spaces at
+# each end implied by leading space in second argument).
+func_append_uniq ()
+{
+    $debug_cmd
+
+    eval _G_current_value='`$ECHO $'$1'`'
+    _G_delim=`expr "$2" : '\(.\)'`
+
+    case $_G_delim$_G_current_value$_G_delim in
+      *"$2$_G_delim"*) ;;
+      *) func_append "$@" ;;
+    esac
+}
+
+
+# func_arith TERM...
+# ------------------
+# Set func_arith_result to the result of evaluating TERMs.
+  test -z "$_G_HAVE_ARITH_OP" \
+    && (eval 'test 2 = $(( 1 + 1 ))') 2>/dev/null \
+    && _G_HAVE_ARITH_OP=yes
+
+if test yes = "$_G_HAVE_ARITH_OP"; then
+  eval 'func_arith ()
+  {
+    $debug_cmd
+
+    func_arith_result=$(( $* ))
+  }'
+else
+  func_arith ()
+  {
+    $debug_cmd
+
+    func_arith_result=`expr "$@"`
+  }
+fi
+
+
+# func_basename FILE
+# ------------------
+# Set func_basename_result to FILE with everything up to and including
+# the last / stripped.
+if test yes = "$_G_HAVE_XSI_OPS"; then
+  # If this shell supports suffix pattern removal, then use it to avoid
+  # forking. Hide the definitions in single quotes in case the shell
+  # chokes on unsupported syntax...
+  _b='func_basename_result=${1##*/}'
+  _d='case $1 in
+        */*) func_dirname_result=${1%/*}$2 ;;
+        *  ) func_dirname_result=$3        ;;
+      esac'
+
+else
+  # ...otherwise fall back to using sed.
+  _b='func_basename_result=`$ECHO "$1" |$SED "$sed_basename"`'
+  _d='func_dirname_result=`$ECHO "$1"  |$SED "$sed_dirname"`
+      if test "X$func_dirname_result" = "X$1"; then
+        func_dirname_result=$3
+      else
+        func_append func_dirname_result "$2"
+      fi'
+fi
+
+eval 'func_basename ()
+{
+    $debug_cmd
+
+    '"$_b"'
+}'
+
+
+# func_dirname FILE APPEND NONDIR_REPLACEMENT
+# -------------------------------------------
+# Compute the dirname of FILE.  If nonempty, add APPEND to the result,
+# otherwise set result to NONDIR_REPLACEMENT.
+eval 'func_dirname ()
+{
+    $debug_cmd
+
+    '"$_d"'
+}'
+
+
+# func_dirname_and_basename FILE APPEND NONDIR_REPLACEMENT
+# --------------------------------------------------------
+# Perform func_basename and func_dirname in a single function
+# call:
+#   dirname:  Compute the dirname of FILE.  If nonempty,
+#             add APPEND to the result, otherwise set result
+#             to NONDIR_REPLACEMENT.
+#             value returned in "$func_dirname_result"
+#   basename: Compute filename of FILE.
+#             value returned in "$func_basename_result"
+# For efficiency, we do not delegate to the functions above but instead
+# duplicate the functionality here.
+eval 'func_dirname_and_basename ()
+{
+    $debug_cmd
+
+    '"$_b"'
+    '"$_d"'
+}'
+
+
+# func_echo ARG...
+# ----------------
+# Echo program name prefixed message.
+func_echo ()
+{
+    $debug_cmd
+
+    _G_message=$*
+
+    func_echo_IFS=$IFS
+    IFS=$nl
+    for _G_line in $_G_message; do
+      IFS=$func_echo_IFS
+      $ECHO "$progname: $_G_line"
+    done
+    IFS=$func_echo_IFS
+}
+
+
+# func_echo_all ARG...
+# --------------------
+# Invoke $ECHO with all args, space-separated.
+func_echo_all ()
+{
+    $ECHO "$*"
+}
+
+
+# func_echo_infix_1 INFIX ARG...
+# ------------------------------
+# Echo program name, followed by INFIX on the first line, with any
+# additional lines not showing INFIX.
+func_echo_infix_1 ()
+{
+    $debug_cmd
+
+    $require_term_colors
+
+    _G_infix=$1; shift
+    _G_indent=$_G_infix
+    _G_prefix="$progname: $_G_infix: "
+    _G_message=$*
+
+    # Strip color escape sequences before counting printable length
+    for _G_tc in "$tc_reset" "$tc_bold" "$tc_standout" "$tc_red" "$tc_green" "$tc_blue" "$tc_cyan"
+    do
+      test -n "$_G_tc" && {
+        _G_esc_tc=`$ECHO "$_G_tc" | $SED "$sed_make_literal_regex"`
+        _G_indent=`$ECHO "$_G_indent" | $SED "s|$_G_esc_tc||g"`
+      }
+    done
+    _G_indent="$progname: "`echo "$_G_indent" | $SED 's|.| |g'`"  " ## exclude from sc_prohibit_nested_quotes
+
+    func_echo_infix_1_IFS=$IFS
+    IFS=$nl
+    for _G_line in $_G_message; do
+      IFS=$func_echo_infix_1_IFS
+      $ECHO "$_G_prefix$tc_bold$_G_line$tc_reset" >&2
+      _G_prefix=$_G_indent
+    done
+    IFS=$func_echo_infix_1_IFS
+}
+
+
+# func_error ARG...
+# -----------------
+# Echo program name prefixed message to standard error.
+func_error ()
+{
+    $debug_cmd
+
+    $require_term_colors
+
+    func_echo_infix_1 "  $tc_standout${tc_red}error$tc_reset" "$*" >&2
+}
+
+
+# func_fatal_error ARG...
+# -----------------------
+# Echo program name prefixed message to standard error, and exit.
+func_fatal_error ()
+{
+    $debug_cmd
+
+    func_error "$*"
+    exit $EXIT_FAILURE
+}
+
+
+# func_grep EXPRESSION FILENAME
+# -----------------------------
 # Check whether EXPRESSION matches any line of FILENAME, without output.
 func_grep ()
 {
+    $debug_cmd
+
     $GREP "$1" "$2" >/dev/null 2>&1
 }
 
 
-# func_mkdir_p directory-path
+# func_len STRING
+# ---------------
+# Set func_len_result to the length of STRING. STRING may not
+# start with a hyphen.
+  test -z "$_G_HAVE_XSI_OPS" \
+    && (eval 'x=a/b/c;
+      test 5aa/bb/cc = "${#x}${x%%/*}${x%/*}${x#*/}${x##*/}"') 2>/dev/null \
+    && _G_HAVE_XSI_OPS=yes
+
+if test yes = "$_G_HAVE_XSI_OPS"; then
+  eval 'func_len ()
+  {
+    $debug_cmd
+
+    func_len_result=${#1}
+  }'
+else
+  func_len ()
+  {
+    $debug_cmd
+
+    func_len_result=`expr "$1" : ".*" 2>/dev/null || echo $max_cmd_len`
+  }
+fi
+
+
+# func_mkdir_p DIRECTORY-PATH
+# ---------------------------
 # Make sure the entire path to DIRECTORY-PATH is available.
 func_mkdir_p ()
 {
-    my_directory_path="$1"
-    my_dir_list=
+    $debug_cmd
 
-    if test -n "$my_directory_path" && test "$opt_dry_run" != ":"; then
+    _G_directory_path=$1
+    _G_dir_list=
 
-      # Protect directory names starting with `-'
-      case $my_directory_path in
-        -*) my_directory_path="./$my_directory_path" ;;
+    if test -n "$_G_directory_path" && test : != "$opt_dry_run"; then
+
+      # Protect directory names starting with '-'
+      case $_G_directory_path in
+        -*) _G_directory_path=./$_G_directory_path ;;
       esac
 
       # While some portion of DIR does not yet exist...
-      while test ! -d "$my_directory_path"; do
+      while test ! -d "$_G_directory_path"; do
         # ...make a list in topmost first order.  Use a colon delimited
 	# list incase some portion of path contains whitespace.
-        my_dir_list="$my_directory_path:$my_dir_list"
+        _G_dir_list=$_G_directory_path:$_G_dir_list
 
         # If the last portion added has no slash in it, the list is done
-        case $my_directory_path in */*) ;; *) break ;; esac
+        case $_G_directory_path in */*) ;; *) break ;; esac
 
         # ...otherwise throw away the child directory and loop
-        my_directory_path=`$ECHO "$my_directory_path" | $SED -e "$dirname"`
+        _G_directory_path=`$ECHO "$_G_directory_path" | $SED -e "$sed_dirname"`
       done
-      my_dir_list=`$ECHO "$my_dir_list" | $SED 's,:*$,,'`
+      _G_dir_list=`$ECHO "$_G_dir_list" | $SED 's|:*$||'`
 
-      save_mkdir_p_IFS="$IFS"; IFS=':'
-      for my_dir in $my_dir_list; do
-	IFS="$save_mkdir_p_IFS"
-        # mkdir can fail with a `File exist' error if two processes
+      func_mkdir_p_IFS=$IFS; IFS=:
+      for _G_dir in $_G_dir_list; do
+	IFS=$func_mkdir_p_IFS
+        # mkdir can fail with a 'File exist' error if two processes
         # try to create one of the directories concurrently.  Don't
         # stop in that case!
-        $MKDIR "$my_dir" 2>/dev/null || :
+        $MKDIR "$_G_dir" 2>/dev/null || :
       done
-      IFS="$save_mkdir_p_IFS"
+      IFS=$func_mkdir_p_IFS
 
       # Bail out if we (or some other process) failed to create a directory.
-      test -d "$my_directory_path" || \
-        func_fatal_error "Failed to create \`$1'"
+      test -d "$_G_directory_path" || \
+        func_fatal_error "Failed to create '$1'"
     fi
 }
 
 
-# func_mktempdir [string]
+# func_mktempdir [BASENAME]
+# -------------------------
 # Make a temporary directory that won't clash with other running
 # libtool processes, and avoids race conditions if possible.  If
-# given, STRING is the basename for that directory.
+# given, BASENAME is the basename for that directory.
 func_mktempdir ()
 {
-    my_template="${TMPDIR-/tmp}/${1-$progname}"
+    $debug_cmd
+
+    _G_template=${TMPDIR-/tmp}/${1-$progname}
 
-    if test "$opt_dry_run" = ":"; then
+    if test : = "$opt_dry_run"; then
       # Return a directory name, but don't create it in dry-run mode
-      my_tmpdir="${my_template}-$$"
+      _G_tmpdir=$_G_template-$$
     else
 
       # If mktemp works, use that first and foremost
-      my_tmpdir=`mktemp -d "${my_template}-XXXXXXXX" 2>/dev/null`
+      _G_tmpdir=`mktemp -d "$_G_template-XXXXXXXX" 2>/dev/null`
 
-      if test ! -d "$my_tmpdir"; then
+      if test ! -d "$_G_tmpdir"; then
         # Failing that, at least try and use $RANDOM to avoid a race
-        my_tmpdir="${my_template}-${RANDOM-0}$$"
+        _G_tmpdir=$_G_template-${RANDOM-0}$$
 
-        save_mktempdir_umask=`umask`
+        func_mktempdir_umask=`umask`
         umask 0077
-        $MKDIR "$my_tmpdir"
-        umask $save_mktempdir_umask
+        $MKDIR "$_G_tmpdir"
+        umask $func_mktempdir_umask
       fi
 
       # If we're not in dry-run mode, bomb out on failure
-      test -d "$my_tmpdir" || \
-        func_fatal_error "cannot create temporary directory \`$my_tmpdir'"
+      test -d "$_G_tmpdir" || \
+        func_fatal_error "cannot create temporary directory '$_G_tmpdir'"
+    fi
+
+    $ECHO "$_G_tmpdir"
+}
+
+
+# func_normal_abspath PATH
+# ------------------------
+# Remove doubled-up and trailing slashes, "." path components,
+# and cancel out any ".." path components in PATH after making
+# it an absolute path.
+func_normal_abspath ()
+{
+    $debug_cmd
+
+    # These SED scripts presuppose an absolute path with a trailing slash.
+    _G_pathcar='s|^/\([^/]*\).*$|\1|'
+    _G_pathcdr='s|^/[^/]*||'
+    _G_removedotparts=':dotsl
+		s|/\./|/|g
+		t dotsl
+		s|/\.$|/|'
+    _G_collapseslashes='s|/\{1,\}|/|g'
+    _G_finalslash='s|/*$|/|'
+
+    # Start from root dir and reassemble the path.
+    func_normal_abspath_result=
+    func_normal_abspath_tpath=$1
+    func_normal_abspath_altnamespace=
+    case $func_normal_abspath_tpath in
+      "")
+        # Empty path, that just means $cwd.
+        func_stripname '' '/' "`pwd`"
+        func_normal_abspath_result=$func_stripname_result
+        return
+        ;;
+      # The next three entries are used to spot a run of precisely
+      # two leading slashes without using negated character classes;
+      # we take advantage of case's first-match behaviour.
+      ///*)
+        # Unusual form of absolute path, do nothing.
+        ;;
+      //*)
+        # Not necessarily an ordinary path; POSIX reserves leading '//'
+        # and for example Cygwin uses it to access remote file shares
+        # over CIFS/SMB, so we conserve a leading double slash if found.
+        func_normal_abspath_altnamespace=/
+        ;;
+      /*)
+        # Absolute path, do nothing.
+        ;;
+      *)
+        # Relative path, prepend $cwd.
+        func_normal_abspath_tpath=`pwd`/$func_normal_abspath_tpath
+        ;;
+    esac
+
+    # Cancel out all the simple stuff to save iterations.  We also want
+    # the path to end with a slash for ease of parsing, so make sure
+    # there is one (and only one) here.
+    func_normal_abspath_tpath=`$ECHO "$func_normal_abspath_tpath" | $SED \
+          -e "$_G_removedotparts" -e "$_G_collapseslashes" -e "$_G_finalslash"`
+    while :; do
+      # Processed it all yet?
+      if test / = "$func_normal_abspath_tpath"; then
+        # If we ascended to the root using ".." the result may be empty now.
+        if test -z "$func_normal_abspath_result"; then
+          func_normal_abspath_result=/
+        fi
+        break
+      fi
+      func_normal_abspath_tcomponent=`$ECHO "$func_normal_abspath_tpath" | $SED \
+          -e "$_G_pathcar"`
+      func_normal_abspath_tpath=`$ECHO "$func_normal_abspath_tpath" | $SED \
+          -e "$_G_pathcdr"`
+      # Figure out what to do with it
+      case $func_normal_abspath_tcomponent in
+        "")
+          # Trailing empty path component, ignore it.
+          ;;
+        ..)
+          # Parent dir; strip last assembled component from result.
+          func_dirname "$func_normal_abspath_result"
+          func_normal_abspath_result=$func_dirname_result
+          ;;
+        *)
+          # Actual path component, append it.
+          func_append func_normal_abspath_result "/$func_normal_abspath_tcomponent"
+          ;;
+      esac
+    done
+    # Restore leading double-slash if one was found on entry.
+    func_normal_abspath_result=$func_normal_abspath_altnamespace$func_normal_abspath_result
+}
+
+
+# func_notquiet ARG...
+# --------------------
+# Echo program name prefixed message only when not in quiet mode.
+func_notquiet ()
+{
+    $debug_cmd
+
+    $opt_quiet || func_echo ${1+"$@"}
+
+    # A bug in bash halts the script if the last line of a function
+    # fails when set -e is in force, so we need another command to
+    # work around that:
+    :
+}
+
+
+# func_relative_path SRCDIR DSTDIR
+# --------------------------------
+# Set func_relative_path_result to the relative path from SRCDIR to DSTDIR.
+func_relative_path ()
+{
+    $debug_cmd
+
+    func_relative_path_result=
+    func_normal_abspath "$1"
+    func_relative_path_tlibdir=$func_normal_abspath_result
+    func_normal_abspath "$2"
+    func_relative_path_tbindir=$func_normal_abspath_result
+
+    # Ascend the tree starting from libdir
+    while :; do
+      # check if we have found a prefix of bindir
+      case $func_relative_path_tbindir in
+        $func_relative_path_tlibdir)
+          # found an exact match
+          func_relative_path_tcancelled=
+          break
+          ;;
+        $func_relative_path_tlibdir*)
+          # found a matching prefix
+          func_stripname "$func_relative_path_tlibdir" '' "$func_relative_path_tbindir"
+          func_relative_path_tcancelled=$func_stripname_result
+          if test -z "$func_relative_path_result"; then
+            func_relative_path_result=.
+          fi
+          break
+          ;;
+        *)
+          func_dirname $func_relative_path_tlibdir
+          func_relative_path_tlibdir=$func_dirname_result
+          if test -z "$func_relative_path_tlibdir"; then
+            # Have to descend all the way to the root!
+            func_relative_path_result=../$func_relative_path_result
+            func_relative_path_tcancelled=$func_relative_path_tbindir
+            break
+          fi
+          func_relative_path_result=../$func_relative_path_result
+          ;;
+      esac
+    done
+
+    # Now calculate path; take care to avoid doubling-up slashes.
+    func_stripname '' '/' "$func_relative_path_result"
+    func_relative_path_result=$func_stripname_result
+    func_stripname '/' '/' "$func_relative_path_tcancelled"
+    if test -n "$func_stripname_result"; then
+      func_append func_relative_path_result "/$func_stripname_result"
+    fi
+
+    # Normalisation. If bindir is libdir, return '.' else relative path.
+    if test -n "$func_relative_path_result"; then
+      func_stripname './' '' "$func_relative_path_result"
+      func_relative_path_result=$func_stripname_result
     fi
 
-    $ECHO "$my_tmpdir"
+    test -n "$func_relative_path_result" || func_relative_path_result=.
+
+    :
+}
+
+
+# func_quote_for_eval ARG...
+# --------------------------
+# Aesthetically quote ARGs to be evaled later.
+# This function returns two values:
+#   i) func_quote_for_eval_result
+#      double-quoted, suitable for a subsequent eval
+#  ii) func_quote_for_eval_unquoted_result
+#      has all characters that are still active within double
+#      quotes backslashified.
+func_quote_for_eval ()
+{
+    $debug_cmd
+
+    func_quote_for_eval_unquoted_result=
+    func_quote_for_eval_result=
+    while test 0 -lt $#; do
+      case $1 in
+        *[\\\`\"\$]*)
+	  _G_unquoted_arg=`printf '%s\n' "$1" |$SED "$sed_quote_subst"` ;;
+        *)
+          _G_unquoted_arg=$1 ;;
+      esac
+      if test -n "$func_quote_for_eval_unquoted_result"; then
+	func_append func_quote_for_eval_unquoted_result " $_G_unquoted_arg"
+      else
+        func_append func_quote_for_eval_unquoted_result "$_G_unquoted_arg"
+      fi
+
+      case $_G_unquoted_arg in
+        # Double-quote args containing shell metacharacters to delay
+        # word splitting, command substitution and variable expansion
+        # for a subsequent eval.
+        # Many Bourne shells cannot handle close brackets correctly
+        # in scan sets, so we specify it separately.
+        *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
+          _G_quoted_arg=\"$_G_unquoted_arg\"
+          ;;
+        *)
+          _G_quoted_arg=$_G_unquoted_arg
+	  ;;
+      esac
+
+      if test -n "$func_quote_for_eval_result"; then
+	func_append func_quote_for_eval_result " $_G_quoted_arg"
+      else
+        func_append func_quote_for_eval_result "$_G_quoted_arg"
+      fi
+      shift
+    done
+}
+
+
+# func_quote_for_expand ARG
+# -------------------------
+# Aesthetically quote ARG to be evaled later; same as above,
+# but do not quote variable references.
+func_quote_for_expand ()
+{
+    $debug_cmd
+
+    case $1 in
+      *[\\\`\"]*)
+	_G_arg=`$ECHO "$1" | $SED \
+	    -e "$sed_double_quote_subst" -e "$sed_double_backslash"` ;;
+      *)
+        _G_arg=$1 ;;
+    esac
+
+    case $_G_arg in
+      # Double-quote args containing shell metacharacters to delay
+      # word splitting and command substitution for a subsequent eval.
+      # Many Bourne shells cannot handle close brackets correctly
+      # in scan sets, so we specify it separately.
+      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
+        _G_arg=\"$_G_arg\"
+        ;;
+    esac
+
+    func_quote_for_expand_result=$_G_arg
+}
+
+
+# func_stripname PREFIX SUFFIX NAME
+# ---------------------------------
+# strip PREFIX and SUFFIX from NAME, and store in func_stripname_result.
+# PREFIX and SUFFIX must not contain globbing or regex special
+# characters, hashes, percent signs, but SUFFIX may contain a leading
+# dot (in which case that matches only a dot).
+if test yes = "$_G_HAVE_XSI_OPS"; then
+  eval 'func_stripname ()
+  {
+    $debug_cmd
+
+    # pdksh 5.2.14 does not do ${X%$Y} correctly if both X and Y are
+    # positional parameters, so assign one to ordinary variable first.
+    func_stripname_result=$3
+    func_stripname_result=${func_stripname_result#"$1"}
+    func_stripname_result=${func_stripname_result%"$2"}
+  }'
+else
+  func_stripname ()
+  {
+    $debug_cmd
+
+    case $2 in
+      .*) func_stripname_result=`$ECHO "$3" | $SED -e "s%^$1%%" -e "s%\\\\$2\$%%"`;;
+      *)  func_stripname_result=`$ECHO "$3" | $SED -e "s%^$1%%" -e "s%$2\$%%"`;;
+    esac
+  }
+fi
+
+
+# func_show_eval CMD [FAIL_EXP]
+# -----------------------------
+# Unless opt_quiet is true, then output CMD.  Then, if opt_dry_run is
+# not true, evaluate CMD.  If the evaluation of CMD fails, and FAIL_EXP
+# is given, then evaluate it.
+func_show_eval ()
+{
+    $debug_cmd
+
+    _G_cmd=$1
+    _G_fail_exp=${2-':'}
+
+    func_quote_for_expand "$_G_cmd"
+    eval "func_notquiet $func_quote_for_expand_result"
+
+    $opt_dry_run || {
+      eval "$_G_cmd"
+      _G_status=$?
+      if test 0 -ne "$_G_status"; then
+	eval "(exit $_G_status); $_G_fail_exp"
+      fi
+    }
+}
+
+
+# func_show_eval_locale CMD [FAIL_EXP]
+# ------------------------------------
+# Unless opt_quiet is true, then output CMD.  Then, if opt_dry_run is
+# not true, evaluate CMD.  If the evaluation of CMD fails, and FAIL_EXP
+# is given, then evaluate it.  Use the saved locale for evaluation.
+func_show_eval_locale ()
+{
+    $debug_cmd
+
+    _G_cmd=$1
+    _G_fail_exp=${2-':'}
+
+    $opt_quiet || {
+      func_quote_for_expand "$_G_cmd"
+      eval "func_echo $func_quote_for_expand_result"
+    }
+
+    $opt_dry_run || {
+      eval "$_G_user_locale
+	    $_G_cmd"
+      _G_status=$?
+      eval "$_G_safe_locale"
+      if test 0 -ne "$_G_status"; then
+	eval "(exit $_G_status); $_G_fail_exp"
+      fi
+    }
+}
+
+
+# func_tr_sh
+# ----------
+# Turn $1 into a string suitable for a shell variable name.
+# Result is stored in $func_tr_sh_result.  All characters
+# not in the set a-zA-Z0-9_ are replaced with '_'. Further,
+# if $1 begins with a digit, a '_' is prepended as well.
+func_tr_sh ()
+{
+    $debug_cmd
+
+    case $1 in
+    [0-9]* | *[!a-zA-Z0-9_]*)
+      func_tr_sh_result=`$ECHO "$1" | $SED -e 's/^\([0-9]\)/_\1/' -e 's/[^a-zA-Z0-9_]/_/g'`
+      ;;
+    * )
+      func_tr_sh_result=$1
+      ;;
+    esac
+}
+
+
+# func_verbose ARG...
+# -------------------
+# Echo program name prefixed message in verbose mode only.
+func_verbose ()
+{
+    $debug_cmd
+
+    $opt_verbose && func_echo "$*"
+
+    :
+}
+
+
+# func_warn_and_continue ARG...
+# -----------------------------
+# Echo program name prefixed warning message to standard error.
+func_warn_and_continue ()
+{
+    $debug_cmd
+
+    $require_term_colors
+
+    func_echo_infix_1 "${tc_red}warning$tc_reset" "$*" >&2
+}
+
+
+# func_warning CATEGORY ARG...
+# ----------------------------
+# Echo program name prefixed warning message to standard error. Warning
+# messages can be filtered according to CATEGORY, where this function
+# elides messages where CATEGORY is not listed in the global variable
+# 'opt_warning_types'.
+func_warning ()
+{
+    $debug_cmd
+
+    # CATEGORY must be in the warning_categories list!
+    case " $warning_categories " in
+      *" $1 "*) ;;
+      *) func_internal_error "invalid warning category '$1'" ;;
+    esac
+
+    _G_category=$1
+    shift
+
+    case " $opt_warning_types " in
+      *" $_G_category "*) $warning_func ${1+"$@"} ;;
+    esac
+}
+
+
+# func_sort_ver VER1 VER2
+# -----------------------
+# 'sort -V' is not generally available.
+# Note this deviates from the version comparison in automake
+# in that it treats 1.5 < 1.5.0, and treats 1.4.4a < 1.4-p3a
+# but this should suffice as we won't be specifying old
+# version formats or redundant trailing .0 in bootstrap.conf.
+# If we did want full compatibility then we should probably
+# use m4_version_compare from autoconf.
+func_sort_ver ()
+{
+    $debug_cmd
+
+    printf '%s\n%s\n' "$1" "$2" \
+      | sort -t. -k 1,1n -k 2,2n -k 3,3n -k 4,4n -k 5,5n -k 6,6n -k 7,7n -k 8,8n -k 9,9n
+}
+
+# func_lt_ver PREV CURR
+# ---------------------
+# Return true if PREV and CURR are in the correct order according to
+# func_sort_ver, otherwise false.  Use it like this:
+#
+#  func_lt_ver "$prev_ver" "$proposed_ver" || func_fatal_error "..."
+func_lt_ver ()
+{
+    $debug_cmd
+
+    test "x$1" = x`func_sort_ver "$1" "$2" | $SED 1q`
+}
+
+
+# Local variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'before-save-hook 'time-stamp)
+# time-stamp-pattern: "10/scriptversion=%:y-%02m-%02d.%02H; # UTC"
+# time-stamp-time-zone: "UTC"
+# End:
+#! /bin/sh
+
+# Set a version string for this script.
+scriptversion=2014-01-07.03; # UTC
+
+# A portable, pluggable option parser for Bourne shell.
+# Written by Gary V. Vaughan, 2010
+
+# Copyright (C) 2010-2015 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions.  There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Please report bugs or propose patches to gary@gnu.org.
+
+
+## ------ ##
+## Usage. ##
+## ------ ##
+
+# This file is a library for parsing options in your shell scripts along
+# with assorted other useful supporting features that you can make use
+# of too.
+#
+# For the simplest scripts you might need only:
+#
+#   #!/bin/sh
+#   . relative/path/to/funclib.sh
+#   . relative/path/to/options-parser
+#   scriptversion=1.0
+#   func_options ${1+"$@"}
+#   eval set dummy "$func_options_result"; shift
+#   ...rest of your script...
+#
+# In order for the '--version' option to work, you will need to have a
+# suitably formatted comment like the one at the top of this file
+# starting with '# Written by ' and ending with '# warranty; '.
+#
+# For '-h' and '--help' to work, you will also need a one line
+# description of your script's purpose in a comment directly above the
+# '# Written by ' line, like the one at the top of this file.
+#
+# The default options also support '--debug', which will turn on shell
+# execution tracing (see the comment above debug_cmd below for another
+# use), and '--verbose' and the func_verbose function to allow your script
+# to display verbose messages only when your user has specified
+# '--verbose'.
+#
+# After sourcing this file, you can plug processing for additional
+# options by amending the variables from the 'Configuration' section
+# below, and following the instructions in the 'Option parsing'
+# section further down.
+
+## -------------- ##
+## Configuration. ##
+## -------------- ##
+
+# You should override these variables in your script after sourcing this
+# file so that they reflect the customisations you have added to the
+# option parser.
+
+# The usage line for option parsing errors and the start of '-h' and
+# '--help' output messages. You can embed shell variables for delayed
+# expansion at the time the message is displayed, but you will need to
+# quote other shell meta-characters carefully to prevent them being
+# expanded when the contents are evaled.
+usage='$progpath [OPTION]...'
+
+# Short help message in response to '-h' and '--help'.  Add to this or
+# override it after sourcing this library to reflect the full set of
+# options your script accepts.
+usage_message="\
+       --debug        enable verbose shell tracing
+   -W, --warnings=CATEGORY
+                      report the warnings falling in CATEGORY [all]
+   -v, --verbose      verbosely report processing
+       --version      print version information and exit
+   -h, --help         print short or long help message and exit
+"
+
+# Additional text appended to 'usage_message' in response to '--help'.
+long_help_message="
+Warning categories include:
+       'all'          show all warnings
+       'none'         turn off all the warnings
+       'error'        warnings are treated as fatal errors"
+
+# Help message printed before fatal option parsing errors.
+fatal_help="Try '\$progname --help' for more information."
+
+
+
+## ------------------------- ##
+## Hook function management. ##
+## ------------------------- ##
+
+# This section contains functions for adding, removing, and running hooks
+# to the main code.  A hook is just a named list of functions that can
+# be run in order later on.
+
+# func_hookable FUNC_NAME
+# -----------------------
+# Declare that FUNC_NAME will run hooks added with
+# 'func_add_hook FUNC_NAME ...'.
+func_hookable ()
+{
+    $debug_cmd
+
+    func_append hookable_fns " $1"
+}
+
+
+# func_add_hook FUNC_NAME HOOK_FUNC
+# ---------------------------------
+# Request that FUNC_NAME call HOOK_FUNC before it returns.  FUNC_NAME must
+# first have been declared "hookable" by a call to 'func_hookable'.
+func_add_hook ()
+{
+    $debug_cmd
+
+    case " $hookable_fns " in
+      *" $1 "*) ;;
+      *) func_fatal_error "'$1' does not accept hook functions." ;;
+    esac
+
+    eval func_append ${1}_hooks '" $2"'
+}
+
+
+# func_remove_hook FUNC_NAME HOOK_FUNC
+# ------------------------------------
+# Remove HOOK_FUNC from the list of functions called by FUNC_NAME.
+func_remove_hook ()
+{
+    $debug_cmd
+
+    eval ${1}_hooks='`$ECHO "\$'$1'_hooks" |$SED "s| '$2'||"`'
+}
+
+
+# func_run_hooks FUNC_NAME [ARG]...
+# ---------------------------------
+# Run all hook functions registered to FUNC_NAME, passing ARGs to the
+# first hook and each hook's '<hook>_result' on to the next one.  The
+# final argument list is stored, quoted for eval, in
+# 'func_run_hooks_result'.  The hook list is assumed to be a plain
+# whitespace-delimited list of legal shell function names.
+func_run_hooks ()
+{
+    $debug_cmd
+    # Refuse names that were never declared with 'func_hookable'.
+    case " $hookable_fns " in
+      *" $1 "*) ;;
+      *) func_fatal_error "'$1' does not support hook functions." ;;
+    esac
+    # Fetch the hook list for FUNC_NAME, then drop FUNC_NAME from "$@".
+    eval _G_hook_fns=\$$1_hooks; shift
+
+    for _G_hook in $_G_hook_fns; do
+      eval $_G_hook '"$@"'
+
+      # store returned options list back into positional
+      # parameters for next 'cmd' execution.
+      eval _G_hook_result=\$${_G_hook}_result
+      eval set dummy "$_G_hook_result"; shift
+    done
+    # Hand the (possibly modified) arguments back to the caller.
+    func_quote_for_eval ${1+"$@"}
+    func_run_hooks_result=$func_quote_for_eval_result
+}
+
+
+
+## --------------- ##
+## Option parsing. ##
+## --------------- ##
+
+# In order to add your own option parsing hooks, you must accept the
+# full positional parameter list in your hook function, remove any
+# options that you action, and then pass back the remaining unprocessed
+# options in '<hooked_function_name>_result', escaped suitably for
+# 'eval'.  Like this:
+#
+#    my_options_prep ()
+#    {
+#        $debug_cmd
+#
+#        # Extend the existing usage message.
+#        usage_message=$usage_message'
+#      -s, --silent       don'\''t print informational messages
+#    '
+#
+#        func_quote_for_eval ${1+"$@"}
+#        my_options_prep_result=$func_quote_for_eval_result
+#    }
+#    func_add_hook func_options_prep my_options_prep
+#
+#
+#    my_silent_option ()
+#    {
+#        $debug_cmd
+#
+#        # Note that for efficiency, we parse as many options as we can
+#        # recognise in a loop before passing the remainder back to the
+#        # caller on the first unrecognised argument we encounter.
+#        while test $# -gt 0; do
+#          opt=$1; shift
+#          case $opt in
+#            --silent|-s) opt_silent=: ;;
+#            # Separate non-argument short options:
+#            -s*)         func_split_short_opt "$opt"
+#                         set dummy "$func_split_short_opt_name" \
+#                             "-$func_split_short_opt_arg" ${1+"$@"}
+#                         shift
+#                         ;;
+#            *)            set dummy "$opt" ${1+"$@"}; shift; break ;;
+#          esac
+#        done
+#
+#        func_quote_for_eval ${1+"$@"}
+#        my_silent_option_result=$func_quote_for_eval_result
+#    }
+#    func_add_hook func_parse_options my_silent_option
+#
+#
+#    my_option_validation ()
+#    {
+#        $debug_cmd
+#
+#        $opt_silent && $opt_verbose && func_fatal_help "\
+#    '--silent' and '--verbose' options are mutually exclusive."
+#
+#        func_quote_for_eval ${1+"$@"}
+#        my_option_validation_result=$func_quote_for_eval_result
+#    }
+#    func_add_hook func_validate_options my_option_validation
+#
+# You'll also need to manually amend $usage_message to reflect the extra
+# options you parse.  It's preferable to append if you can, so that
+# multiple option parsing hooks can be added safely.
+
+
+# func_options [ARG]...
+# ---------------------
+# All the functions called inside func_options are hookable. See the
+# individual implementations for details.
+func_hookable func_options
+func_options ()
+{
+    $debug_cmd
+
+    func_options_prep ${1+"$@"}
+    eval func_parse_options \
+        ${func_options_prep_result+"$func_options_prep_result"}
+    eval func_validate_options \
+        ${func_parse_options_result+"$func_parse_options_result"}
+
+    eval func_run_hooks func_options \
+        ${func_validate_options_result+"$func_validate_options_result"}
+
+    # save modified positional parameters for caller
+    func_options_result=$func_run_hooks_result
 }
 
 
-# func_quote_for_eval arg
-# Aesthetically quote ARG to be evaled later.
-# This function returns two values: FUNC_QUOTE_FOR_EVAL_RESULT
-# is double-quoted, suitable for a subsequent eval, whereas
-# FUNC_QUOTE_FOR_EVAL_UNQUOTED_RESULT has merely all characters
-# which are still active within double quotes backslashified.
-func_quote_for_eval ()
+# func_options_prep [ARG]...
+# --------------------------
+# All initialisations required before starting the option parse loop.
+# Note that when calling hook functions, we pass through the list of
+# positional parameters.  If a hook function modifies that list, and
+# needs to propagate that back to the rest of this script, then the complete
+# modified list must be put in 'func_run_hooks_result' before
+# returning.
+func_hookable func_options_prep
+func_options_prep ()
 {
-    case $1 in
-      *[\\\`\"\$]*)
-	func_quote_for_eval_unquoted_result=`$ECHO "$1" | $SED "$sed_quote_subst"` ;;
-      *)
-        func_quote_for_eval_unquoted_result="$1" ;;
-    esac
+    $debug_cmd
 
-    case $func_quote_for_eval_unquoted_result in
-      # Double-quote args containing shell metacharacters to delay
-      # word splitting, command substitution and and variable
-      # expansion for a subsequent eval.
-      # Many Bourne shells cannot handle close brackets correctly
-      # in scan sets, so we specify it separately.
-      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
-        func_quote_for_eval_result="\"$func_quote_for_eval_unquoted_result\""
-        ;;
-      *)
-        func_quote_for_eval_result="$func_quote_for_eval_unquoted_result"
-    esac
+    # Option defaults:
+    opt_verbose=false
+    opt_warning_types=
+
+    func_run_hooks func_options_prep ${1+"$@"}
+
+    # save modified positional parameters for caller
+    func_options_prep_result=$func_run_hooks_result
 }
 
 
-# func_quote_for_expand arg
-# Aesthetically quote ARG to be evaled later; same as above,
-# but do not quote variable references.
-func_quote_for_expand ()
+# func_parse_options [ARG]...
+# ---------------------------
+# The main option parsing loop.
+func_hookable func_parse_options
+func_parse_options ()
 {
-    case $1 in
-      *[\\\`\"]*)
-	my_arg=`$ECHO "$1" | $SED \
-	    -e "$double_quote_subst" -e "$sed_double_backslash"` ;;
-      *)
-        my_arg="$1" ;;
-    esac
+    $debug_cmd
 
-    case $my_arg in
-      # Double-quote args containing shell metacharacters to delay
-      # word splitting and command substitution for a subsequent eval.
-      # Many Bourne shells cannot handle close brackets correctly
-      # in scan sets, so we specify it separately.
-      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
-        my_arg="\"$my_arg\""
-        ;;
-    esac
+    func_parse_options_result=
 
-    func_quote_for_expand_result="$my_arg"
-}
+    # this just eases exit handling
+    while test $# -gt 0; do
+      # Defer to hook functions for initial option parsing, so they
+      # get priority in the event of reusing an option name.
+      func_run_hooks func_parse_options ${1+"$@"}
 
+      # Adjust func_parse_options positional parameters to match
+      eval set dummy "$func_run_hooks_result"; shift
 
-# func_show_eval cmd [fail_exp]
-# Unless opt_silent is true, then output CMD.  Then, if opt_dryrun is
-# not true, evaluate CMD.  If the evaluation of CMD fails, and FAIL_EXP
-# is given, then evaluate it.
-func_show_eval ()
-{
-    my_cmd="$1"
-    my_fail_exp="${2-:}"
+      # Break out of the loop if we already parsed every option.
+      test $# -gt 0 || break
 
-    ${opt_silent-false} || {
-      func_quote_for_expand "$my_cmd"
-      eval "func_echo $func_quote_for_expand_result"
-    }
+      _G_opt=$1
+      shift
+      case $_G_opt in
+        --debug|-x)   debug_cmd='set -x'
+                      func_echo "enabling shell trace mode"
+                      $debug_cmd
+                      ;;
+
+        --no-warnings|--no-warning|--no-warn)
+                      set dummy --warnings none ${1+"$@"}
+                      shift
+		      ;;
 
-    if ${opt_dry_run-false}; then :; else
-      eval "$my_cmd"
-      my_status=$?
-      if test "$my_status" -eq 0; then :; else
-	eval "(exit $my_status); $my_fail_exp"
-      fi
-    fi
+        --warnings|--warning|-W)
+                      test $# = 0 && func_missing_arg $_G_opt && break
+                      case " $warning_categories $1" in
+                        *" $1 "*)
+                          # trailing space prevents matching last $1 above
+                          func_append_uniq opt_warning_types " $1"
+                          ;;
+                        *all)
+                          opt_warning_types=$warning_categories
+                          ;;
+                        *none)
+                          opt_warning_types=none
+                          warning_func=:
+                          ;;
+                        *error)
+                          opt_warning_types=$warning_categories
+                          warning_func=func_fatal_error
+                          ;;
+                        *)
+                          func_fatal_error \
+                             "unsupported warning category: '$1'"
+                          ;;
+                      esac
+                      shift
+                      ;;
+
+        --verbose|-v) opt_verbose=: ;;
+        --version)    func_version ;;
+        -\?|-h)       func_usage ;;
+        --help)       func_help ;;
+
+	# Separate optargs to long options (plugins may need this):
+	--*=*)        func_split_equals "$_G_opt"
+	              set dummy "$func_split_equals_lhs" \
+                          "$func_split_equals_rhs" ${1+"$@"}
+                      shift
+                      ;;
+
+       # Separate optargs to short options:
+        -W*)
+                      func_split_short_opt "$_G_opt"
+                      set dummy "$func_split_short_opt_name" \
+                          "$func_split_short_opt_arg" ${1+"$@"}
+                      shift
+                      ;;
+
+        # Separate non-argument short options:
+        -\?*|-h*|-v*|-x*)
+                      func_split_short_opt "$_G_opt"
+                      set dummy "$func_split_short_opt_name" \
+                          "-$func_split_short_opt_arg" ${1+"$@"}
+                      shift
+                      ;;
+
+        --)           break ;;
+        -*)           func_fatal_help "unrecognised option: '$_G_opt'" ;;
+        *)            set dummy "$_G_opt" ${1+"$@"}; shift; break ;;
+      esac
+    done
+
+    # save modified positional parameters for caller
+    func_quote_for_eval ${1+"$@"}
+    func_parse_options_result=$func_quote_for_eval_result
 }
 
 
-# func_show_eval_locale cmd [fail_exp]
-# Unless opt_silent is true, then output CMD.  Then, if opt_dryrun is
-# not true, evaluate CMD.  If the evaluation of CMD fails, and FAIL_EXP
-# is given, then evaluate it.  Use the saved locale for evaluation.
-func_show_eval_locale ()
+# func_validate_options [ARG]...
+# ------------------------------
+# Perform any sanity checks on option settings and/or unconsumed
+# arguments.
+func_hookable func_validate_options
+func_validate_options ()
 {
-    my_cmd="$1"
-    my_fail_exp="${2-:}"
+    $debug_cmd
 
-    ${opt_silent-false} || {
-      func_quote_for_expand "$my_cmd"
-      eval "func_echo $func_quote_for_expand_result"
-    }
+    # Display all warnings if -W was not given.
+    test -n "$opt_warning_types" || opt_warning_types=" $warning_categories"
 
-    if ${opt_dry_run-false}; then :; else
-      eval "$lt_user_locale
-	    $my_cmd"
-      my_status=$?
-      eval "$lt_safe_locale"
-      if test "$my_status" -eq 0; then :; else
-	eval "(exit $my_status); $my_fail_exp"
-      fi
-    fi
-}
+    func_run_hooks func_validate_options ${1+"$@"}
 
-# func_tr_sh
-# Turn $1 into a string suitable for a shell variable name.
-# Result is stored in $func_tr_sh_result.  All characters
-# not in the set a-zA-Z0-9_ are replaced with '_'. Further,
-# if $1 begins with a digit, a '_' is prepended as well.
-func_tr_sh ()
-{
-  case $1 in
-  [0-9]* | *[!a-zA-Z0-9_]*)
-    func_tr_sh_result=`$ECHO "$1" | $SED 's/^\([0-9]\)/_\1/; s/[^a-zA-Z0-9_]/_/g'`
-    ;;
-  * )
-    func_tr_sh_result=$1
-    ;;
-  esac
+    # Bail if the options were screwed!
+    $exit_cmd $EXIT_FAILURE
+
+    # save modified positional parameters for caller
+    func_validate_options_result=$func_run_hooks_result
 }
 
 
-# func_version
-# Echo version message to standard output and exit.
-func_version ()
-{
-    $opt_debug
 
-    $SED -n '/(C)/!b go
-	:more
-	/\./!{
-	  N
-	  s/\n# / /
-	  b more
-	}
-	:go
-	/^# '$PROGRAM' (GNU /,/# warranty; / {
-        s/^# //
-	s/^# *$//
-        s/\((C)\)[ 0-9,-]*\( [1-9][0-9]*\)/\1\2/
-        p
-     }' < "$progpath"
-     exit $?
-}
+## ----------------- ##
+## Helper functions. ##
+## ----------------- ##
 
-# func_usage
-# Echo short help message to standard output and exit.
-func_usage ()
+# This section contains the helper functions used by the rest of the
+# hookable option parser framework in ascii-betical order.
+
+
+# func_fatal_help ARG...
+# ----------------------
+# Echo program name prefixed message to standard error, followed by
+# a help hint, and exit.
+func_fatal_help ()
 {
-    $opt_debug
+    $debug_cmd
 
-    $SED -n '/^# Usage:/,/^#  *.*--help/ {
-        s/^# //
-	s/^# *$//
-	s/\$progname/'$progname'/
-	p
-    }' < "$progpath"
-    echo
-    $ECHO "run \`$progname --help | more' for full usage"
-    exit $?
+    eval \$ECHO \""Usage: $usage"\"
+    eval \$ECHO \""$fatal_help"\"
+    func_error ${1+"$@"}
+    exit $EXIT_FAILURE
 }
 
-# func_help [NOEXIT]
-# Echo long help message to standard output and exit,
-# unless 'noexit' is passed as argument.
+
+# func_help
+# ---------
+# Echo long help message to standard output and exit.
 func_help ()
 {
-    $opt_debug
-
-    $SED -n '/^# Usage:/,/# Report bugs to/ {
-	:print
-        s/^# //
-	s/^# *$//
-	s*\$progname*'$progname'*
-	s*\$host*'"$host"'*
-	s*\$SHELL*'"$SHELL"'*
-	s*\$LTCC*'"$LTCC"'*
-	s*\$LTCFLAGS*'"$LTCFLAGS"'*
-	s*\$LD*'"$LD"'*
-	s/\$with_gnu_ld/'"$with_gnu_ld"'/
-	s/\$automake_version/'"`(${AUTOMAKE-automake} --version) 2>/dev/null |$SED 1q`"'/
-	s/\$autoconf_version/'"`(${AUTOCONF-autoconf} --version) 2>/dev/null |$SED 1q`"'/
-	p
-	d
-     }
-     /^# .* home page:/b print
-     /^# General help using/b print
-     ' < "$progpath"
-    ret=$?
-    if test -z "$1"; then
-      exit $ret
-    fi
+    $debug_cmd
+
+    func_usage_message
+    $ECHO "$long_help_message"
+    exit 0
 }
 
-# func_missing_arg argname
+
+# func_missing_arg ARGNAME
+# ------------------------
 # Echo program name prefixed message to standard error and set global
 # exit_cmd.
 func_missing_arg ()
 {
-    $opt_debug
+    $debug_cmd
 
-    func_error "missing argument for $1."
+    func_error "Missing argument for '$1'."
     exit_cmd=exit
 }
 
 
-# func_split_short_opt shortopt
+# func_split_equals STRING
+# ------------------------
+# Set func_split_equals_lhs and func_split_equals_rhs shell variables after
+# splitting STRING at the '=' sign.
+test -z "$_G_HAVE_XSI_OPS" \
+    && (eval 'x=a/b/c;
+      test 5aa/bb/cc = "${#x}${x%%/*}${x%/*}${x#*/}${x##*/}"') 2>/dev/null \
+    && _G_HAVE_XSI_OPS=yes
+
+if test yes = "$_G_HAVE_XSI_OPS"
+then
+  # This is an XSI compatible shell, allowing a faster implementation...
+  eval 'func_split_equals ()
+  {
+      $debug_cmd
+
+      func_split_equals_lhs=${1%%=*}
+      func_split_equals_rhs=${1#*=}
+      test "x$func_split_equals_lhs" = "x$1" \
+        && func_split_equals_rhs=
+  }'
+else
+  # ...otherwise fall back to using expr, which is often a shell builtin.
+  func_split_equals ()
+  {
+      $debug_cmd
+
+      func_split_equals_lhs=`expr "x$1" : 'x\([^=]*\)'`
+      func_split_equals_rhs=
+      test "x$func_split_equals_lhs" = "x$1" \
+        || func_split_equals_rhs=`expr "x$1" : 'x[^=]*=\(.*\)$'`
+  }
+fi #func_split_equals
+
+
+# func_split_short_opt SHORTOPT
+# -----------------------------
 # Set func_split_short_opt_name and func_split_short_opt_arg shell
 # variables after splitting SHORTOPT after the 2nd character.
-func_split_short_opt ()
+if test yes = "$_G_HAVE_XSI_OPS"
+then
+  # This is an XSI compatible shell, allowing a faster implementation...
+  eval 'func_split_short_opt ()
+  {
+      $debug_cmd
+
+      func_split_short_opt_arg=${1#??}
+      func_split_short_opt_name=${1%"$func_split_short_opt_arg"}
+  }'
+else
+  # ...otherwise fall back to using expr, which is often a shell builtin.
+  func_split_short_opt ()
+  {
+      $debug_cmd
+
+      func_split_short_opt_name=`expr "x$1" : 'x-\(.\)'`
+      func_split_short_opt_arg=`expr "x$1" : 'x-.\(.*\)$'`
+  }
+fi #func_split_short_opt
+
+
+# func_usage
+# ----------
+# Echo short help message to standard output and exit.
+func_usage ()
 {
-    my_sed_short_opt='1s/^\(..\).*$/\1/;q'
-    my_sed_short_rest='1s/^..\(.*\)$/\1/;q'
+    $debug_cmd
 
-    func_split_short_opt_name=`$ECHO "$1" | $SED "$my_sed_short_opt"`
-    func_split_short_opt_arg=`$ECHO "$1" | $SED "$my_sed_short_rest"`
-} # func_split_short_opt may be replaced by extended shell implementation
+    func_usage_message
+    $ECHO "Run '$progname --help |${PAGER-more}' for full usage"
+    exit 0
+}
 
 
-# func_split_long_opt longopt
-# Set func_split_long_opt_name and func_split_long_opt_arg shell
-# variables after splitting LONGOPT at the `=' sign.
-func_split_long_opt ()
+# func_usage_message
+# ------------------
+# Echo short help message to standard output.
+func_usage_message ()
 {
-    my_sed_long_opt='1s/^\(--[^=]*\)=.*/\1/;q'
-    my_sed_long_arg='1s/^--[^=]*=//'
+    $debug_cmd
 
-    func_split_long_opt_name=`$ECHO "$1" | $SED "$my_sed_long_opt"`
-    func_split_long_opt_arg=`$ECHO "$1" | $SED "$my_sed_long_arg"`
-} # func_split_long_opt may be replaced by extended shell implementation
+    eval \$ECHO \""Usage: $usage"\"
+    echo
+    $SED -n 's|^# ||
+        /^Written by/{
+          x;p;x
+        }
+	h
+	/^Written by/q' < "$progpath"
+    echo
+    eval \$ECHO \""$usage_message"\"
+}
 
-exit_cmd=:
 
+# func_version
+# ------------
+# Echo version message to standard output and exit.
+func_version ()
+{
+    $debug_cmd
 
+    printf '%s\n' "$progname $scriptversion"
+    $SED -n '
+        /(C)/!b go
+        :more
+        /\./!{
+          N
+          s|\n# | |
+          b more
+        }
+        :go
+        /^# Written by /,/# warranty; / {
+          s|^# ||
+          s|^# *$||
+          s|\((C)\)[ 0-9,-]*[ ,-]\([1-9][0-9]* \)|\1 \2|
+          p
+        }
+        /^# Written by / {
+          s|^# ||
+          p
+        }
+        /^warranty; /q' < "$progpath"
 
+    exit $?
+}
 
 
-magic="%%%MAGIC variable%%%"
-magic_exe="%%%MAGIC EXE variable%%%"
+# Local variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'before-save-hook 'time-stamp)
+# time-stamp-pattern: "10/scriptversion=%:y-%02m-%02d.%02H; # UTC"
+# time-stamp-time-zone: "UTC"
+# End:
 
-# Global variables.
-nonopt=
-preserve_args=
-lo2o="s/\\.lo\$/.${objext}/"
-o2lo="s/\\.${objext}\$/.lo/"
-extracted_archives=
-extracted_serial=0
+# Set a version string.
+scriptversion='(GNU libtool) 2.4.6'
 
-# If this variable is set in any of the actions, the command in it
-# will be execed at the end.  This prevents here-documents from being
-# left over by shells.
-exec_cmd=
 
-# func_append var value
-# Append VALUE to the end of shell variable VAR.
-func_append ()
+# func_echo ARG...
+# ----------------
+# Libtool also displays the current mode in messages, so override
+# funclib.sh func_echo with this custom definition.
+func_echo ()
 {
-    eval "${1}=\$${1}\${2}"
-} # func_append may be replaced by extended shell implementation
+    $debug_cmd
 
-# func_append_quoted var value
-# Quote VALUE and append to the end of shell variable VAR, separated
-# by a space.
-func_append_quoted ()
-{
-    func_quote_for_eval "${2}"
-    eval "${1}=\$${1}\\ \$func_quote_for_eval_result"
-} # func_append_quoted may be replaced by extended shell implementation
+    _G_message=$*
 
+    func_echo_IFS=$IFS
+    IFS=$nl
+    for _G_line in $_G_message; do
+      IFS=$func_echo_IFS
+      $ECHO "$progname${opt_mode+: $opt_mode}: $_G_line"
+    done
+    IFS=$func_echo_IFS
+}
 
-# func_arith arithmetic-term...
-func_arith ()
+
+# func_warning ARG...
+# -------------------
+# Libtool warnings are not categorized, so override funclib.sh
+# func_warning with this simpler definition.
+func_warning ()
 {
-    func_arith_result=`expr "${@}"`
-} # func_arith may be replaced by extended shell implementation
+    $debug_cmd
 
+    $warning_func ${1+"$@"}
+}
 
-# func_len string
-# STRING may not start with a hyphen.
-func_len ()
-{
-    func_len_result=`expr "${1}" : ".*" 2>/dev/null || echo $max_cmd_len`
-} # func_len may be replaced by extended shell implementation
 
+## ---------------- ##
+## Options parsing. ##
+## ---------------- ##
+
+# Hook in the functions to make sure our own options are parsed during
+# the option parsing loop.
+
+usage='$progpath [OPTION]... [MODE-ARG]...'
+
+# Short help message in response to '-h'.
+usage_message="Options:
+       --config             show all configuration variables
+       --debug              enable verbose shell tracing
+   -n, --dry-run            display commands without modifying any files
+       --features           display basic configuration information and exit
+       --mode=MODE          use operation mode MODE
+       --no-warnings        equivalent to '-Wnone'
+       --preserve-dup-deps  don't remove duplicate dependency libraries
+       --quiet, --silent    don't print informational messages
+       --tag=TAG            use configuration variables from tag TAG
+   -v, --verbose            print more informational messages than default
+       --version            print version information
+   -W, --warnings=CATEGORY  report the warnings falling in CATEGORY [all]
+   -h, --help, --help-all   print short, long, or detailed help message
+"
 
-# func_lo2o object
-func_lo2o ()
+# Override func_help to append the MODE list and bug-report details to '--help'.
+func_help ()
 {
-    func_lo2o_result=`$ECHO "${1}" | $SED "$lo2o"`
-} # func_lo2o may be replaced by extended shell implementation
+    $debug_cmd
+
+    func_usage_message
+    $ECHO "$long_help_message
+
+MODE must be one of the following:
+
+       clean           remove files from the build directory
+       compile         compile a source file into a libtool object
+       execute         automatically set library path, then run a program
+       finish          complete the installation of libtool libraries
+       install         install libraries or executables
+       link            create a library or an executable
+       uninstall       remove libraries from an installed directory
+
+MODE-ARGS vary depending on the MODE.  When passed as first option,
+'--mode=MODE' may be abbreviated as 'MODE' or a unique abbreviation of that.
+Try '$progname --help --mode=MODE' for a more detailed description of MODE.
+
+When reporting a bug, please describe a test case to reproduce it and
+include the following information:
+
+       host-triplet:   $host
+       shell:          $SHELL
+       compiler:       $LTCC
+       compiler flags: $LTCFLAGS
+       linker:         $LD (gnu? $with_gnu_ld)
+       version:        $progname (GNU libtool) 2.4.6
+       automake:       `($AUTOMAKE --version) 2>/dev/null |$SED 1q`
+       autoconf:       `($AUTOCONF --version) 2>/dev/null |$SED 1q`
+
+Report bugs to <bug-libtool@gnu.org>.
+GNU libtool home page: <http://www.gnu.org/software/libtool/>.
+General help using GNU software: <http://www.gnu.org/gethelp/>."
+    exit 0
+}
 
 
-# func_xform libobj-or-source
-func_xform ()
-{
-    func_xform_result=`$ECHO "${1}" | $SED 's/\.[^.]*$/.lo/'`
-} # func_xform may be replaced by extended shell implementation
+# func_lo2o OBJECT-NAME
+# ---------------------
+# Transform OBJECT-NAME from a '.lo' suffix to the platform specific
+# object suffix.
+
+lo2o=s/\\.lo\$/.$objext/
+o2lo=s/\\.$objext\$/.lo/
+
+if test yes = "$_G_HAVE_XSI_OPS"; then
+  eval 'func_lo2o ()
+  {
+    case $1 in
+      *.lo) func_lo2o_result=${1%.lo}.$objext ;;
+      *   ) func_lo2o_result=$1               ;;
+    esac
+  }'
+
+  # func_xform LIBOBJ-OR-SOURCE
+  # ---------------------------
+  # Transform LIBOBJ-OR-SOURCE from a '.o' or '.c' (or otherwise)
+  # suffix to a '.lo' libtool-object suffix.
+  eval 'func_xform ()
+  {
+    func_xform_result=${1%.*}.lo
+  }'
+else
+  # ...otherwise fall back to using sed.
+  func_lo2o ()
+  {
+    func_lo2o_result=`$ECHO "$1" | $SED "$lo2o"`
+  }
+
+  func_xform ()
+  {
+    func_xform_result=`$ECHO "$1" | $SED 's|\.[^.]*$|.lo|'`
+  }
+fi
 
 
-# func_fatal_configuration arg...
+# func_fatal_configuration ARG...
+# -------------------------------
 # Echo program name prefixed message to standard error, followed by
 # a configuration failure hint, and exit.
 func_fatal_configuration ()
 {
-    func_error ${1+"$@"}
-    func_error "See the $PACKAGE documentation for more information."
-    func_fatal_error "Fatal configuration error."
+    func_fatal_error ${1+"$@"} \
+      "See the $PACKAGE documentation for more information." \
+      "Fatal configuration error."
 }
 
 
 # func_config
+# -----------
 # Display the configuration for all the tags in this script.
 func_config ()
 {
@@ -915,17 +2149,19 @@ func_config ()
     exit $?
 }
 
+
 # func_features
+# -------------
 # Display the features supported by this script.
 func_features ()
 {
     echo "host: $host"
-    if test "$build_libtool_libs" = yes; then
+    if test yes = "$build_libtool_libs"; then
       echo "enable shared libraries"
     else
       echo "disable shared libraries"
     fi
-    if test "$build_old_libs" = yes; then
+    if test yes = "$build_old_libs"; then
       echo "enable static libraries"
     else
       echo "disable static libraries"
@@ -934,314 +2170,350 @@ func_features ()
     exit $?
 }
 
-# func_enable_tag tagname
+
+# func_enable_tag TAGNAME
+# -----------------------
 # Verify that TAGNAME is valid, and either flag an error and exit, or
 # enable the TAGNAME tag.  We also add TAGNAME to the global $taglist
 # variable here.
 func_enable_tag ()
 {
-  # Global variable:
-  tagname="$1"
+    # Global variable:
+    tagname=$1
 
-  re_begincf="^# ### BEGIN LIBTOOL TAG CONFIG: $tagname\$"
-  re_endcf="^# ### END LIBTOOL TAG CONFIG: $tagname\$"
-  sed_extractcf="/$re_begincf/,/$re_endcf/p"
+    re_begincf="^# ### BEGIN LIBTOOL TAG CONFIG: $tagname\$"
+    re_endcf="^# ### END LIBTOOL TAG CONFIG: $tagname\$"
+    sed_extractcf=/$re_begincf/,/$re_endcf/p
 
-  # Validate tagname.
-  case $tagname in
-    *[!-_A-Za-z0-9,/]*)
-      func_fatal_error "invalid tag name: $tagname"
-      ;;
-  esac
+    # Validate tagname.
+    case $tagname in
+      *[!-_A-Za-z0-9,/]*)
+        func_fatal_error "invalid tag name: $tagname"
+        ;;
+    esac
 
-  # Don't test for the "default" C tag, as we know it's
-  # there but not specially marked.
-  case $tagname in
-    CC) ;;
+    # Don't test for the "default" C tag, as we know it's
+    # there but not specially marked.
+    case $tagname in
+        CC) ;;
     *)
-      if $GREP "$re_begincf" "$progpath" >/dev/null 2>&1; then
-	taglist="$taglist $tagname"
-
-	# Evaluate the configuration.  Be careful to quote the path
-	# and the sed script, to avoid splitting on whitespace, but
-	# also don't use non-portable quotes within backquotes within
-	# quotes we have to do it in 2 steps:
-	extractedcf=`$SED -n -e "$sed_extractcf" < "$progpath"`
-	eval "$extractedcf"
-      else
-	func_error "ignoring unknown tag $tagname"
-      fi
-      ;;
-  esac
+        if $GREP "$re_begincf" "$progpath" >/dev/null 2>&1; then
+	  taglist="$taglist $tagname"
+
+	  # Evaluate the configuration.  Be careful to quote the path
+	  # and the sed script, to avoid splitting on whitespace, but
+	  # also don't use non-portable quotes within backquotes within
+	  # quotes we have to do it in 2 steps:
+	  extractedcf=`$SED -n -e "$sed_extractcf" < "$progpath"`
+	  eval "$extractedcf"
+        else
+	  func_error "ignoring unknown tag $tagname"
+        fi
+        ;;
+    esac
 }
 
+
 # func_check_version_match
+# ------------------------
 # Ensure that we are using m4 macros, and libtool script from the same
 # release of libtool.
 func_check_version_match ()
 {
-  if test "$package_revision" != "$macro_revision"; then
-    if test "$VERSION" != "$macro_version"; then
-      if test -z "$macro_version"; then
-        cat >&2 <<_LT_EOF
+    if test "$package_revision" != "$macro_revision"; then
+      if test "$VERSION" != "$macro_version"; then
+        if test -z "$macro_version"; then
+          cat >&2 <<_LT_EOF
 $progname: Version mismatch error.  This is $PACKAGE $VERSION, but the
 $progname: definition of this LT_INIT comes from an older release.
 $progname: You should recreate aclocal.m4 with macros from $PACKAGE $VERSION
 $progname: and run autoconf again.
 _LT_EOF
-      else
-        cat >&2 <<_LT_EOF
+        else
+          cat >&2 <<_LT_EOF
 $progname: Version mismatch error.  This is $PACKAGE $VERSION, but the
 $progname: definition of this LT_INIT comes from $PACKAGE $macro_version.
 $progname: You should recreate aclocal.m4 with macros from $PACKAGE $VERSION
 $progname: and run autoconf again.
 _LT_EOF
-      fi
-    else
-      cat >&2 <<_LT_EOF
+        fi
+      else
+        cat >&2 <<_LT_EOF
 $progname: Version mismatch error.  This is $PACKAGE $VERSION, revision $package_revision,
 $progname: but the definition of this LT_INIT comes from revision $macro_revision.
 $progname: You should recreate aclocal.m4 with macros from revision $package_revision
 $progname: of $PACKAGE $VERSION and run autoconf again.
 _LT_EOF
-    fi
+      fi
 
-    exit $EXIT_MISMATCH
-  fi
+      exit $EXIT_MISMATCH
+    fi
 }
 
 
-# Shorthand for --mode=foo, only valid as the first argument
-case $1 in
-clean|clea|cle|cl)
-  shift; set dummy --mode clean ${1+"$@"}; shift
-  ;;
-compile|compil|compi|comp|com|co|c)
-  shift; set dummy --mode compile ${1+"$@"}; shift
-  ;;
-execute|execut|execu|exec|exe|ex|e)
-  shift; set dummy --mode execute ${1+"$@"}; shift
-  ;;
-finish|finis|fini|fin|fi|f)
-  shift; set dummy --mode finish ${1+"$@"}; shift
-  ;;
-install|instal|insta|inst|ins|in|i)
-  shift; set dummy --mode install ${1+"$@"}; shift
-  ;;
-link|lin|li|l)
-  shift; set dummy --mode link ${1+"$@"}; shift
-  ;;
-uninstall|uninstal|uninsta|uninst|unins|unin|uni|un|u)
-  shift; set dummy --mode uninstall ${1+"$@"}; shift
-  ;;
-esac
+# libtool_options_prep [ARG]...
+# -----------------------------
+# Set libtool option defaults and expand a leading MODE shorthand.
+libtool_options_prep ()
+{
+    $debug_cmd
 
+    # Option defaults:
+    opt_config=false
+    opt_dlopen=
+    opt_dry_run=false
+    opt_help=false
+    opt_mode=
+    opt_preserve_dup_deps=false
+    opt_quiet=false
 
+    nonopt=
+    preserve_args=
 
-# Option defaults:
-opt_debug=:
-opt_dry_run=false
-opt_config=false
-opt_preserve_dup_deps=false
-opt_features=false
-opt_finish=false
-opt_help=false
-opt_help_all=false
-opt_silent=:
-opt_warning=:
-opt_verbose=:
-opt_silent=false
-opt_verbose=false
+    # Shorthand for --mode=foo, only valid as the first argument
+    case $1 in
+    clean|clea|cle|cl)
+      shift; set dummy --mode clean ${1+"$@"}; shift
+      ;;
+    compile|compil|compi|comp|com|co|c)
+      shift; set dummy --mode compile ${1+"$@"}; shift
+      ;;
+    execute|execut|execu|exec|exe|ex|e)
+      shift; set dummy --mode execute ${1+"$@"}; shift
+      ;;
+    finish|finis|fini|fin|fi|f)
+      shift; set dummy --mode finish ${1+"$@"}; shift
+      ;;
+    install|instal|insta|inst|ins|in|i)
+      shift; set dummy --mode install ${1+"$@"}; shift
+      ;;
+    link|lin|li|l)
+      shift; set dummy --mode link ${1+"$@"}; shift
+      ;;
+    uninstall|uninstal|uninsta|uninst|unins|unin|uni|un|u)
+      shift; set dummy --mode uninstall ${1+"$@"}; shift
+      ;;
+    esac
+
+    # Pass back the list of options.
+    func_quote_for_eval ${1+"$@"}
+    libtool_options_prep_result=$func_quote_for_eval_result
+}
+func_add_hook func_options_prep libtool_options_prep
 
 
-# Parse options once, thoroughly.  This comes as soon as possible in the
-# script to make things like `--version' happen as quickly as we can.
+# libtool_parse_options [ARG]...
+# ------------------------------
+# Provide handling for libtool specific options.
+libtool_parse_options ()
 {
-  # this just eases exit handling
-  while test $# -gt 0; do
-    opt="$1"
-    shift
-    case $opt in
-      --debug|-x)	opt_debug='set -x'
-			func_echo "enabling shell trace mode"
-			$opt_debug
-			;;
-      --dry-run|--dryrun|-n)
-			opt_dry_run=:
-			;;
-      --config)
-			opt_config=:
-func_config
-			;;
-      --dlopen|-dlopen)
-			optarg="$1"
-			opt_dlopen="${opt_dlopen+$opt_dlopen
-}$optarg"
-			shift
-			;;
-      --preserve-dup-deps)
-			opt_preserve_dup_deps=:
-			;;
-      --features)
-			opt_features=:
-func_features
-			;;
-      --finish)
-			opt_finish=:
-set dummy --mode finish ${1+"$@"}; shift
-			;;
-      --help)
-			opt_help=:
-			;;
-      --help-all)
-			opt_help_all=:
-opt_help=': help-all'
-			;;
-      --mode)
-			test $# = 0 && func_missing_arg $opt && break
-			optarg="$1"
-			opt_mode="$optarg"
-case $optarg in
-  # Valid mode arguments:
-  clean|compile|execute|finish|install|link|relink|uninstall) ;;
-
-  # Catch anything else as an error
-  *) func_error "invalid argument for $opt"
-     exit_cmd=exit
-     break
-     ;;
-esac
-			shift
-			;;
-      --no-silent|--no-quiet)
-			opt_silent=false
-func_append preserve_args " $opt"
-			;;
-      --no-warning|--no-warn)
-			opt_warning=false
-func_append preserve_args " $opt"
-			;;
-      --no-verbose)
-			opt_verbose=false
-func_append preserve_args " $opt"
-			;;
-      --silent|--quiet)
-			opt_silent=:
-func_append preserve_args " $opt"
-        opt_verbose=false
-			;;
-      --verbose|-v)
-			opt_verbose=:
-func_append preserve_args " $opt"
-opt_silent=false
-			;;
-      --tag)
-			test $# = 0 && func_missing_arg $opt && break
-			optarg="$1"
-			opt_tag="$optarg"
-func_append preserve_args " $opt $optarg"
-func_enable_tag "$optarg"
-			shift
-			;;
-
-      -\?|-h)		func_usage				;;
-      --help)		func_help				;;
-      --version)	func_version				;;
-
-      # Separate optargs to long options:
-      --*=*)
-			func_split_long_opt "$opt"
-			set dummy "$func_split_long_opt_name" "$func_split_long_opt_arg" ${1+"$@"}
-			shift
-			;;
-
-      # Separate non-argument short options:
-      -\?*|-h*|-n*|-v*)
-			func_split_short_opt "$opt"
-			set dummy "$func_split_short_opt_name" "-$func_split_short_opt_arg" ${1+"$@"}
-			shift
-			;;
-
-      --)		break					;;
-      -*)		func_fatal_help "unrecognized option \`$opt'" ;;
-      *)		set dummy "$opt" ${1+"$@"};	shift; break  ;;
-    esac
-  done
+    $debug_cmd
 
-  # Validate options:
+    # Perform our own loop to consume as many options as possible in
+    # each iteration.
+    while test $# -gt 0; do
+      _G_opt=$1
+      shift
+      case $_G_opt in
+        --dry-run|--dryrun|-n)
+                        opt_dry_run=:
+                        ;;
+
+        --config)       func_config ;;
+
+        --dlopen|-dlopen)
+                        opt_dlopen="${opt_dlopen+$opt_dlopen
+}$1"
+                        shift
+                        ;;
+
+        --preserve-dup-deps)
+                        opt_preserve_dup_deps=: ;;
+
+        --features)     func_features ;;
+
+        --finish)       set dummy --mode finish ${1+"$@"}; shift ;;
+
+        --help)         opt_help=: ;;
+
+        --help-all)     opt_help=': help-all' ;;
+
+        --mode)         test $# = 0 && func_missing_arg $_G_opt && break
+                        opt_mode=$1
+                        case $1 in
+                          # Valid mode arguments:
+                          clean|compile|execute|finish|install|link|relink|uninstall) ;;
+
+                          # Catch anything else as an error
+                          *) func_error "invalid argument for $_G_opt"
+                             exit_cmd=exit
+                             break
+                             ;;
+                        esac
+                        shift
+                        ;;
+
+        --no-silent|--no-quiet)
+                        opt_quiet=false
+                        func_append preserve_args " $_G_opt"
+                        ;;
+
+        --no-warnings|--no-warning|--no-warn)
+                        opt_warning=false
+                        func_append preserve_args " $_G_opt"
+                        ;;
+
+        --no-verbose)
+                        opt_verbose=false
+                        func_append preserve_args " $_G_opt"
+                        ;;
+
+        --silent|--quiet)
+                        opt_quiet=:
+                        opt_verbose=false
+                        func_append preserve_args " $_G_opt"
+                        ;;
+
+        --tag)          test $# = 0 && func_missing_arg $_G_opt && break
+                        opt_tag=$1
+                        func_append preserve_args " $_G_opt $1"
+                        func_enable_tag "$1"
+                        shift
+                        ;;
+
+        --verbose|-v)   opt_quiet=false
+                        opt_verbose=:
+                        func_append preserve_args " $_G_opt"
+                        ;;
+
+	# An option not handled by this hook function:
+        *)		set dummy "$_G_opt" ${1+"$@"};	shift; break  ;;
+      esac
+    done
 
-  # save first non-option argument
-  if test "$#" -gt 0; then
-    nonopt="$opt"
-    shift
-  fi
 
-  # preserve --debug
-  test "$opt_debug" = : || func_append preserve_args " --debug"
+    # save modified positional parameters for caller
+    func_quote_for_eval ${1+"$@"}
+    libtool_parse_options_result=$func_quote_for_eval_result
+}
+func_add_hook func_parse_options libtool_parse_options
 
-  case $host in
-    *cygwin* | *mingw* | *pw32* | *cegcc*)
-      # don't eliminate duplications in $postdeps and $predeps
-      opt_duplicate_compiler_generated_deps=:
-      ;;
-    *)
-      opt_duplicate_compiler_generated_deps=$opt_preserve_dup_deps
-      ;;
-  esac
 
-  $opt_help || {
-    # Sanity checks first:
-    func_check_version_match
 
-    if test "$build_libtool_libs" != yes && test "$build_old_libs" != yes; then
-      func_fatal_configuration "not configured to build any kind of library"
+# libtool_validate_options [ARG]...
+# ---------------------------------
+# Perform any sanity checks on option settings and/or unconsumed
+# arguments.
+libtool_validate_options ()
+{
+    # save first non-option argument
+    if test 0 -lt $#; then
+      nonopt=$1
+      shift
     fi
 
-    # Darwin sucks
-    eval std_shrext=\"$shrext_cmds\"
+    # preserve --debug
+    test : = "$debug_cmd" || func_append preserve_args " --debug"
 
-    # Only execute mode is allowed to have -dlopen flags.
-    if test -n "$opt_dlopen" && test "$opt_mode" != execute; then
-      func_error "unrecognized option \`-dlopen'"
-      $ECHO "$help" 1>&2
-      exit $EXIT_FAILURE
-    fi
+    case $host in
+      # Solaris2 added to fix http://debbugs.gnu.org/cgi/bugreport.cgi?bug=16452
+      # see also: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59788
+      *cygwin* | *mingw* | *pw32* | *cegcc* | *solaris2* | *os2*)
+        # don't eliminate duplications in $postdeps and $predeps
+        opt_duplicate_compiler_generated_deps=:
+        ;;
+      *)
+        opt_duplicate_compiler_generated_deps=$opt_preserve_dup_deps
+        ;;
+    esac
 
-    # Change the help message to a mode-specific one.
-    generic_help="$help"
-    help="Try \`$progname --help --mode=$opt_mode' for more information."
-  }
+    $opt_help || {
+      # Sanity checks first:
+      func_check_version_match
+
+      test yes != "$build_libtool_libs" \
+        && test yes != "$build_old_libs" \
+        && func_fatal_configuration "not configured to build any kind of library"
+
+      # Darwin sucks
+      eval std_shrext=\"$shrext_cmds\"
+
+      # Only execute mode is allowed to have -dlopen flags.
+      if test -n "$opt_dlopen" && test execute != "$opt_mode"; then
+        func_error "unrecognized option '-dlopen'"
+        $ECHO "$help" 1>&2
+        exit $EXIT_FAILURE
+      fi
 
+      # Change the help message to a mode-specific one.
+      generic_help=$help
+      help="Try '$progname --help --mode=$opt_mode' for more information."
+    }
 
-  # Bail if the options were screwed
-  $exit_cmd $EXIT_FAILURE
+    # Pass back the unparsed argument list
+    func_quote_for_eval ${1+"$@"}
+    libtool_validate_options_result=$func_quote_for_eval_result
 }
+func_add_hook func_validate_options libtool_validate_options
 
 
+# Process options as early as possible so that --help and --version
+# can return quickly.
+func_options ${1+"$@"}
+eval set dummy "$func_options_result"; shift
+
 
 
 ## ----------- ##
 ##    Main.    ##
 ## ----------- ##
 
+magic='%%%MAGIC variable%%%'
+magic_exe='%%%MAGIC EXE variable%%%'
+
+# Global variables.
+extracted_archives=
+extracted_serial=0
+
+# If this variable is set in any of the actions, the command in it
+# will be execed at the end.  This prevents here-documents from being
+# left over by shells.
+exec_cmd=
+
+
+# A function that is used when there is no print builtin or printf.
+func_fallback_echo ()
+{
+  eval 'cat <<_LTECHO_EOF
+$1
+_LTECHO_EOF'
+}
+
+# func_generated_by_libtool_p
+# True iff stdin has been generated by Libtool. This function is only
+# a basic sanity check; it will hardly flush out determined imposters.
+func_generated_by_libtool_p ()
+{
+  $GREP "^# Generated by .*$PACKAGE" > /dev/null 2>&1
+}
+
 # func_lalib_p file
-# True iff FILE is a libtool `.la' library or `.lo' object file.
+# True iff FILE is a libtool '.la' library or '.lo' object file.
 # This function is only a basic sanity check; it will hardly flush out
 # determined imposters.
 func_lalib_p ()
 {
     test -f "$1" &&
-      $SED -e 4q "$1" 2>/dev/null \
-        | $GREP "^# Generated by .*$PACKAGE" > /dev/null 2>&1
+      $SED -e 4q "$1" 2>/dev/null | func_generated_by_libtool_p
 }
 
 # func_lalib_unsafe_p file
-# True iff FILE is a libtool `.la' library or `.lo' object file.
+# True iff FILE is a libtool '.la' library or '.lo' object file.
 # This function implements the same check as func_lalib_p without
 # resorting to external programs.  To this end, it redirects stdin and
 # closes it afterwards, without saving the original file descriptor.
 # As a safety measure, use it only where a negative result would be
-# fatal anyway.  Works if `file' does not exist.
+# fatal anyway.  Works if 'file' does not exist.
 func_lalib_unsafe_p ()
 {
     lalib_p=no
@@ -1249,13 +2521,13 @@ func_lalib_unsafe_p ()
 	for lalib_p_l in 1 2 3 4
 	do
 	    read lalib_p_line
-	    case "$lalib_p_line" in
+	    case $lalib_p_line in
 		\#\ Generated\ by\ *$PACKAGE* ) lalib_p=yes; break;;
 	    esac
 	done
 	exec 0<&5 5<&-
     fi
-    test "$lalib_p" = yes
+    test yes = "$lalib_p"
 }
 
 # func_ltwrapper_script_p file
@@ -1264,7 +2536,8 @@ func_lalib_unsafe_p ()
 # determined imposters.
 func_ltwrapper_script_p ()
 {
-    func_lalib_p "$1"
+    test -f "$1" &&
+      $lt_truncate_bin < "$1" 2>/dev/null | func_generated_by_libtool_p
 }
 
 # func_ltwrapper_executable_p file
@@ -1289,7 +2562,7 @@ func_ltwrapper_scriptname ()
 {
     func_dirname_and_basename "$1" "" "."
     func_stripname '' '.exe' "$func_basename_result"
-    func_ltwrapper_scriptname_result="$func_dirname_result/$objdir/${func_stripname_result}_ltshwrapper"
+    func_ltwrapper_scriptname_result=$func_dirname_result/$objdir/${func_stripname_result}_ltshwrapper
 }
 
 # func_ltwrapper_p file
@@ -1308,11 +2581,13 @@ func_ltwrapper_p ()
 # FAIL_CMD may read-access the current command in variable CMD!
 func_execute_cmds ()
 {
-    $opt_debug
+    $debug_cmd
+
     save_ifs=$IFS; IFS='~'
     for cmd in $1; do
-      IFS=$save_ifs
+      IFS=$sp$nl
       eval cmd=\"$cmd\"
+      IFS=$save_ifs
       func_show_eval "$cmd" "${2-:}"
     done
     IFS=$save_ifs
@@ -1324,10 +2599,11 @@ func_execute_cmds ()
 # Note that it is not necessary on cygwin/mingw to append a dot to
 # FILE even if both FILE and FILE.exe exist: automatic-append-.exe
 # behavior happens only for exec(3), not for open(2)!  Also, sourcing
-# `FILE.' does not work on cygwin managed mounts.
+# 'FILE.' does not work on cygwin managed mounts.
 func_source ()
 {
-    $opt_debug
+    $debug_cmd
+
     case $1 in
     */* | *\\*)	. "$1" ;;
     *)		. "./$1" ;;
@@ -1354,10 +2630,10 @@ func_resolve_sysroot ()
 # store the result into func_replace_sysroot_result.
 func_replace_sysroot ()
 {
-  case "$lt_sysroot:$1" in
+  case $lt_sysroot:$1 in
   ?*:"$lt_sysroot"*)
     func_stripname "$lt_sysroot" '' "$1"
-    func_replace_sysroot_result="=$func_stripname_result"
+    func_replace_sysroot_result='='$func_stripname_result
     ;;
   *)
     # Including no sysroot.
@@ -1374,7 +2650,8 @@ func_replace_sysroot ()
 # arg is usually of the form 'gcc ...'
 func_infer_tag ()
 {
-    $opt_debug
+    $debug_cmd
+
     if test -n "$available_tags" && test -z "$tagname"; then
       CC_quoted=
       for arg in $CC; do
@@ -1393,7 +2670,7 @@ func_infer_tag ()
 	for z in $available_tags; do
 	  if $GREP "^# ### BEGIN LIBTOOL TAG CONFIG: $z$" < "$progpath" > /dev/null; then
 	    # Evaluate the configuration.
-	    eval "`${SED} -n -e '/^# ### BEGIN LIBTOOL TAG CONFIG: '$z'$/,/^# ### END LIBTOOL TAG CONFIG: '$z'$/p' < $progpath`"
+	    eval "`$SED -n -e '/^# ### BEGIN LIBTOOL TAG CONFIG: '$z'$/,/^# ### END LIBTOOL TAG CONFIG: '$z'$/p' < $progpath`"
 	    CC_quoted=
 	    for arg in $CC; do
 	      # Double-quote args containing other shell metacharacters.
@@ -1418,7 +2695,7 @@ func_infer_tag ()
 	# line option must be used.
 	if test -z "$tagname"; then
 	  func_echo "unable to infer tagged configuration"
-	  func_fatal_error "specify a tag with \`--tag'"
+	  func_fatal_error "specify a tag with '--tag'"
 #	else
 #	  func_verbose "using $tagname tagged configuration"
 	fi
@@ -1434,15 +2711,15 @@ func_infer_tag ()
 # but don't create it if we're doing a dry run.
 func_write_libtool_object ()
 {
-    write_libobj=${1}
-    if test "$build_libtool_libs" = yes; then
-      write_lobj=\'${2}\'
+    write_libobj=$1
+    if test yes = "$build_libtool_libs"; then
+      write_lobj=\'$2\'
     else
       write_lobj=none
     fi
 
-    if test "$build_old_libs" = yes; then
-      write_oldobj=\'${3}\'
+    if test yes = "$build_old_libs"; then
+      write_oldobj=\'$3\'
     else
       write_oldobj=none
     fi
@@ -1450,7 +2727,7 @@ func_write_libtool_object ()
     $opt_dry_run || {
       cat >${write_libobj}T <<EOF
 # $write_libobj - a libtool object file
-# Generated by $PROGRAM (GNU $PACKAGE$TIMESTAMP) $VERSION
+# Generated by $PROGRAM (GNU $PACKAGE) $VERSION
 #
 # Please DO NOT delete this file!
 # It is necessary for linking the library.
@@ -1462,7 +2739,7 @@ pic_object=$write_lobj
 non_pic_object=$write_oldobj
 
 EOF
-      $MV "${write_libobj}T" "${write_libobj}"
+      $MV "${write_libobj}T" "$write_libobj"
     }
 }
 
@@ -1482,8 +2759,9 @@ EOF
 # be empty on error (or when ARG is empty)
 func_convert_core_file_wine_to_w32 ()
 {
-  $opt_debug
-  func_convert_core_file_wine_to_w32_result="$1"
+  $debug_cmd
+
+  func_convert_core_file_wine_to_w32_result=$1
   if test -n "$1"; then
     # Unfortunately, winepath does not exit with a non-zero error code, so we
     # are forced to check the contents of stdout. On the other hand, if the
@@ -1491,9 +2769,9 @@ func_convert_core_file_wine_to_w32 ()
     # *an error message* to stdout. So we must check for both error code of
     # zero AND non-empty stdout, which explains the odd construction:
     func_convert_core_file_wine_to_w32_tmp=`winepath -w "$1" 2>/dev/null`
-    if test "$?" -eq 0 && test -n "${func_convert_core_file_wine_to_w32_tmp}"; then
+    if test "$?" -eq 0 && test -n "$func_convert_core_file_wine_to_w32_tmp"; then
       func_convert_core_file_wine_to_w32_result=`$ECHO "$func_convert_core_file_wine_to_w32_tmp" |
-        $SED -e "$lt_sed_naive_backslashify"`
+        $SED -e "$sed_naive_backslashify"`
     else
       func_convert_core_file_wine_to_w32_result=
     fi
@@ -1514,18 +2792,19 @@ func_convert_core_file_wine_to_w32 ()
 # are convertible, then the result may be empty.
 func_convert_core_path_wine_to_w32 ()
 {
-  $opt_debug
+  $debug_cmd
+
   # unfortunately, winepath doesn't convert paths, only file names
-  func_convert_core_path_wine_to_w32_result=""
+  func_convert_core_path_wine_to_w32_result=
   if test -n "$1"; then
     oldIFS=$IFS
     IFS=:
     for func_convert_core_path_wine_to_w32_f in $1; do
       IFS=$oldIFS
       func_convert_core_file_wine_to_w32 "$func_convert_core_path_wine_to_w32_f"
-      if test -n "$func_convert_core_file_wine_to_w32_result" ; then
+      if test -n "$func_convert_core_file_wine_to_w32_result"; then
         if test -z "$func_convert_core_path_wine_to_w32_result"; then
-          func_convert_core_path_wine_to_w32_result="$func_convert_core_file_wine_to_w32_result"
+          func_convert_core_path_wine_to_w32_result=$func_convert_core_file_wine_to_w32_result
         else
           func_append func_convert_core_path_wine_to_w32_result ";$func_convert_core_file_wine_to_w32_result"
         fi
@@ -1554,7 +2833,8 @@ func_convert_core_path_wine_to_w32 ()
 # environment variable; do not put it in $PATH.
 func_cygpath ()
 {
-  $opt_debug
+  $debug_cmd
+
   if test -n "$LT_CYGPATH" && test -f "$LT_CYGPATH"; then
     func_cygpath_result=`$LT_CYGPATH "$@" 2>/dev/null`
     if test "$?" -ne 0; then
@@ -1563,7 +2843,7 @@ func_cygpath ()
     fi
   else
     func_cygpath_result=
-    func_error "LT_CYGPATH is empty or specifies non-existent file: \`$LT_CYGPATH'"
+    func_error "LT_CYGPATH is empty or specifies non-existent file: '$LT_CYGPATH'"
   fi
 }
 #end: func_cygpath
@@ -1574,10 +2854,11 @@ func_cygpath ()
 # result in func_convert_core_msys_to_w32_result.
 func_convert_core_msys_to_w32 ()
 {
-  $opt_debug
+  $debug_cmd
+
   # awkward: cmd appends spaces to result
   func_convert_core_msys_to_w32_result=`( cmd //c echo "$1" ) 2>/dev/null |
-    $SED -e 's/[ ]*$//' -e "$lt_sed_naive_backslashify"`
+    $SED -e 's/[ ]*$//' -e "$sed_naive_backslashify"`
 }
 #end: func_convert_core_msys_to_w32
 
@@ -1588,13 +2869,14 @@ func_convert_core_msys_to_w32 ()
 # func_to_host_file_result to ARG1).
 func_convert_file_check ()
 {
-  $opt_debug
-  if test -z "$2" && test -n "$1" ; then
+  $debug_cmd
+
+  if test -z "$2" && test -n "$1"; then
     func_error "Could not determine host file name corresponding to"
-    func_error "  \`$1'"
+    func_error "  '$1'"
     func_error "Continuing, but uninstalled executables may not work."
     # Fallback:
-    func_to_host_file_result="$1"
+    func_to_host_file_result=$1
   fi
 }
 # end func_convert_file_check
@@ -1606,10 +2888,11 @@ func_convert_file_check ()
 # func_to_host_file_result to a simplistic fallback value (see below).
 func_convert_path_check ()
 {
-  $opt_debug
+  $debug_cmd
+
   if test -z "$4" && test -n "$3"; then
     func_error "Could not determine the host path corresponding to"
-    func_error "  \`$3'"
+    func_error "  '$3'"
     func_error "Continuing, but uninstalled executables may not work."
     # Fallback.  This is a deliberately simplistic "conversion" and
     # should not be "improved".  See libtool.info.
@@ -1618,7 +2901,7 @@ func_convert_path_check ()
       func_to_host_path_result=`echo "$3" |
         $SED -e "$lt_replace_pathsep_chars"`
     else
-      func_to_host_path_result="$3"
+      func_to_host_path_result=$3
     fi
   fi
 }
@@ -1630,9 +2913,10 @@ func_convert_path_check ()
 # and appending REPL if ORIG matches BACKPAT.
 func_convert_path_front_back_pathsep ()
 {
-  $opt_debug
+  $debug_cmd
+
   case $4 in
-  $1 ) func_to_host_path_result="$3$func_to_host_path_result"
+  $1 ) func_to_host_path_result=$3$func_to_host_path_result
     ;;
   esac
   case $4 in
@@ -1646,7 +2930,7 @@ func_convert_path_front_back_pathsep ()
 ##################################################
 # $build to $host FILE NAME CONVERSION FUNCTIONS #
 ##################################################
-# invoked via `$to_host_file_cmd ARG'
+# invoked via '$to_host_file_cmd ARG'
 #
 # In each case, ARG is the path to be converted from $build to $host format.
 # Result will be available in $func_to_host_file_result.
@@ -1657,7 +2941,8 @@ func_convert_path_front_back_pathsep ()
 # in func_to_host_file_result.
 func_to_host_file ()
 {
-  $opt_debug
+  $debug_cmd
+
   $to_host_file_cmd "$1"
 }
 # end func_to_host_file
@@ -1669,7 +2954,8 @@ func_to_host_file ()
 # in (the comma separated) LAZY, no conversion takes place.
 func_to_tool_file ()
 {
-  $opt_debug
+  $debug_cmd
+
   case ,$2, in
     *,"$to_tool_file_cmd",*)
       func_to_tool_file_result=$1
@@ -1687,7 +2973,7 @@ func_to_tool_file ()
 # Copy ARG to func_to_host_file_result.
 func_convert_file_noop ()
 {
-  func_to_host_file_result="$1"
+  func_to_host_file_result=$1
 }
 # end func_convert_file_noop
 
@@ -1698,11 +2984,12 @@ func_convert_file_noop ()
 # func_to_host_file_result.
 func_convert_file_msys_to_w32 ()
 {
-  $opt_debug
-  func_to_host_file_result="$1"
+  $debug_cmd
+
+  func_to_host_file_result=$1
   if test -n "$1"; then
     func_convert_core_msys_to_w32 "$1"
-    func_to_host_file_result="$func_convert_core_msys_to_w32_result"
+    func_to_host_file_result=$func_convert_core_msys_to_w32_result
   fi
   func_convert_file_check "$1" "$func_to_host_file_result"
 }
@@ -1714,8 +3001,9 @@ func_convert_file_msys_to_w32 ()
 # func_to_host_file_result.
 func_convert_file_cygwin_to_w32 ()
 {
-  $opt_debug
-  func_to_host_file_result="$1"
+  $debug_cmd
+
+  func_to_host_file_result=$1
   if test -n "$1"; then
     # because $build is cygwin, we call "the" cygpath in $PATH; no need to use
     # LT_CYGPATH in this case.
@@ -1731,11 +3019,12 @@ func_convert_file_cygwin_to_w32 ()
 # and a working winepath. Returns result in func_to_host_file_result.
 func_convert_file_nix_to_w32 ()
 {
-  $opt_debug
-  func_to_host_file_result="$1"
+  $debug_cmd
+
+  func_to_host_file_result=$1
   if test -n "$1"; then
     func_convert_core_file_wine_to_w32 "$1"
-    func_to_host_file_result="$func_convert_core_file_wine_to_w32_result"
+    func_to_host_file_result=$func_convert_core_file_wine_to_w32_result
   fi
   func_convert_file_check "$1" "$func_to_host_file_result"
 }
@@ -1747,12 +3036,13 @@ func_convert_file_nix_to_w32 ()
 # Returns result in func_to_host_file_result.
 func_convert_file_msys_to_cygwin ()
 {
-  $opt_debug
-  func_to_host_file_result="$1"
+  $debug_cmd
+
+  func_to_host_file_result=$1
   if test -n "$1"; then
     func_convert_core_msys_to_w32 "$1"
     func_cygpath -u "$func_convert_core_msys_to_w32_result"
-    func_to_host_file_result="$func_cygpath_result"
+    func_to_host_file_result=$func_cygpath_result
   fi
   func_convert_file_check "$1" "$func_to_host_file_result"
 }
@@ -1765,13 +3055,14 @@ func_convert_file_msys_to_cygwin ()
 # in func_to_host_file_result.
 func_convert_file_nix_to_cygwin ()
 {
-  $opt_debug
-  func_to_host_file_result="$1"
+  $debug_cmd
+
+  func_to_host_file_result=$1
   if test -n "$1"; then
     # convert from *nix to w32, then use cygpath to convert from w32 to cygwin.
     func_convert_core_file_wine_to_w32 "$1"
     func_cygpath -u "$func_convert_core_file_wine_to_w32_result"
-    func_to_host_file_result="$func_cygpath_result"
+    func_to_host_file_result=$func_cygpath_result
   fi
   func_convert_file_check "$1" "$func_to_host_file_result"
 }
@@ -1781,7 +3072,7 @@ func_convert_file_nix_to_cygwin ()
 #############################################
 # $build to $host PATH CONVERSION FUNCTIONS #
 #############################################
-# invoked via `$to_host_path_cmd ARG'
+# invoked via '$to_host_path_cmd ARG'
 #
 # In each case, ARG is the path to be converted from $build to $host format.
 # The result will be available in $func_to_host_path_result.
@@ -1805,10 +3096,11 @@ func_convert_file_nix_to_cygwin ()
 to_host_path_cmd=
 func_init_to_host_path_cmd ()
 {
-  $opt_debug
+  $debug_cmd
+
   if test -z "$to_host_path_cmd"; then
     func_stripname 'func_convert_file_' '' "$to_host_file_cmd"
-    to_host_path_cmd="func_convert_path_${func_stripname_result}"
+    to_host_path_cmd=func_convert_path_$func_stripname_result
   fi
 }
 
@@ -1818,7 +3110,8 @@ func_init_to_host_path_cmd ()
 # in func_to_host_path_result.
 func_to_host_path ()
 {
-  $opt_debug
+  $debug_cmd
+
   func_init_to_host_path_cmd
   $to_host_path_cmd "$1"
 }
@@ -1829,7 +3122,7 @@ func_to_host_path ()
 # Copy ARG to func_to_host_path_result.
 func_convert_path_noop ()
 {
-  func_to_host_path_result="$1"
+  func_to_host_path_result=$1
 }
 # end func_convert_path_noop
 
@@ -1840,8 +3133,9 @@ func_convert_path_noop ()
 # func_to_host_path_result.
 func_convert_path_msys_to_w32 ()
 {
-  $opt_debug
-  func_to_host_path_result="$1"
+  $debug_cmd
+
+  func_to_host_path_result=$1
   if test -n "$1"; then
     # Remove leading and trailing path separator characters from ARG.  MSYS
     # behavior is inconsistent here; cygpath turns them into '.;' and ';.';
@@ -1849,7 +3143,7 @@ func_convert_path_msys_to_w32 ()
     func_stripname : : "$1"
     func_to_host_path_tmp1=$func_stripname_result
     func_convert_core_msys_to_w32 "$func_to_host_path_tmp1"
-    func_to_host_path_result="$func_convert_core_msys_to_w32_result"
+    func_to_host_path_result=$func_convert_core_msys_to_w32_result
     func_convert_path_check : ";" \
       "$func_to_host_path_tmp1" "$func_to_host_path_result"
     func_convert_path_front_back_pathsep ":*" "*:" ";" "$1"
@@ -1863,8 +3157,9 @@ func_convert_path_msys_to_w32 ()
 # func_to_host_file_result.
 func_convert_path_cygwin_to_w32 ()
 {
-  $opt_debug
-  func_to_host_path_result="$1"
+  $debug_cmd
+
+  func_to_host_path_result=$1
   if test -n "$1"; then
     # See func_convert_path_msys_to_w32:
     func_stripname : : "$1"
@@ -1883,14 +3178,15 @@ func_convert_path_cygwin_to_w32 ()
 # a working winepath.  Returns result in func_to_host_file_result.
 func_convert_path_nix_to_w32 ()
 {
-  $opt_debug
-  func_to_host_path_result="$1"
+  $debug_cmd
+
+  func_to_host_path_result=$1
   if test -n "$1"; then
     # See func_convert_path_msys_to_w32:
     func_stripname : : "$1"
     func_to_host_path_tmp1=$func_stripname_result
     func_convert_core_path_wine_to_w32 "$func_to_host_path_tmp1"
-    func_to_host_path_result="$func_convert_core_path_wine_to_w32_result"
+    func_to_host_path_result=$func_convert_core_path_wine_to_w32_result
     func_convert_path_check : ";" \
       "$func_to_host_path_tmp1" "$func_to_host_path_result"
     func_convert_path_front_back_pathsep ":*" "*:" ";" "$1"
@@ -1904,15 +3200,16 @@ func_convert_path_nix_to_w32 ()
 # Returns result in func_to_host_file_result.
 func_convert_path_msys_to_cygwin ()
 {
-  $opt_debug
-  func_to_host_path_result="$1"
+  $debug_cmd
+
+  func_to_host_path_result=$1
   if test -n "$1"; then
     # See func_convert_path_msys_to_w32:
     func_stripname : : "$1"
     func_to_host_path_tmp1=$func_stripname_result
     func_convert_core_msys_to_w32 "$func_to_host_path_tmp1"
     func_cygpath -u -p "$func_convert_core_msys_to_w32_result"
-    func_to_host_path_result="$func_cygpath_result"
+    func_to_host_path_result=$func_cygpath_result
     func_convert_path_check : : \
       "$func_to_host_path_tmp1" "$func_to_host_path_result"
     func_convert_path_front_back_pathsep ":*" "*:" : "$1"
@@ -1927,8 +3224,9 @@ func_convert_path_msys_to_cygwin ()
 # func_to_host_file_result.
 func_convert_path_nix_to_cygwin ()
 {
-  $opt_debug
-  func_to_host_path_result="$1"
+  $debug_cmd
+
+  func_to_host_path_result=$1
   if test -n "$1"; then
     # Remove leading and trailing path separator characters from
     # ARG. msys behavior is inconsistent here, cygpath turns them
@@ -1937,7 +3235,7 @@ func_convert_path_nix_to_cygwin ()
     func_to_host_path_tmp1=$func_stripname_result
     func_convert_core_path_wine_to_w32 "$func_to_host_path_tmp1"
     func_cygpath -u -p "$func_convert_core_path_wine_to_w32_result"
-    func_to_host_path_result="$func_cygpath_result"
+    func_to_host_path_result=$func_cygpath_result
     func_convert_path_check : : \
       "$func_to_host_path_tmp1" "$func_to_host_path_result"
     func_convert_path_front_back_pathsep ":*" "*:" : "$1"
@@ -1946,13 +3244,31 @@ func_convert_path_nix_to_cygwin ()
 # end func_convert_path_nix_to_cygwin
 
 
+# func_dll_def_p FILE
+# True iff FILE is a Windows DLL '.def' file.
+# Keep in sync with _LT_DLL_DEF_P in libtool.m4
+func_dll_def_p ()
+{
+  $debug_cmd
+
+  func_dll_def_p_tmp=`$SED -n \
+    -e 's/^[	 ]*//' \
+    -e '/^\(;.*\)*$/d' \
+    -e 's/^\(EXPORTS\|LIBRARY\)\([	 ].*\)*$/DEF/p' \
+    -e q \
+    "$1"`
+  test DEF = "$func_dll_def_p_tmp"
+}
+
+
 # func_mode_compile arg...
 func_mode_compile ()
 {
-    $opt_debug
+    $debug_cmd
+
     # Get the compilation command and the source file.
     base_compile=
-    srcfile="$nonopt"  #  always keep a non-empty value in "srcfile"
+    srcfile=$nonopt  #  always keep a non-empty value in "srcfile"
     suppress_opt=yes
     suppress_output=
     arg_mode=normal
@@ -1965,12 +3281,12 @@ func_mode_compile ()
       case $arg_mode in
       arg  )
 	# do not "continue".  Instead, add this to base_compile
-	lastarg="$arg"
+	lastarg=$arg
 	arg_mode=normal
 	;;
 
       target )
-	libobj="$arg"
+	libobj=$arg
 	arg_mode=normal
 	continue
 	;;
@@ -1980,7 +3296,7 @@ func_mode_compile ()
 	case $arg in
 	-o)
 	  test -n "$libobj" && \
-	    func_fatal_error "you cannot specify \`-o' more than once"
+	    func_fatal_error "you cannot specify '-o' more than once"
 	  arg_mode=target
 	  continue
 	  ;;
@@ -2009,12 +3325,12 @@ func_mode_compile ()
 	  func_stripname '-Wc,' '' "$arg"
 	  args=$func_stripname_result
 	  lastarg=
-	  save_ifs="$IFS"; IFS=','
+	  save_ifs=$IFS; IFS=,
 	  for arg in $args; do
-	    IFS="$save_ifs"
+	    IFS=$save_ifs
 	    func_append_quoted lastarg "$arg"
 	  done
-	  IFS="$save_ifs"
+	  IFS=$save_ifs
 	  func_stripname ' ' '' "$lastarg"
 	  lastarg=$func_stripname_result
 
@@ -2027,8 +3343,8 @@ func_mode_compile ()
 	  # Accept the current argument as the source file.
 	  # The previous "srcfile" becomes the current argument.
 	  #
-	  lastarg="$srcfile"
-	  srcfile="$arg"
+	  lastarg=$srcfile
+	  srcfile=$arg
 	  ;;
 	esac  #  case $arg
 	;;
@@ -2043,13 +3359,13 @@ func_mode_compile ()
       func_fatal_error "you must specify an argument for -Xcompile"
       ;;
     target)
-      func_fatal_error "you must specify a target with \`-o'"
+      func_fatal_error "you must specify a target with '-o'"
       ;;
     *)
       # Get the name of the library object.
       test -z "$libobj" && {
 	func_basename "$srcfile"
-	libobj="$func_basename_result"
+	libobj=$func_basename_result
       }
       ;;
     esac
@@ -2069,7 +3385,7 @@ func_mode_compile ()
     case $libobj in
     *.lo) func_lo2o "$libobj"; obj=$func_lo2o_result ;;
     *)
-      func_fatal_error "cannot determine name of library object from \`$libobj'"
+      func_fatal_error "cannot determine name of library object from '$libobj'"
       ;;
     esac
 
@@ -2078,8 +3394,8 @@ func_mode_compile ()
     for arg in $later; do
       case $arg in
       -shared)
-	test "$build_libtool_libs" != yes && \
-	  func_fatal_configuration "can not build a shared library"
+	test yes = "$build_libtool_libs" \
+	  || func_fatal_configuration "cannot build a shared library"
 	build_old_libs=no
 	continue
 	;;
@@ -2105,17 +3421,17 @@ func_mode_compile ()
     func_quote_for_eval "$libobj"
     test "X$libobj" != "X$func_quote_for_eval_result" \
       && $ECHO "X$libobj" | $GREP '[]~#^*{};<>?"'"'"'	 &()|`$[]' \
-      && func_warning "libobj name \`$libobj' may not contain shell special characters."
+      && func_warning "libobj name '$libobj' may not contain shell special characters."
     func_dirname_and_basename "$obj" "/" ""
-    objname="$func_basename_result"
-    xdir="$func_dirname_result"
-    lobj=${xdir}$objdir/$objname
+    objname=$func_basename_result
+    xdir=$func_dirname_result
+    lobj=$xdir$objdir/$objname
 
     test -z "$base_compile" && \
       func_fatal_help "you must specify a compilation command"
 
     # Delete any leftover library objects.
-    if test "$build_old_libs" = yes; then
+    if test yes = "$build_old_libs"; then
       removelist="$obj $lobj $libobj ${libobj}T"
     else
       removelist="$lobj $libobj ${libobj}T"
@@ -2127,16 +3443,16 @@ func_mode_compile ()
       pic_mode=default
       ;;
     esac
-    if test "$pic_mode" = no && test "$deplibs_check_method" != pass_all; then
+    if test no = "$pic_mode" && test pass_all != "$deplibs_check_method"; then
       # non-PIC code in shared libraries is not supported
       pic_mode=default
     fi
 
     # Calculate the filename of the output object if compiler does
     # not support -o with -c
-    if test "$compiler_c_o" = no; then
-      output_obj=`$ECHO "$srcfile" | $SED 's%^.*/%%; s%\.[^.]*$%%'`.${objext}
-      lockfile="$output_obj.lock"
+    if test no = "$compiler_c_o"; then
+      output_obj=`$ECHO "$srcfile" | $SED 's%^.*/%%; s%\.[^.]*$%%'`.$objext
+      lockfile=$output_obj.lock
     else
       output_obj=
       need_locks=no
@@ -2145,12 +3461,12 @@ func_mode_compile ()
 
     # Lock this critical section if it is needed
     # We use this script file to make the link, it avoids creating a new file
-    if test "$need_locks" = yes; then
+    if test yes = "$need_locks"; then
       until $opt_dry_run || ln "$progpath" "$lockfile" 2>/dev/null; do
 	func_echo "Waiting for $lockfile to be removed"
 	sleep 2
       done
-    elif test "$need_locks" = warn; then
+    elif test warn = "$need_locks"; then
       if test -f "$lockfile"; then
 	$ECHO "\
 *** ERROR, $lockfile exists and contains:
@@ -2158,7 +3474,7 @@ func_mode_compile ()
 
 This indicates that another process is trying to use the same
 temporary object file, and libtool could not work around it because
-your compiler does not support \`-c' and \`-o' together.  If you
+your compiler does not support '-c' and '-o' together.  If you
 repeat this compilation, it may succeed, by chance, but you had better
 avoid parallel builds (make -j) in this platform, or get a better
 compiler."
@@ -2180,11 +3496,11 @@ compiler."
     qsrcfile=$func_quote_for_eval_result
 
     # Only build a PIC object if we are building libtool libraries.
-    if test "$build_libtool_libs" = yes; then
+    if test yes = "$build_libtool_libs"; then
       # Without this assignment, base_compile gets emptied.
       fbsd_hideous_sh_bug=$base_compile
 
-      if test "$pic_mode" != no; then
+      if test no != "$pic_mode"; then
 	command="$base_compile $qsrcfile $pic_flag"
       else
 	# Don't build PIC code
@@ -2201,7 +3517,7 @@ compiler."
       func_show_eval_locale "$command"	\
           'test -n "$output_obj" && $RM $removelist; exit $EXIT_FAILURE'
 
-      if test "$need_locks" = warn &&
+      if test warn = "$need_locks" &&
 	 test "X`cat $lockfile 2>/dev/null`" != "X$srcfile"; then
 	$ECHO "\
 *** ERROR, $lockfile contains:
@@ -2212,7 +3528,7 @@ $srcfile
 
 This indicates that another process is trying to use the same
 temporary object file, and libtool could not work around it because
-your compiler does not support \`-c' and \`-o' together.  If you
+your compiler does not support '-c' and '-o' together.  If you
 repeat this compilation, it may succeed, by chance, but you had better
 avoid parallel builds (make -j) in this platform, or get a better
 compiler."
@@ -2228,20 +3544,20 @@ compiler."
       fi
 
       # Allow error messages only from the first compilation.
-      if test "$suppress_opt" = yes; then
+      if test yes = "$suppress_opt"; then
 	suppress_output=' >/dev/null 2>&1'
       fi
     fi
 
     # Only build a position-dependent object if we build old libraries.
-    if test "$build_old_libs" = yes; then
-      if test "$pic_mode" != yes; then
+    if test yes = "$build_old_libs"; then
+      if test yes != "$pic_mode"; then
 	# Don't build PIC code
 	command="$base_compile $qsrcfile$pie_flag"
       else
 	command="$base_compile $qsrcfile $pic_flag"
       fi
-      if test "$compiler_c_o" = yes; then
+      if test yes = "$compiler_c_o"; then
 	func_append command " -o $obj"
       fi
 
@@ -2250,7 +3566,7 @@ compiler."
       func_show_eval_locale "$command" \
         '$opt_dry_run || $RM $removelist; exit $EXIT_FAILURE'
 
-      if test "$need_locks" = warn &&
+      if test warn = "$need_locks" &&
 	 test "X`cat $lockfile 2>/dev/null`" != "X$srcfile"; then
 	$ECHO "\
 *** ERROR, $lockfile contains:
@@ -2261,7 +3577,7 @@ $srcfile
 
 This indicates that another process is trying to use the same
 temporary object file, and libtool could not work around it because
-your compiler does not support \`-c' and \`-o' together.  If you
+your compiler does not support '-c' and '-o' together.  If you
 repeat this compilation, it may succeed, by chance, but you had better
 avoid parallel builds (make -j) in this platform, or get a better
 compiler."
@@ -2281,7 +3597,7 @@ compiler."
       func_write_libtool_object "$libobj" "$objdir/$objname" "$objname"
 
       # Unlock the critical section if it was locked
-      if test "$need_locks" != no; then
+      if test no != "$need_locks"; then
 	removelist=$lockfile
         $RM "$lockfile"
       fi
@@ -2291,7 +3607,7 @@ compiler."
 }
 
 $opt_help || {
-  test "$opt_mode" = compile && func_mode_compile ${1+"$@"}
+  test compile = "$opt_mode" && func_mode_compile ${1+"$@"}
 }
 
 func_mode_help ()
@@ -2311,7 +3627,7 @@ func_mode_help ()
 Remove files from the build directory.
 
 RM is the name of the program to use to delete files associated with each FILE
-(typically \`/bin/rm').  RM-OPTIONS are options (such as \`-f') to be passed
+(typically '/bin/rm').  RM-OPTIONS are options (such as '-f') to be passed
 to RM.
 
 If FILE is a libtool library, object or program, all the files associated
@@ -2330,16 +3646,16 @@ This mode accepts the following additional options:
   -no-suppress      do not suppress compiler output for multiple passes
   -prefer-pic       try to build PIC objects only
   -prefer-non-pic   try to build non-PIC objects only
-  -shared           do not build a \`.o' file suitable for static linking
-  -static           only build a \`.o' file suitable for static linking
+  -shared           do not build a '.o' file suitable for static linking
+  -static           only build a '.o' file suitable for static linking
   -Wc,FLAG          pass FLAG directly to the compiler
 
-COMPILE-COMMAND is a command to be used in creating a \`standard' object file
+COMPILE-COMMAND is a command to be used in creating a 'standard' object file
 from the given SOURCEFILE.
 
 The output file name is determined by removing the directory component from
-SOURCEFILE, then substituting the C source code suffix \`.c' with the
-library object suffix, \`.lo'."
+SOURCEFILE, then substituting the C source code suffix '.c' with the
+library object suffix, '.lo'."
         ;;
 
       execute)
@@ -2352,7 +3668,7 @@ This mode accepts the following additional options:
 
   -dlopen FILE      add the directory containing FILE to the library path
 
-This mode sets the library path environment variable according to \`-dlopen'
+This mode sets the library path environment variable according to '-dlopen'
 flags.
 
 If any of the ARGS are libtool executable wrappers, then they are translated
@@ -2371,7 +3687,7 @@ Complete the installation of libtool libraries.
 Each LIBDIR is a directory that contains libtool libraries.
 
 The commands that this mode executes may require superuser privileges.  Use
-the \`--dry-run' option if you just want to see what would be executed."
+the '--dry-run' option if you just want to see what would be executed."
         ;;
 
       install)
@@ -2381,7 +3697,7 @@ the \`--dry-run' option if you just want to see what would be executed."
 Install executables or libraries.
 
 INSTALL-COMMAND is the installation command.  The first component should be
-either the \`install' or \`cp' program.
+either the 'install' or 'cp' program.
 
 The following components of INSTALL-COMMAND are treated specially:
 
@@ -2407,7 +3723,7 @@ The following components of LINK-COMMAND are treated specially:
   -avoid-version    do not add a version suffix if possible
   -bindir BINDIR    specify path to binaries directory (for systems where
                     libraries must be found in the PATH setting at runtime)
-  -dlopen FILE      \`-dlpreopen' FILE if it cannot be dlopened at runtime
+  -dlopen FILE      '-dlpreopen' FILE if it cannot be dlopened at runtime
   -dlpreopen FILE   link in FILE and add its symbols to lt_preloaded_symbols
   -export-dynamic   allow symbols from OUTPUT-FILE to be resolved with dlsym(3)
   -export-symbols SYMFILE
@@ -2421,7 +3737,8 @@ The following components of LINK-COMMAND are treated specially:
   -no-install       link a not-installable executable
   -no-undefined     declare that a library does not refer to external symbols
   -o OUTPUT-FILE    create OUTPUT-FILE from the specified objects
-  -objectlist FILE  Use a list of object files found in FILE to specify objects
+  -objectlist FILE  use a list of object files found in FILE to specify objects
+  -os2dllname NAME  force a short DLL name on OS/2 (no effect on other OSes)
   -precious-files-regex REGEX
                     don't remove output files matching REGEX
   -release RELEASE  specify package release information
@@ -2441,20 +3758,20 @@ The following components of LINK-COMMAND are treated specially:
   -Xlinker FLAG     pass linker-specific FLAG directly to the linker
   -XCClinker FLAG   pass link-specific FLAG to the compiler driver (CC)
 
-All other options (arguments beginning with \`-') are ignored.
+All other options (arguments beginning with '-') are ignored.
 
-Every other argument is treated as a filename.  Files ending in \`.la' are
+Every other argument is treated as a filename.  Files ending in '.la' are
 treated as uninstalled libtool libraries, other files are standard or library
 object files.
 
-If the OUTPUT-FILE ends in \`.la', then a libtool library is created,
-only library objects (\`.lo' files) may be specified, and \`-rpath' is
+If the OUTPUT-FILE ends in '.la', then a libtool library is created,
+only library objects ('.lo' files) may be specified, and '-rpath' is
 required, except when creating a convenience library.
 
-If OUTPUT-FILE ends in \`.a' or \`.lib', then a standard library is created
-using \`ar' and \`ranlib', or on Windows using \`lib'.
+If OUTPUT-FILE ends in '.a' or '.lib', then a standard library is created
+using 'ar' and 'ranlib', or on Windows using 'lib'.
 
-If OUTPUT-FILE ends in \`.lo' or \`.${objext}', then a reloadable object file
+If OUTPUT-FILE ends in '.lo' or '.$objext', then a reloadable object file
 is created, otherwise an executable program is created."
         ;;
 
@@ -2465,7 +3782,7 @@ is created, otherwise an executable program is created."
 Remove libraries from an installation directory.
 
 RM is the name of the program to use to delete files associated with each FILE
-(typically \`/bin/rm').  RM-OPTIONS are options (such as \`-f') to be passed
+(typically '/bin/rm').  RM-OPTIONS are options (such as '-f') to be passed
 to RM.
 
 If FILE is a libtool library, all the files associated with it are deleted.
@@ -2473,17 +3790,17 @@ Otherwise, only FILE itself is deleted using RM."
         ;;
 
       *)
-        func_fatal_help "invalid operation mode \`$opt_mode'"
+        func_fatal_help "invalid operation mode '$opt_mode'"
         ;;
     esac
 
     echo
-    $ECHO "Try \`$progname --help' for more information about other modes."
+    $ECHO "Try '$progname --help' for more information about other modes."
 }
 
 # Now that we've collected a possible --mode arg, show help if necessary
 if $opt_help; then
-  if test "$opt_help" = :; then
+  if test : = "$opt_help"; then
     func_mode_help
   else
     {
@@ -2491,7 +3808,7 @@ if $opt_help; then
       for opt_mode in compile link execute install finish uninstall clean; do
 	func_mode_help
       done
-    } | sed -n '1p; 2,$s/^Usage:/  or: /p'
+    } | $SED -n '1p; 2,$s/^Usage:/  or: /p'
     {
       func_help noexit
       for opt_mode in compile link execute install finish uninstall clean; do
@@ -2499,7 +3816,7 @@ if $opt_help; then
 	func_mode_help
       done
     } |
-    sed '1d
+    $SED '1d
       /^When reporting/,/^Report/{
 	H
 	d
@@ -2516,16 +3833,17 @@ fi
 # func_mode_execute arg...
 func_mode_execute ()
 {
-    $opt_debug
+    $debug_cmd
+
     # The first argument is the command name.
-    cmd="$nonopt"
+    cmd=$nonopt
     test -z "$cmd" && \
       func_fatal_help "you must specify a COMMAND"
 
     # Handle -dlopen flags immediately.
     for file in $opt_dlopen; do
       test -f "$file" \
-	|| func_fatal_help "\`$file' is not a file"
+	|| func_fatal_help "'$file' is not a file"
 
       dir=
       case $file in
@@ -2535,7 +3853,7 @@ func_mode_execute ()
 
 	# Check to see that this really is a libtool archive.
 	func_lalib_unsafe_p "$file" \
-	  || func_fatal_help "\`$lib' is not a valid libtool archive"
+	  || func_fatal_help "'$lib' is not a valid libtool archive"
 
 	# Read the libtool library.
 	dlname=
@@ -2546,18 +3864,18 @@ func_mode_execute ()
 	if test -z "$dlname"; then
 	  # Warn if it was a shared library.
 	  test -n "$library_names" && \
-	    func_warning "\`$file' was not linked with \`-export-dynamic'"
+	    func_warning "'$file' was not linked with '-export-dynamic'"
 	  continue
 	fi
 
 	func_dirname "$file" "" "."
-	dir="$func_dirname_result"
+	dir=$func_dirname_result
 
 	if test -f "$dir/$objdir/$dlname"; then
 	  func_append dir "/$objdir"
 	else
 	  if test ! -f "$dir/$dlname"; then
-	    func_fatal_error "cannot find \`$dlname' in \`$dir' or \`$dir/$objdir'"
+	    func_fatal_error "cannot find '$dlname' in '$dir' or '$dir/$objdir'"
 	  fi
 	fi
 	;;
@@ -2565,18 +3883,18 @@ func_mode_execute ()
       *.lo)
 	# Just add the directory containing the .lo file.
 	func_dirname "$file" "" "."
-	dir="$func_dirname_result"
+	dir=$func_dirname_result
 	;;
 
       *)
-	func_warning "\`-dlopen' is ignored for non-libtool libraries and objects"
+	func_warning "'-dlopen' is ignored for non-libtool libraries and objects"
 	continue
 	;;
       esac
 
       # Get the absolute pathname.
       absdir=`cd "$dir" && pwd`
-      test -n "$absdir" && dir="$absdir"
+      test -n "$absdir" && dir=$absdir
 
       # Now add the directory to shlibpath_var.
       if eval "test -z \"\$$shlibpath_var\""; then
@@ -2588,7 +3906,7 @@ func_mode_execute ()
 
     # This variable tells wrapper scripts just to set shlibpath_var
     # rather than running their programs.
-    libtool_execute_magic="$magic"
+    libtool_execute_magic=$magic
 
     # Check if any of the arguments is a wrapper script.
     args=
@@ -2601,12 +3919,12 @@ func_mode_execute ()
 	if func_ltwrapper_script_p "$file"; then
 	  func_source "$file"
 	  # Transform arg to wrapped name.
-	  file="$progdir/$program"
+	  file=$progdir/$program
 	elif func_ltwrapper_executable_p "$file"; then
 	  func_ltwrapper_scriptname "$file"
 	  func_source "$func_ltwrapper_scriptname_result"
 	  # Transform arg to wrapped name.
-	  file="$progdir/$program"
+	  file=$progdir/$program
 	fi
 	;;
       esac
@@ -2614,7 +3932,15 @@ func_mode_execute ()
       func_append_quoted args "$file"
     done
 
-    if test "X$opt_dry_run" = Xfalse; then
+    if $opt_dry_run; then
+      # Display what would be done.
+      if test -n "$shlibpath_var"; then
+	eval "\$ECHO \"\$shlibpath_var=\$$shlibpath_var\""
+	echo "export $shlibpath_var"
+      fi
+      $ECHO "$cmd$args"
+      exit $EXIT_SUCCESS
+    else
       if test -n "$shlibpath_var"; then
 	# Export the shlibpath_var.
 	eval "export $shlibpath_var"
@@ -2631,25 +3957,18 @@ func_mode_execute ()
       done
 
       # Now prepare to actually exec the command.
-      exec_cmd="\$cmd$args"
-    else
-      # Display what would be done.
-      if test -n "$shlibpath_var"; then
-	eval "\$ECHO \"\$shlibpath_var=\$$shlibpath_var\""
-	echo "export $shlibpath_var"
-      fi
-      $ECHO "$cmd$args"
-      exit $EXIT_SUCCESS
+      exec_cmd=\$cmd$args
     fi
 }
 
-test "$opt_mode" = execute && func_mode_execute ${1+"$@"}
+test execute = "$opt_mode" && func_mode_execute ${1+"$@"}
 
 
 # func_mode_finish arg...
 func_mode_finish ()
 {
-    $opt_debug
+    $debug_cmd
+
     libs=
     libdirs=
     admincmds=
@@ -2663,11 +3982,11 @@ func_mode_finish ()
 	if func_lalib_unsafe_p "$opt"; then
 	  func_append libs " $opt"
 	else
-	  func_warning "\`$opt' is not a valid libtool archive"
+	  func_warning "'$opt' is not a valid libtool archive"
 	fi
 
       else
-	func_fatal_error "invalid argument \`$opt'"
+	func_fatal_error "invalid argument '$opt'"
       fi
     done
 
@@ -2682,12 +4001,12 @@ func_mode_finish ()
       # Remove sysroot references
       if $opt_dry_run; then
         for lib in $libs; do
-          echo "removing references to $lt_sysroot and \`=' prefixes from $lib"
+          echo "removing references to $lt_sysroot and '=' prefixes from $lib"
         done
       else
         tmpdir=`func_mktempdir`
         for lib in $libs; do
-	  sed -e "${sysroot_cmd} s/\([ ']-[LR]\)=/\1/g; s/\([ ']\)=/\1/g" $lib \
+	  $SED -e "$sysroot_cmd s/\([ ']-[LR]\)=/\1/g; s/\([ ']\)=/\1/g" $lib \
 	    > $tmpdir/tmp-la
 	  mv -f $tmpdir/tmp-la $lib
 	done
@@ -2712,7 +4031,7 @@ func_mode_finish ()
     fi
 
     # Exit here if they wanted silent mode.
-    $opt_silent && exit $EXIT_SUCCESS
+    $opt_quiet && exit $EXIT_SUCCESS
 
     if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then
       echo "----------------------------------------------------------------------"
@@ -2723,27 +4042,27 @@ func_mode_finish ()
       echo
       echo "If you ever happen to want to link against installed libraries"
       echo "in a given directory, LIBDIR, you must either use libtool, and"
-      echo "specify the full pathname of the library, or use the \`-LLIBDIR'"
+      echo "specify the full pathname of the library, or use the '-LLIBDIR'"
       echo "flag during linking and do at least one of the following:"
       if test -n "$shlibpath_var"; then
-	echo "   - add LIBDIR to the \`$shlibpath_var' environment variable"
+	echo "   - add LIBDIR to the '$shlibpath_var' environment variable"
 	echo "     during execution"
       fi
       if test -n "$runpath_var"; then
-	echo "   - add LIBDIR to the \`$runpath_var' environment variable"
+	echo "   - add LIBDIR to the '$runpath_var' environment variable"
 	echo "     during linking"
       fi
       if test -n "$hardcode_libdir_flag_spec"; then
 	libdir=LIBDIR
 	eval flag=\"$hardcode_libdir_flag_spec\"
 
-	$ECHO "   - use the \`$flag' linker flag"
+	$ECHO "   - use the '$flag' linker flag"
       fi
       if test -n "$admincmds"; then
 	$ECHO "   - have your system administrator run these commands:$admincmds"
       fi
       if test -f /etc/ld.so.conf; then
-	echo "   - have your system administrator add LIBDIR to \`/etc/ld.so.conf'"
+	echo "   - have your system administrator add LIBDIR to '/etc/ld.so.conf'"
       fi
       echo
 
@@ -2762,18 +4081,20 @@ func_mode_finish ()
     exit $EXIT_SUCCESS
 }
 
-test "$opt_mode" = finish && func_mode_finish ${1+"$@"}
+test finish = "$opt_mode" && func_mode_finish ${1+"$@"}
 
 
 # func_mode_install arg...
 func_mode_install ()
 {
-    $opt_debug
+    $debug_cmd
+
     # There may be an optional sh(1) argument at the beginning of
     # install_prog (especially on Windows NT).
-    if test "$nonopt" = "$SHELL" || test "$nonopt" = /bin/sh ||
+    if test "$SHELL" = "$nonopt" || test /bin/sh = "$nonopt" ||
        # Allow the use of GNU shtool's install command.
-       case $nonopt in *shtool*) :;; *) false;; esac; then
+       case $nonopt in *shtool*) :;; *) false;; esac
+    then
       # Aesthetically quote it.
       func_quote_for_eval "$nonopt"
       install_prog="$func_quote_for_eval_result "
@@ -2800,7 +4121,7 @@ func_mode_install ()
     opts=
     prev=
     install_type=
-    isdir=no
+    isdir=false
     stripme=
     no_mode=:
     for arg
@@ -2813,7 +4134,7 @@ func_mode_install ()
       fi
 
       case $arg in
-      -d) isdir=yes ;;
+      -d) isdir=: ;;
       -f)
 	if $install_cp; then :; else
 	  prev=$arg
@@ -2831,7 +4152,7 @@ func_mode_install ()
       *)
 	# If the previous option needed an argument, then skip it.
 	if test -n "$prev"; then
-	  if test "x$prev" = x-m && test -n "$install_override_mode"; then
+	  if test X-m = "X$prev" && test -n "$install_override_mode"; then
 	    arg2=$install_override_mode
 	    no_mode=false
 	  fi
@@ -2856,7 +4177,7 @@ func_mode_install ()
       func_fatal_help "you must specify an install program"
 
     test -n "$prev" && \
-      func_fatal_help "the \`$prev' option requires an argument"
+      func_fatal_help "the '$prev' option requires an argument"
 
     if test -n "$install_override_mode" && $no_mode; then
       if $install_cp; then :; else
@@ -2878,19 +4199,19 @@ func_mode_install ()
     dest=$func_stripname_result
 
     # Check to see that the destination is a directory.
-    test -d "$dest" && isdir=yes
-    if test "$isdir" = yes; then
-      destdir="$dest"
+    test -d "$dest" && isdir=:
+    if $isdir; then
+      destdir=$dest
       destname=
     else
       func_dirname_and_basename "$dest" "" "."
-      destdir="$func_dirname_result"
-      destname="$func_basename_result"
+      destdir=$func_dirname_result
+      destname=$func_basename_result
 
       # Not a directory, so check to see that there is only one file specified.
       set dummy $files; shift
       test "$#" -gt 1 && \
-	func_fatal_help "\`$dest' is not a directory"
+	func_fatal_help "'$dest' is not a directory"
     fi
     case $destdir in
     [\\/]* | [A-Za-z]:[\\/]*) ;;
@@ -2899,7 +4220,7 @@ func_mode_install ()
 	case $file in
 	*.lo) ;;
 	*)
-	  func_fatal_help "\`$destdir' must be an absolute directory name"
+	  func_fatal_help "'$destdir' must be an absolute directory name"
 	  ;;
 	esac
       done
@@ -2908,7 +4229,7 @@ func_mode_install ()
 
     # This variable tells wrapper scripts just to set variables rather
     # than running their programs.
-    libtool_install_magic="$magic"
+    libtool_install_magic=$magic
 
     staticlibs=
     future_libdirs=
@@ -2928,7 +4249,7 @@ func_mode_install ()
 
 	# Check to see that this really is a libtool archive.
 	func_lalib_unsafe_p "$file" \
-	  || func_fatal_help "\`$file' is not a valid libtool archive"
+	  || func_fatal_help "'$file' is not a valid libtool archive"
 
 	library_names=
 	old_library=
@@ -2950,7 +4271,7 @@ func_mode_install ()
 	fi
 
 	func_dirname "$file" "/" ""
-	dir="$func_dirname_result"
+	dir=$func_dirname_result
 	func_append dir "$objdir"
 
 	if test -n "$relink_command"; then
@@ -2964,7 +4285,7 @@ func_mode_install ()
 	  # are installed into $libdir/../bin (currently, that works fine)
 	  # but it's something to keep an eye on.
 	  test "$inst_prefix_dir" = "$destdir" && \
-	    func_fatal_error "error: cannot install \`$file' to a directory not ending in $libdir"
+	    func_fatal_error "error: cannot install '$file' to a directory not ending in $libdir"
 
 	  if test -n "$inst_prefix_dir"; then
 	    # Stick the inst_prefix_dir data into the link command.
@@ -2973,29 +4294,36 @@ func_mode_install ()
 	    relink_command=`$ECHO "$relink_command" | $SED "s%@inst_prefix_dir@%%"`
 	  fi
 
-	  func_warning "relinking \`$file'"
+	  func_warning "relinking '$file'"
 	  func_show_eval "$relink_command" \
-	    'func_fatal_error "error: relink \`$file'\'' with the above command before installing it"'
+	    'func_fatal_error "error: relink '\''$file'\'' with the above command before installing it"'
 	fi
 
 	# See the names of the shared library.
 	set dummy $library_names; shift
 	if test -n "$1"; then
-	  realname="$1"
+	  realname=$1
 	  shift
 
-	  srcname="$realname"
-	  test -n "$relink_command" && srcname="$realname"T
+	  srcname=$realname
+	  test -n "$relink_command" && srcname=${realname}T
 
 	  # Install the shared library and build the symlinks.
 	  func_show_eval "$install_shared_prog $dir/$srcname $destdir/$realname" \
 	      'exit $?'
-	  tstripme="$stripme"
+	  tstripme=$stripme
 	  case $host_os in
 	  cygwin* | mingw* | pw32* | cegcc*)
 	    case $realname in
 	    *.dll.a)
-	      tstripme=""
+	      tstripme=
+	      ;;
+	    esac
+	    ;;
+	  os2*)
+	    case $realname in
+	    *_dll.a)
+	      tstripme=
 	      ;;
 	    esac
 	    ;;
@@ -3006,7 +4334,7 @@ func_mode_install ()
 
 	  if test "$#" -gt 0; then
 	    # Delete the old symlinks, and create new ones.
-	    # Try `ln -sf' first, because the `ln' binary might depend on
+	    # Try 'ln -sf' first, because the 'ln' binary might depend on
 	    # the symlink we replace!  Solaris /bin/ln does not understand -f,
 	    # so we also need to try rm && ln -s.
 	    for linkname
@@ -3017,14 +4345,14 @@ func_mode_install ()
 	  fi
 
 	  # Do each command in the postinstall commands.
-	  lib="$destdir/$realname"
+	  lib=$destdir/$realname
 	  func_execute_cmds "$postinstall_cmds" 'exit $?'
 	fi
 
 	# Install the pseudo-library for information purposes.
 	func_basename "$file"
-	name="$func_basename_result"
-	instname="$dir/$name"i
+	name=$func_basename_result
+	instname=$dir/${name}i
 	func_show_eval "$install_prog $instname $destdir/$name" 'exit $?'
 
 	# Maybe install the static library, too.
@@ -3036,11 +4364,11 @@ func_mode_install ()
 
 	# Figure out destination file name, if it wasn't already specified.
 	if test -n "$destname"; then
-	  destfile="$destdir/$destname"
+	  destfile=$destdir/$destname
 	else
 	  func_basename "$file"
-	  destfile="$func_basename_result"
-	  destfile="$destdir/$destfile"
+	  destfile=$func_basename_result
+	  destfile=$destdir/$destfile
 	fi
 
 	# Deduce the name of the destination old-style object file.
@@ -3050,11 +4378,11 @@ func_mode_install ()
 	  staticdest=$func_lo2o_result
 	  ;;
 	*.$objext)
-	  staticdest="$destfile"
+	  staticdest=$destfile
 	  destfile=
 	  ;;
 	*)
-	  func_fatal_help "cannot copy a libtool object to \`$destfile'"
+	  func_fatal_help "cannot copy a libtool object to '$destfile'"
 	  ;;
 	esac
 
@@ -3063,7 +4391,7 @@ func_mode_install ()
 	  func_show_eval "$install_prog $file $destfile" 'exit $?'
 
 	# Install the old object if enabled.
-	if test "$build_old_libs" = yes; then
+	if test yes = "$build_old_libs"; then
 	  # Deduce the name of the old-style object file.
 	  func_lo2o "$file"
 	  staticobj=$func_lo2o_result
@@ -3075,23 +4403,23 @@ func_mode_install ()
       *)
 	# Figure out destination file name, if it wasn't already specified.
 	if test -n "$destname"; then
-	  destfile="$destdir/$destname"
+	  destfile=$destdir/$destname
 	else
 	  func_basename "$file"
-	  destfile="$func_basename_result"
-	  destfile="$destdir/$destfile"
+	  destfile=$func_basename_result
+	  destfile=$destdir/$destfile
 	fi
 
 	# If the file is missing, and there is a .exe on the end, strip it
 	# because it is most likely a libtool script we actually want to
 	# install
-	stripped_ext=""
+	stripped_ext=
 	case $file in
 	  *.exe)
 	    if test ! -f "$file"; then
 	      func_stripname '' '.exe' "$file"
 	      file=$func_stripname_result
-	      stripped_ext=".exe"
+	      stripped_ext=.exe
 	    fi
 	    ;;
 	esac
@@ -3119,19 +4447,19 @@ func_mode_install ()
 
 	  # Check the variables that should have been set.
 	  test -z "$generated_by_libtool_version" && \
-	    func_fatal_error "invalid libtool wrapper script \`$wrapper'"
+	    func_fatal_error "invalid libtool wrapper script '$wrapper'"
 
-	  finalize=yes
+	  finalize=:
 	  for lib in $notinst_deplibs; do
 	    # Check to see that each library is installed.
 	    libdir=
 	    if test -f "$lib"; then
 	      func_source "$lib"
 	    fi
-	    libfile="$libdir/"`$ECHO "$lib" | $SED 's%^.*/%%g'` ### testsuite: skip nested quoting test
+	    libfile=$libdir/`$ECHO "$lib" | $SED 's%^.*/%%g'`
 	    if test -n "$libdir" && test ! -f "$libfile"; then
-	      func_warning "\`$lib' has not been installed in \`$libdir'"
-	      finalize=no
+	      func_warning "'$lib' has not been installed in '$libdir'"
+	      finalize=false
 	    fi
 	  done
 
@@ -3139,29 +4467,29 @@ func_mode_install ()
 	  func_source "$wrapper"
 
 	  outputname=
-	  if test "$fast_install" = no && test -n "$relink_command"; then
+	  if test no = "$fast_install" && test -n "$relink_command"; then
 	    $opt_dry_run || {
-	      if test "$finalize" = yes; then
+	      if $finalize; then
 	        tmpdir=`func_mktempdir`
 		func_basename "$file$stripped_ext"
-		file="$func_basename_result"
-	        outputname="$tmpdir/$file"
+		file=$func_basename_result
+	        outputname=$tmpdir/$file
 	        # Replace the output file specification.
 	        relink_command=`$ECHO "$relink_command" | $SED 's%@OUTPUT@%'"$outputname"'%g'`
 
-	        $opt_silent || {
+	        $opt_quiet || {
 	          func_quote_for_expand "$relink_command"
 		  eval "func_echo $func_quote_for_expand_result"
 	        }
 	        if eval "$relink_command"; then :
 	          else
-		  func_error "error: relink \`$file' with the above command before installing it"
+		  func_error "error: relink '$file' with the above command before installing it"
 		  $opt_dry_run || ${RM}r "$tmpdir"
 		  continue
 	        fi
-	        file="$outputname"
+	        file=$outputname
 	      else
-	        func_warning "cannot relink \`$file'"
+	        func_warning "cannot relink '$file'"
 	      fi
 	    }
 	  else
@@ -3198,10 +4526,10 @@ func_mode_install ()
 
     for file in $staticlibs; do
       func_basename "$file"
-      name="$func_basename_result"
+      name=$func_basename_result
 
       # Set up the ranlib parameters.
-      oldlib="$destdir/$name"
+      oldlib=$destdir/$name
       func_to_tool_file "$oldlib" func_convert_file_msys_to_w32
       tool_oldlib=$func_to_tool_file_result
 
@@ -3216,18 +4544,18 @@ func_mode_install ()
     done
 
     test -n "$future_libdirs" && \
-      func_warning "remember to run \`$progname --finish$future_libdirs'"
+      func_warning "remember to run '$progname --finish$future_libdirs'"
 
     if test -n "$current_libdirs"; then
       # Maybe just do a dry run.
       $opt_dry_run && current_libdirs=" -n$current_libdirs"
-      exec_cmd='$SHELL $progpath $preserve_args --finish$current_libdirs'
+      exec_cmd='$SHELL "$progpath" $preserve_args --finish$current_libdirs'
     else
       exit $EXIT_SUCCESS
     fi
 }
 
-test "$opt_mode" = install && func_mode_install ${1+"$@"}
+test install = "$opt_mode" && func_mode_install ${1+"$@"}
 
 
 # func_generate_dlsyms outputname originator pic_p
@@ -3235,16 +4563,17 @@ test "$opt_mode" = install && func_mode_install ${1+"$@"}
 # a dlpreopen symbol table.
 func_generate_dlsyms ()
 {
-    $opt_debug
-    my_outputname="$1"
-    my_originator="$2"
-    my_pic_p="${3-no}"
-    my_prefix=`$ECHO "$my_originator" | sed 's%[^a-zA-Z0-9]%_%g'`
+    $debug_cmd
+
+    my_outputname=$1
+    my_originator=$2
+    my_pic_p=${3-false}
+    my_prefix=`$ECHO "$my_originator" | $SED 's%[^a-zA-Z0-9]%_%g'`
     my_dlsyms=
 
-    if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+    if test -n "$dlfiles$dlprefiles" || test no != "$dlself"; then
       if test -n "$NM" && test -n "$global_symbol_pipe"; then
-	my_dlsyms="${my_outputname}S.c"
+	my_dlsyms=${my_outputname}S.c
       else
 	func_error "not configured to extract global symbols from dlpreopened files"
       fi
@@ -3255,7 +4584,7 @@ func_generate_dlsyms ()
       "") ;;
       *.c)
 	# Discover the nlist of each of the dlfiles.
-	nlist="$output_objdir/${my_outputname}.nm"
+	nlist=$output_objdir/$my_outputname.nm
 
 	func_show_eval "$RM $nlist ${nlist}S ${nlist}T"
 
@@ -3263,34 +4592,36 @@ func_generate_dlsyms ()
 	func_verbose "creating $output_objdir/$my_dlsyms"
 
 	$opt_dry_run || $ECHO > "$output_objdir/$my_dlsyms" "\
-/* $my_dlsyms - symbol resolution table for \`$my_outputname' dlsym emulation. */
-/* Generated by $PROGRAM (GNU $PACKAGE$TIMESTAMP) $VERSION */
+/* $my_dlsyms - symbol resolution table for '$my_outputname' dlsym emulation. */
+/* Generated by $PROGRAM (GNU $PACKAGE) $VERSION */
 
 #ifdef __cplusplus
 extern \"C\" {
 #endif
 
-#if defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ > 4))
+#if defined __GNUC__ && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ > 4))
 #pragma GCC diagnostic ignored \"-Wstrict-prototypes\"
 #endif
 
 /* Keep this code in sync between libtool.m4, ltmain, lt_system.h, and tests.  */
-#if defined(_WIN32) || defined(__CYGWIN__) || defined(_WIN32_WCE)
-/* DATA imports from DLLs on WIN32 con't be const, because runtime
+#if defined _WIN32 || defined __CYGWIN__ || defined _WIN32_WCE
+/* DATA imports from DLLs on WIN32 can't be const, because runtime
    relocations are performed -- see ld's documentation on pseudo-relocs.  */
 # define LT_DLSYM_CONST
-#elif defined(__osf__)
+#elif defined __osf__
 /* This system does not cope well with relocations in const data.  */
 # define LT_DLSYM_CONST
 #else
 # define LT_DLSYM_CONST const
 #endif
 
+#define STREQ(s1, s2) (strcmp ((s1), (s2)) == 0)
+
 /* External symbol declarations for the compiler. */\
 "
 
-	if test "$dlself" = yes; then
-	  func_verbose "generating symbol list for \`$output'"
+	if test yes = "$dlself"; then
+	  func_verbose "generating symbol list for '$output'"
 
 	  $opt_dry_run || echo ': @PROGRAM@ ' > "$nlist"
 
@@ -3298,7 +4629,7 @@ extern \"C\" {
 	  progfiles=`$ECHO "$objs$old_deplibs" | $SP2NL | $SED "$lo2o" | $NL2SP`
 	  for progfile in $progfiles; do
 	    func_to_tool_file "$progfile" func_convert_file_msys_to_w32
-	    func_verbose "extracting global C symbols from \`$func_to_tool_file_result'"
+	    func_verbose "extracting global C symbols from '$func_to_tool_file_result'"
 	    $opt_dry_run || eval "$NM $func_to_tool_file_result | $global_symbol_pipe >> '$nlist'"
 	  done
 
@@ -3318,10 +4649,10 @@ extern \"C\" {
 
 	  # Prepare the list of exported symbols
 	  if test -z "$export_symbols"; then
-	    export_symbols="$output_objdir/$outputname.exp"
+	    export_symbols=$output_objdir/$outputname.exp
 	    $opt_dry_run || {
 	      $RM $export_symbols
-	      eval "${SED} -n -e '/^: @PROGRAM@ $/d' -e 's/^.* \(.*\)$/\1/p' "'< "$nlist" > "$export_symbols"'
+	      eval "$SED -n -e '/^: @PROGRAM@ $/d' -e 's/^.* \(.*\)$/\1/p' "'< "$nlist" > "$export_symbols"'
 	      case $host in
 	      *cygwin* | *mingw* | *cegcc* )
                 eval "echo EXPORTS "'> "$output_objdir/$outputname.def"'
@@ -3331,7 +4662,7 @@ extern \"C\" {
 	    }
 	  else
 	    $opt_dry_run || {
-	      eval "${SED} -e 's/\([].[*^$]\)/\\\\\1/g' -e 's/^/ /' -e 's/$/$/'"' < "$export_symbols" > "$output_objdir/$outputname.exp"'
+	      eval "$SED -e 's/\([].[*^$]\)/\\\\\1/g' -e 's/^/ /' -e 's/$/$/'"' < "$export_symbols" > "$output_objdir/$outputname.exp"'
 	      eval '$GREP -f "$output_objdir/$outputname.exp" < "$nlist" > "$nlist"T'
 	      eval '$MV "$nlist"T "$nlist"'
 	      case $host in
@@ -3345,22 +4676,22 @@ extern \"C\" {
 	fi
 
 	for dlprefile in $dlprefiles; do
-	  func_verbose "extracting global C symbols from \`$dlprefile'"
+	  func_verbose "extracting global C symbols from '$dlprefile'"
 	  func_basename "$dlprefile"
-	  name="$func_basename_result"
+	  name=$func_basename_result
           case $host in
 	    *cygwin* | *mingw* | *cegcc* )
 	      # if an import library, we need to obtain dlname
 	      if func_win32_import_lib_p "$dlprefile"; then
 	        func_tr_sh "$dlprefile"
 	        eval "curr_lafile=\$libfile_$func_tr_sh_result"
-	        dlprefile_dlbasename=""
+	        dlprefile_dlbasename=
 	        if test -n "$curr_lafile" && func_lalib_p "$curr_lafile"; then
 	          # Use subshell, to avoid clobbering current variable values
 	          dlprefile_dlname=`source "$curr_lafile" && echo "$dlname"`
-	          if test -n "$dlprefile_dlname" ; then
+	          if test -n "$dlprefile_dlname"; then
 	            func_basename "$dlprefile_dlname"
-	            dlprefile_dlbasename="$func_basename_result"
+	            dlprefile_dlbasename=$func_basename_result
 	          else
 	            # no lafile. user explicitly requested -dlpreopen <import library>.
 	            $sharedlib_from_linklib_cmd "$dlprefile"
@@ -3368,7 +4699,7 @@ extern \"C\" {
 	          fi
 	        fi
 	        $opt_dry_run || {
-	          if test -n "$dlprefile_dlbasename" ; then
+	          if test -n "$dlprefile_dlbasename"; then
 	            eval '$ECHO ": $dlprefile_dlbasename" >> "$nlist"'
 	          else
 	            func_warning "Could not compute DLL name from $name"
@@ -3424,6 +4755,11 @@ extern \"C\" {
 	    echo '/* NONE */' >> "$output_objdir/$my_dlsyms"
 	  fi
 
+	  func_show_eval '$RM "${nlist}I"'
+	  if test -n "$global_symbol_to_import"; then
+	    eval "$global_symbol_to_import"' < "$nlist"S > "$nlist"I'
+	  fi
+
 	  echo >> "$output_objdir/$my_dlsyms" "\
 
 /* The mapping between symbol names and symbols.  */
@@ -3432,11 +4768,30 @@ typedef struct {
   void *address;
 } lt_dlsymlist;
 extern LT_DLSYM_CONST lt_dlsymlist
-lt_${my_prefix}_LTX_preloaded_symbols[];
+lt_${my_prefix}_LTX_preloaded_symbols[];\
+"
+
+	  if test -s "$nlist"I; then
+	    echo >> "$output_objdir/$my_dlsyms" "\
+static void lt_syminit(void)
+{
+  LT_DLSYM_CONST lt_dlsymlist *symbol = lt_${my_prefix}_LTX_preloaded_symbols;
+  for (; symbol->name; ++symbol)
+    {"
+	    $SED 's/.*/      if (STREQ (symbol->name, \"&\")) symbol->address = (void *) \&&;/' < "$nlist"I >> "$output_objdir/$my_dlsyms"
+	    echo >> "$output_objdir/$my_dlsyms" "\
+    }
+}"
+	  fi
+	  echo >> "$output_objdir/$my_dlsyms" "\
 LT_DLSYM_CONST lt_dlsymlist
 lt_${my_prefix}_LTX_preloaded_symbols[] =
-{\
-  { \"$my_originator\", (void *) 0 },"
+{ {\"$my_originator\", (void *) 0},"
+
+	  if test -s "$nlist"I; then
+	    echo >> "$output_objdir/$my_dlsyms" "\
+  {\"@INIT@\", (void *) &lt_syminit},"
+	  fi
 
 	  case $need_lib_prefix in
 	  no)
@@ -3478,9 +4833,7 @@ static const void *lt_preloaded_setup() {
 	  *-*-hpux*)
 	    pic_flag_for_symtable=" $pic_flag"  ;;
 	  *)
-	    if test "X$my_pic_p" != Xno; then
-	      pic_flag_for_symtable=" $pic_flag"
-	    fi
+	    $my_pic_p && pic_flag_for_symtable=" $pic_flag"
 	    ;;
 	  esac
 	  ;;
@@ -3497,10 +4850,10 @@ static const void *lt_preloaded_setup() {
 	func_show_eval '(cd $output_objdir && $LTCC$symtab_cflags -c$no_builtin_flag$pic_flag_for_symtable "$my_dlsyms")' 'exit $?'
 
 	# Clean up the generated files.
-	func_show_eval '$RM "$output_objdir/$my_dlsyms" "$nlist" "${nlist}S" "${nlist}T"'
+	func_show_eval '$RM "$output_objdir/$my_dlsyms" "$nlist" "${nlist}S" "${nlist}T" "${nlist}I"'
 
 	# Transform the symbol file into the correct name.
-	symfileobj="$output_objdir/${my_outputname}S.$objext"
+	symfileobj=$output_objdir/${my_outputname}S.$objext
 	case $host in
 	*cygwin* | *mingw* | *cegcc* )
 	  if test -f "$output_objdir/$my_outputname.def"; then
@@ -3518,7 +4871,7 @@ static const void *lt_preloaded_setup() {
 	esac
 	;;
       *)
-	func_fatal_error "unknown suffix for \`$my_dlsyms'"
+	func_fatal_error "unknown suffix for '$my_dlsyms'"
 	;;
       esac
     else
@@ -3532,6 +4885,32 @@ static const void *lt_preloaded_setup() {
     fi
 }
 
+# func_cygming_gnu_implib_p ARG
+# This predicate returns with zero status (TRUE) if
+# ARG is a GNU/binutils-style import library. Returns
+# with nonzero status (FALSE) otherwise.
+func_cygming_gnu_implib_p ()
+{
+  $debug_cmd
+
+  func_to_tool_file "$1" func_convert_file_msys_to_w32
+  func_cygming_gnu_implib_tmp=`$NM "$func_to_tool_file_result" | eval "$global_symbol_pipe" | $EGREP ' (_head_[A-Za-z0-9_]+_[ad]l*|[A-Za-z0-9_]+_[ad]l*_iname)$'`
+  test -n "$func_cygming_gnu_implib_tmp"
+}
+
+# func_cygming_ms_implib_p ARG
+# This predicate returns with zero status (TRUE) if
+# ARG is an MS-style import library. Returns
+# with nonzero status (FALSE) otherwise.
+func_cygming_ms_implib_p ()
+{
+  $debug_cmd
+
+  func_to_tool_file "$1" func_convert_file_msys_to_w32
+  func_cygming_ms_implib_tmp=`$NM "$func_to_tool_file_result" | eval "$global_symbol_pipe" | $GREP '_NULL_IMPORT_DESCRIPTOR'`
+  test -n "$func_cygming_ms_implib_tmp"
+}
+
 # func_win32_libid arg
 # return the library type of file 'arg'
 #
@@ -3541,8 +4920,9 @@ static const void *lt_preloaded_setup() {
 # Despite the name, also deal with 64 bit binaries.
 func_win32_libid ()
 {
-  $opt_debug
-  win32_libid_type="unknown"
+  $debug_cmd
+
+  win32_libid_type=unknown
   win32_fileres=`file -L $1 2>/dev/null`
   case $win32_fileres in
   *ar\ archive\ import\ library*) # definitely import
@@ -3552,16 +4932,29 @@ func_win32_libid ()
     # Keep the egrep pattern in sync with the one in _LT_CHECK_MAGIC_METHOD.
     if eval $OBJDUMP -f $1 | $SED -e '10q' 2>/dev/null |
        $EGREP 'file format (pei*-i386(.*architecture: i386)?|pe-arm-wince|pe-x86-64)' >/dev/null; then
-      func_to_tool_file "$1" func_convert_file_msys_to_w32
-      win32_nmres=`eval $NM -f posix -A \"$func_to_tool_file_result\" |
-	$SED -n -e '
+      case $nm_interface in
+      "MS dumpbin")
+	if func_cygming_ms_implib_p "$1" ||
+	   func_cygming_gnu_implib_p "$1"
+	then
+	  win32_nmres=import
+	else
+	  win32_nmres=
+	fi
+	;;
+      *)
+	func_to_tool_file "$1" func_convert_file_msys_to_w32
+	win32_nmres=`eval $NM -f posix -A \"$func_to_tool_file_result\" |
+	  $SED -n -e '
 	    1,100{
 		/ I /{
-		    s,.*,import,
+		    s|.*|import|
 		    p
 		    q
 		}
 	    }'`
+	;;
+      esac
       case $win32_nmres in
       import*)  win32_libid_type="x86 archive import";;
       *)        win32_libid_type="x86 archive static";;
@@ -3593,7 +4986,8 @@ func_win32_libid ()
 #    $sharedlib_from_linklib_result
 func_cygming_dll_for_implib ()
 {
-  $opt_debug
+  $debug_cmd
+
   sharedlib_from_linklib_result=`$DLLTOOL --identify-strict --identify "$1"`
 }
 
@@ -3610,7 +5004,8 @@ func_cygming_dll_for_implib ()
 # specified import library.
 func_cygming_dll_for_implib_fallback_core ()
 {
-  $opt_debug
+  $debug_cmd
+
   match_literal=`$ECHO "$1" | $SED "$sed_make_literal_regex"`
   $OBJDUMP -s --section "$1" "$2" 2>/dev/null |
     $SED '/^Contents of section '"$match_literal"':/{
@@ -3646,8 +5041,8 @@ func_cygming_dll_for_implib_fallback_core ()
       /./p' |
     # we now have a list, one entry per line, of the stringified
     # contents of the appropriate section of all members of the
-    # archive which possess that section. Heuristic: eliminate
-    # all those which have a first or second character that is
+    # archive that possess that section. Heuristic: eliminate
+    # all those that have a first or second character that is
     # a '.' (that is, objdump's representation of an unprintable
     # character.) This should work for all archives with less than
     # 0x302f exports -- but will fail for DLLs whose name actually
@@ -3658,30 +5053,6 @@ func_cygming_dll_for_implib_fallback_core ()
     $SED -e '/^\./d;/^.\./d;q'
 }
 
-# func_cygming_gnu_implib_p ARG
-# This predicate returns with zero status (TRUE) if
-# ARG is a GNU/binutils-style import library. Returns
-# with nonzero status (FALSE) otherwise.
-func_cygming_gnu_implib_p ()
-{
-  $opt_debug
-  func_to_tool_file "$1" func_convert_file_msys_to_w32
-  func_cygming_gnu_implib_tmp=`$NM "$func_to_tool_file_result" | eval "$global_symbol_pipe" | $EGREP ' (_head_[A-Za-z0-9_]+_[ad]l*|[A-Za-z0-9_]+_[ad]l*_iname)$'`
-  test -n "$func_cygming_gnu_implib_tmp"
-}
-
-# func_cygming_ms_implib_p ARG
-# This predicate returns with zero status (TRUE) if
-# ARG is an MS-style import library. Returns
-# with nonzero status (FALSE) otherwise.
-func_cygming_ms_implib_p ()
-{
-  $opt_debug
-  func_to_tool_file "$1" func_convert_file_msys_to_w32
-  func_cygming_ms_implib_tmp=`$NM "$func_to_tool_file_result" | eval "$global_symbol_pipe" | $GREP '_NULL_IMPORT_DESCRIPTOR'`
-  test -n "$func_cygming_ms_implib_tmp"
-}
-
 # func_cygming_dll_for_implib_fallback ARG
 # Platform-specific function to extract the
 # name of the DLL associated with the specified
@@ -3695,16 +5066,17 @@ func_cygming_ms_implib_p ()
 #    $sharedlib_from_linklib_result
 func_cygming_dll_for_implib_fallback ()
 {
-  $opt_debug
-  if func_cygming_gnu_implib_p "$1" ; then
+  $debug_cmd
+
+  if func_cygming_gnu_implib_p "$1"; then
     # binutils import library
     sharedlib_from_linklib_result=`func_cygming_dll_for_implib_fallback_core '.idata$7' "$1"`
-  elif func_cygming_ms_implib_p "$1" ; then
+  elif func_cygming_ms_implib_p "$1"; then
     # ms-generated import library
     sharedlib_from_linklib_result=`func_cygming_dll_for_implib_fallback_core '.idata$6' "$1"`
   else
     # unknown
-    sharedlib_from_linklib_result=""
+    sharedlib_from_linklib_result=
   fi
 }
 
@@ -3712,10 +5084,11 @@ func_cygming_dll_for_implib_fallback ()
 # func_extract_an_archive dir oldlib
 func_extract_an_archive ()
 {
-    $opt_debug
-    f_ex_an_ar_dir="$1"; shift
-    f_ex_an_ar_oldlib="$1"
-    if test "$lock_old_archive_extraction" = yes; then
+    $debug_cmd
+
+    f_ex_an_ar_dir=$1; shift
+    f_ex_an_ar_oldlib=$1
+    if test yes = "$lock_old_archive_extraction"; then
       lockfile=$f_ex_an_ar_oldlib.lock
       until $opt_dry_run || ln "$progpath" "$lockfile" 2>/dev/null; do
 	func_echo "Waiting for $lockfile to be removed"
@@ -3724,7 +5097,7 @@ func_extract_an_archive ()
     fi
     func_show_eval "(cd \$f_ex_an_ar_dir && $AR x \"\$f_ex_an_ar_oldlib\")" \
 		   'stat=$?; rm -f "$lockfile"; exit $stat'
-    if test "$lock_old_archive_extraction" = yes; then
+    if test yes = "$lock_old_archive_extraction"; then
       $opt_dry_run || rm -f "$lockfile"
     fi
     if ($AR t "$f_ex_an_ar_oldlib" | sort | sort -uc >/dev/null 2>&1); then
@@ -3738,22 +5111,23 @@ func_extract_an_archive ()
 # func_extract_archives gentop oldlib ...
 func_extract_archives ()
 {
-    $opt_debug
-    my_gentop="$1"; shift
+    $debug_cmd
+
+    my_gentop=$1; shift
     my_oldlibs=${1+"$@"}
-    my_oldobjs=""
-    my_xlib=""
-    my_xabs=""
-    my_xdir=""
+    my_oldobjs=
+    my_xlib=
+    my_xabs=
+    my_xdir=
 
     for my_xlib in $my_oldlibs; do
       # Extract the objects.
       case $my_xlib in
-	[\\/]* | [A-Za-z]:[\\/]*) my_xabs="$my_xlib" ;;
+	[\\/]* | [A-Za-z]:[\\/]*) my_xabs=$my_xlib ;;
 	*) my_xabs=`pwd`"/$my_xlib" ;;
       esac
       func_basename "$my_xlib"
-      my_xlib="$func_basename_result"
+      my_xlib=$func_basename_result
       my_xlib_u=$my_xlib
       while :; do
         case " $extracted_archives " in
@@ -3765,7 +5139,7 @@ func_extract_archives ()
 	esac
       done
       extracted_archives="$extracted_archives $my_xlib_u"
-      my_xdir="$my_gentop/$my_xlib_u"
+      my_xdir=$my_gentop/$my_xlib_u
 
       func_mkdir_p "$my_xdir"
 
@@ -3778,22 +5152,23 @@ func_extract_archives ()
 	  cd $my_xdir || exit $?
 	  darwin_archive=$my_xabs
 	  darwin_curdir=`pwd`
-	  darwin_base_archive=`basename "$darwin_archive"`
+	  func_basename "$darwin_archive"
+	  darwin_base_archive=$func_basename_result
 	  darwin_arches=`$LIPO -info "$darwin_archive" 2>/dev/null | $GREP Architectures 2>/dev/null || true`
 	  if test -n "$darwin_arches"; then
 	    darwin_arches=`$ECHO "$darwin_arches" | $SED -e 's/.*are://'`
 	    darwin_arch=
 	    func_verbose "$darwin_base_archive has multiple architectures $darwin_arches"
-	    for darwin_arch in  $darwin_arches ; do
-	      func_mkdir_p "unfat-$$/${darwin_base_archive}-${darwin_arch}"
-	      $LIPO -thin $darwin_arch -output "unfat-$$/${darwin_base_archive}-${darwin_arch}/${darwin_base_archive}" "${darwin_archive}"
-	      cd "unfat-$$/${darwin_base_archive}-${darwin_arch}"
-	      func_extract_an_archive "`pwd`" "${darwin_base_archive}"
+	    for darwin_arch in  $darwin_arches; do
+	      func_mkdir_p "unfat-$$/$darwin_base_archive-$darwin_arch"
+	      $LIPO -thin $darwin_arch -output "unfat-$$/$darwin_base_archive-$darwin_arch/$darwin_base_archive" "$darwin_archive"
+	      cd "unfat-$$/$darwin_base_archive-$darwin_arch"
+	      func_extract_an_archive "`pwd`" "$darwin_base_archive"
 	      cd "$darwin_curdir"
-	      $RM "unfat-$$/${darwin_base_archive}-${darwin_arch}/${darwin_base_archive}"
+	      $RM "unfat-$$/$darwin_base_archive-$darwin_arch/$darwin_base_archive"
 	    done # $darwin_arches
             ## Okay now we've a bunch of thin objects, gotta fatten them up :)
-	    darwin_filelist=`find unfat-$$ -type f -name \*.o -print -o -name \*.lo -print | $SED -e "$basename" | sort -u`
+	    darwin_filelist=`find unfat-$$ -type f -name \*.o -print -o -name \*.lo -print | $SED -e "$sed_basename" | sort -u`
 	    darwin_file=
 	    darwin_files=
 	    for darwin_file in $darwin_filelist; do
@@ -3815,7 +5190,7 @@ func_extract_archives ()
       my_oldobjs="$my_oldobjs "`find $my_xdir -name \*.$objext -print -o -name \*.lo -print | sort | $NL2SP`
     done
 
-    func_extract_archives_result="$my_oldobjs"
+    func_extract_archives_result=$my_oldobjs
 }
 
 
@@ -3830,7 +5205,7 @@ func_extract_archives ()
 #
 # ARG is the value that the WRAPPER_SCRIPT_BELONGS_IN_OBJDIR
 # variable will take.  If 'yes', then the emitted script
-# will assume that the directory in which it is stored is
+# will assume that the directory where it is stored is
 # the $objdir directory.  This is a cygwin/mingw-specific
 # behavior.
 func_emit_wrapper ()
@@ -3841,7 +5216,7 @@ func_emit_wrapper ()
 #! $SHELL
 
 # $output - temporary wrapper script for $objdir/$outputname
-# Generated by $PROGRAM (GNU $PACKAGE$TIMESTAMP) $VERSION
+# Generated by $PROGRAM (GNU $PACKAGE) $VERSION
 #
 # The $output program cannot be directly executed until all the libtool
 # libraries that it depends on are installed.
@@ -3898,9 +5273,9 @@ _LTECHO_EOF'
 
 # Very basic option parsing. These options are (a) specific to
 # the libtool wrapper, (b) are identical between the wrapper
-# /script/ and the wrapper /executable/ which is used only on
+# /script/ and the wrapper /executable/ that is used only on
 # windows platforms, and (c) all begin with the string "--lt-"
-# (application programs are unlikely to have options which match
+# (application programs are unlikely to have options that match
 # this pattern).
 #
 # There are only two supported options: --lt-debug and
@@ -3933,7 +5308,7 @@ func_parse_lt_options ()
 
   # Print the debug banner immediately:
   if test -n \"\$lt_option_debug\"; then
-    echo \"${outputname}:${output}:\${LINENO}: libtool wrapper (GNU $PACKAGE$TIMESTAMP) $VERSION\" 1>&2
+    echo \"$outputname:$output:\$LINENO: libtool wrapper (GNU $PACKAGE) $VERSION\" 1>&2
   fi
 }
 
@@ -3944,7 +5319,7 @@ func_lt_dump_args ()
   lt_dump_args_N=1;
   for lt_arg
   do
-    \$ECHO \"${outputname}:${output}:\${LINENO}: newargv[\$lt_dump_args_N]: \$lt_arg\"
+    \$ECHO \"$outputname:$output:\$LINENO: newargv[\$lt_dump_args_N]: \$lt_arg\"
     lt_dump_args_N=\`expr \$lt_dump_args_N + 1\`
   done
 }
@@ -3958,7 +5333,7 @@ func_exec_program_core ()
   *-*-mingw | *-*-os2* | *-cegcc*)
     $ECHO "\
       if test -n \"\$lt_option_debug\"; then
-        \$ECHO \"${outputname}:${output}:\${LINENO}: newargv[0]: \$progdir\\\\\$program\" 1>&2
+        \$ECHO \"$outputname:$output:\$LINENO: newargv[0]: \$progdir\\\\\$program\" 1>&2
         func_lt_dump_args \${1+\"\$@\"} 1>&2
       fi
       exec \"\$progdir\\\\\$program\" \${1+\"\$@\"}
@@ -3968,7 +5343,7 @@ func_exec_program_core ()
   *)
     $ECHO "\
       if test -n \"\$lt_option_debug\"; then
-        \$ECHO \"${outputname}:${output}:\${LINENO}: newargv[0]: \$progdir/\$program\" 1>&2
+        \$ECHO \"$outputname:$output:\$LINENO: newargv[0]: \$progdir/\$program\" 1>&2
         func_lt_dump_args \${1+\"\$@\"} 1>&2
       fi
       exec \"\$progdir/\$program\" \${1+\"\$@\"}
@@ -4043,13 +5418,13 @@ func_exec_program ()
   test -n \"\$absdir\" && thisdir=\"\$absdir\"
 "
 
-	if test "$fast_install" = yes; then
+	if test yes = "$fast_install"; then
 	  $ECHO "\
   program=lt-'$outputname'$exeext
   progdir=\"\$thisdir/$objdir\"
 
   if test ! -f \"\$progdir/\$program\" ||
-     { file=\`ls -1dt \"\$progdir/\$program\" \"\$progdir/../\$program\" 2>/dev/null | ${SED} 1q\`; \\
+     { file=\`ls -1dt \"\$progdir/\$program\" \"\$progdir/../\$program\" 2>/dev/null | $SED 1q\`; \\
        test \"X\$file\" != \"X\$progdir/\$program\"; }; then
 
     file=\"\$\$-\$program\"
@@ -4066,7 +5441,7 @@ func_exec_program ()
     if test -n \"\$relink_command\"; then
       if relink_command_output=\`eval \$relink_command 2>&1\`; then :
       else
-	$ECHO \"\$relink_command_output\" >&2
+	\$ECHO \"\$relink_command_output\" >&2
 	$RM \"\$progdir/\$file\"
 	exit 1
       fi
@@ -4101,7 +5476,7 @@ func_exec_program ()
 	fi
 
 	# Export our shlibpath_var if we have one.
-	if test "$shlibpath_overrides_runpath" = yes && test -n "$shlibpath_var" && test -n "$temp_rpath"; then
+	if test yes = "$shlibpath_overrides_runpath" && test -n "$shlibpath_var" && test -n "$temp_rpath"; then
 	  $ECHO "\
     # Add our own library path to $shlibpath_var
     $shlibpath_var=\"$temp_rpath\$$shlibpath_var\"
@@ -4121,7 +5496,7 @@ func_exec_program ()
     fi
   else
     # The program doesn't exist.
-    \$ECHO \"\$0: error: \\\`\$progdir/\$program' does not exist\" 1>&2
+    \$ECHO \"\$0: error: '\$progdir/\$program' does not exist\" 1>&2
     \$ECHO \"This script is just a wrapper for \$program.\" 1>&2
     \$ECHO \"See the $PACKAGE documentation for more information.\" 1>&2
     exit 1
@@ -4140,7 +5515,7 @@ func_emit_cwrapperexe_src ()
 	cat <<EOF
 
 /* $cwrappersource - temporary wrapper executable for $objdir/$outputname
-   Generated by $PROGRAM (GNU $PACKAGE$TIMESTAMP) $VERSION
+   Generated by $PROGRAM (GNU $PACKAGE) $VERSION
 
    The $output program cannot be directly executed until all the libtool
    libraries that it depends on are installed.
@@ -4175,47 +5550,45 @@ EOF
 #include <fcntl.h>
 #include <sys/stat.h>
 
+#define STREQ(s1, s2) (strcmp ((s1), (s2)) == 0)
+
 /* declarations of non-ANSI functions */
-#if defined(__MINGW32__)
+#if defined __MINGW32__
 # ifdef __STRICT_ANSI__
 int _putenv (const char *);
 # endif
-#elif defined(__CYGWIN__)
+#elif defined __CYGWIN__
 # ifdef __STRICT_ANSI__
 char *realpath (const char *, char *);
 int putenv (char *);
 int setenv (const char *, const char *, int);
 # endif
-/* #elif defined (other platforms) ... */
+/* #elif defined other_platform || defined ... */
 #endif
 
 /* portability defines, excluding path handling macros */
-#if defined(_MSC_VER)
+#if defined _MSC_VER
 # define setmode _setmode
 # define stat    _stat
 # define chmod   _chmod
 # define getcwd  _getcwd
 # define putenv  _putenv
 # define S_IXUSR _S_IEXEC
-# ifndef _INTPTR_T_DEFINED
-#  define _INTPTR_T_DEFINED
-#  define intptr_t int
-# endif
-#elif defined(__MINGW32__)
+#elif defined __MINGW32__
 # define setmode _setmode
 # define stat    _stat
 # define chmod   _chmod
 # define getcwd  _getcwd
 # define putenv  _putenv
-#elif defined(__CYGWIN__)
+#elif defined __CYGWIN__
 # define HAVE_SETENV
 # define FOPEN_WB "wb"
-/* #elif defined (other platforms) ... */
+/* #elif defined other platforms ... */
 #endif
 
-#if defined(PATH_MAX)
+#if defined PATH_MAX
 # define LT_PATHMAX PATH_MAX
-#elif defined(MAXPATHLEN)
+#elif defined MAXPATHLEN
 # define LT_PATHMAX MAXPATHLEN
 #else
 # define LT_PATHMAX 1024
@@ -4234,8 +5607,8 @@ int setenv (const char *, const char *, int);
 # define PATH_SEPARATOR ':'
 #endif
 
-#if defined (_WIN32) || defined (__MSDOS__) || defined (__DJGPP__) || \
-  defined (__OS2__)
+#if defined _WIN32 || defined __MSDOS__ || defined __DJGPP__ || \
+  defined __OS2__
 # define HAVE_DOS_BASED_FILE_SYSTEM
 # define FOPEN_WB "wb"
 # ifndef DIR_SEPARATOR_2
@@ -4268,10 +5641,10 @@ int setenv (const char *, const char *, int);
 
 #define XMALLOC(type, num)      ((type *) xmalloc ((num) * sizeof(type)))
 #define XFREE(stale) do { \
-  if (stale) { free ((void *) stale); stale = 0; } \
+  if (stale) { free (stale); stale = 0; } \
 } while (0)
 
-#if defined(LT_DEBUGWRAPPER)
+#if defined LT_DEBUGWRAPPER
 static int lt_debug = 1;
 #else
 static int lt_debug = 0;
@@ -4300,11 +5673,16 @@ void lt_dump_script (FILE *f);
 EOF
 
 	    cat <<EOF
-volatile const char * MAGIC_EXE = "$magic_exe";
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5)
+# define externally_visible volatile
+#else
+# define externally_visible __attribute__((externally_visible)) volatile
+#endif
+externally_visible const char * MAGIC_EXE = "$magic_exe";
 const char * LIB_PATH_VARNAME = "$shlibpath_var";
 EOF
 
-	    if test "$shlibpath_overrides_runpath" = yes && test -n "$shlibpath_var" && test -n "$temp_rpath"; then
+	    if test yes = "$shlibpath_overrides_runpath" && test -n "$shlibpath_var" && test -n "$temp_rpath"; then
               func_to_host_path "$temp_rpath"
 	      cat <<EOF
 const char * LIB_PATH_VALUE   = "$func_to_host_path_result";
@@ -4328,7 +5706,7 @@ const char * EXE_PATH_VALUE   = "";
 EOF
 	    fi
 
-	    if test "$fast_install" = yes; then
+	    if test yes = "$fast_install"; then
 	      cat <<EOF
 const char * TARGET_PROGRAM_NAME = "lt-$outputname"; /* hopefully, no .exe */
 EOF
@@ -4357,12 +5735,12 @@ main (int argc, char *argv[])
   char *actual_cwrapper_name;
   char *target_name;
   char *lt_argv_zero;
-  intptr_t rval = 127;
+  int rval = 127;
 
   int i;
 
   program_name = (char *) xstrdup (base_name (argv[0]));
-  newargz = XMALLOC (char *, argc + 1);
+  newargz = XMALLOC (char *, (size_t) argc + 1);
 
   /* very simple arg parsing; don't want to rely on getopt
    * also, copy all non cwrapper options to newargz, except
@@ -4371,10 +5749,10 @@ main (int argc, char *argv[])
   newargc=0;
   for (i = 1; i < argc; i++)
     {
-      if (strcmp (argv[i], dumpscript_opt) == 0)
+      if (STREQ (argv[i], dumpscript_opt))
 	{
 EOF
-	    case "$host" in
+	    case $host in
 	      *mingw* | *cygwin* )
 		# make stdout use "unix" line endings
 		echo "          setmode(1,_O_BINARY);"
@@ -4385,12 +5763,12 @@ EOF
 	  lt_dump_script (stdout);
 	  return 0;
 	}
-      if (strcmp (argv[i], debug_opt) == 0)
+      if (STREQ (argv[i], debug_opt))
 	{
           lt_debug = 1;
           continue;
 	}
-      if (strcmp (argv[i], ltwrapper_option_prefix) == 0)
+      if (STREQ (argv[i], ltwrapper_option_prefix))
         {
           /* however, if there is an option in the LTWRAPPER_OPTION_PREFIX
              namespace, but it is not one of the ones we know about and
@@ -4413,7 +5791,7 @@ EOF
 EOF
 	    cat <<EOF
   /* The GNU banner must be the first non-error debug message */
-  lt_debugprintf (__FILE__, __LINE__, "libtool wrapper (GNU $PACKAGE$TIMESTAMP) $VERSION\n");
+  lt_debugprintf (__FILE__, __LINE__, "libtool wrapper (GNU $PACKAGE) $VERSION\n");
 EOF
 	    cat <<"EOF"
   lt_debugprintf (__FILE__, __LINE__, "(main) argv[0]: %s\n", argv[0]);
@@ -4524,7 +5902,7 @@ EOF
 		cat <<"EOF"
   /* execv doesn't actually work on mingw as expected on unix */
   newargz = prepare_spawn (newargz);
-  rval = _spawnv (_P_WAIT, lt_argv_zero, (const char * const *) newargz);
+  rval = (int) _spawnv (_P_WAIT, lt_argv_zero, (const char * const *) newargz);
   if (rval == -1)
     {
       /* failed to start process */
@@ -4569,7 +5947,7 @@ base_name (const char *name)
 {
   const char *base;
 
-#if defined (HAVE_DOS_BASED_FILE_SYSTEM)
+#if defined HAVE_DOS_BASED_FILE_SYSTEM
   /* Skip over the disk name in MSDOS pathnames. */
   if (isalpha ((unsigned char) name[0]) && name[1] == ':')
     name += 2;
@@ -4628,7 +6006,7 @@ find_executable (const char *wrapper)
   const char *p_next;
   /* static buffer for getcwd */
   char tmp[LT_PATHMAX + 1];
-  int tmp_len;
+  size_t tmp_len;
   char *concat_name;
 
   lt_debugprintf (__FILE__, __LINE__, "(find_executable): %s\n",
@@ -4638,7 +6016,7 @@ find_executable (const char *wrapper)
     return NULL;
 
   /* Absolute path? */
-#if defined (HAVE_DOS_BASED_FILE_SYSTEM)
+#if defined HAVE_DOS_BASED_FILE_SYSTEM
   if (isalpha ((unsigned char) wrapper[0]) && wrapper[1] == ':')
     {
       concat_name = xstrdup (wrapper);
@@ -4656,7 +6034,7 @@ find_executable (const char *wrapper)
 	    return concat_name;
 	  XFREE (concat_name);
 	}
-#if defined (HAVE_DOS_BASED_FILE_SYSTEM)
+#if defined HAVE_DOS_BASED_FILE_SYSTEM
     }
 #endif
 
@@ -4679,7 +6057,7 @@ find_executable (const char *wrapper)
 	      for (q = p; *q; q++)
 		if (IS_PATH_SEPARATOR (*q))
 		  break;
-	      p_len = q - p;
+	      p_len = (size_t) (q - p);
 	      p_next = (*q == '\0' ? q : q + 1);
 	      if (p_len == 0)
 		{
@@ -4798,7 +6176,7 @@ strendzap (char *str, const char *pat)
   if (patlen <= len)
     {
       str += len - patlen;
-      if (strcmp (str, pat) == 0)
+      if (STREQ (str, pat))
 	*str = '\0';
     }
   return str;
@@ -4863,7 +6241,7 @@ lt_setenv (const char *name, const char *value)
     char *str = xstrdup (value);
     setenv (name, str, 1);
 #else
-    int len = strlen (name) + 1 + strlen (value) + 1;
+    size_t len = strlen (name) + 1 + strlen (value) + 1;
     char *str = XMALLOC (char, len);
     sprintf (str, "%s=%s", name, value);
     if (putenv (str) != EXIT_SUCCESS)
@@ -4880,8 +6258,8 @@ lt_extend_str (const char *orig_value, const char *add, int to_end)
   char *new_value;
   if (orig_value && *orig_value)
     {
-      int orig_value_len = strlen (orig_value);
-      int add_len = strlen (add);
+      size_t orig_value_len = strlen (orig_value);
+      size_t add_len = strlen (add);
       new_value = XMALLOC (char, add_len + orig_value_len + 1);
       if (to_end)
         {
@@ -4912,10 +6290,10 @@ lt_update_exe_path (const char *name, const char *value)
     {
       char *new_value = lt_extend_str (getenv (name), value, 0);
       /* some systems can't cope with a ':'-terminated path #' */
-      int len = strlen (new_value);
-      while (((len = strlen (new_value)) > 0) && IS_PATH_SEPARATOR (new_value[len-1]))
+      size_t len = strlen (new_value);
+      while ((len > 0) && IS_PATH_SEPARATOR (new_value[len-1]))
         {
-          new_value[len-1] = '\0';
+          new_value[--len] = '\0';
         }
       lt_setenv (name, new_value);
       XFREE (new_value);
@@ -5082,27 +6460,47 @@ EOF
 # True if ARG is an import lib, as indicated by $file_magic_cmd
 func_win32_import_lib_p ()
 {
-    $opt_debug
+    $debug_cmd
+
     case `eval $file_magic_cmd \"\$1\" 2>/dev/null | $SED -e 10q` in
     *import*) : ;;
     *) false ;;
     esac
 }
 
+# func_suncc_cstd_abi
+# !!ONLY CALL THIS FOR SUN CC AFTER $compile_command IS FULLY EXPANDED!!
+# Several compiler flags select an ABI that is incompatible with the
+# Cstd library. Avoid specifying it if any are in CXXFLAGS.
+func_suncc_cstd_abi ()
+{
+    $debug_cmd
+
+    case " $compile_command " in
+    *" -compat=g "*|*\ -std=c++[0-9][0-9]\ *|*" -library=stdcxx4 "*|*" -library=stlport4 "*)
+      suncc_use_cstd_abi=no
+      ;;
+    *)
+      suncc_use_cstd_abi=yes
+      ;;
+    esac
+}
+
 # func_mode_link arg...
 func_mode_link ()
 {
-    $opt_debug
+    $debug_cmd
+
     case $host in
     *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-cegcc*)
       # It is impossible to link a dll without this setting, and
       # we shouldn't force the makefile maintainer to figure out
-      # which system we are compiling for in order to pass an extra
+      # what system we are compiling for in order to pass an extra
       # flag for every libtool invocation.
       # allow_undefined=no
 
       # FIXME: Unfortunately, there are problems with the above when trying
-      # to make a dll which has undefined symbols, in which case not
+      # to make a dll that has undefined symbols, in which case not
       # even a static library is built.  For now, we need to specify
       # -no-undefined on the libtool link line when we can be certain
       # that all symbols are satisfied, otherwise we get a static library.
@@ -5146,10 +6544,11 @@ func_mode_link ()
     module=no
     no_install=no
     objs=
+    os2dllname=
     non_pic_objects=
     precious_files_regex=
     prefer_static_libs=no
-    preload=no
+    preload=false
     prev=
     prevarg=
     release=
@@ -5161,7 +6560,7 @@ func_mode_link ()
     vinfo=
     vinfo_number=no
     weak_libs=
-    single_module="${wl}-single_module"
+    single_module=$wl-single_module
     func_infer_tag $base_compile
 
     # We need to know -static, to get the right output filenames.
@@ -5169,15 +6568,15 @@ func_mode_link ()
     do
       case $arg in
       -shared)
-	test "$build_libtool_libs" != yes && \
-	  func_fatal_configuration "can not build a shared library"
+	test yes != "$build_libtool_libs" \
+	  && func_fatal_configuration "cannot build a shared library"
 	build_old_libs=no
 	break
 	;;
       -all-static | -static | -static-libtool-libs)
 	case $arg in
 	-all-static)
-	  if test "$build_libtool_libs" = yes && test -z "$link_static_flag"; then
+	  if test yes = "$build_libtool_libs" && test -z "$link_static_flag"; then
 	    func_warning "complete static linking is impossible in this configuration"
 	  fi
 	  if test -n "$link_static_flag"; then
@@ -5210,7 +6609,7 @@ func_mode_link ()
 
     # Go through the arguments, transforming them on the way.
     while test "$#" -gt 0; do
-      arg="$1"
+      arg=$1
       shift
       func_quote_for_eval "$arg"
       qarg=$func_quote_for_eval_unquoted_result
@@ -5227,21 +6626,21 @@ func_mode_link ()
 
 	case $prev in
 	bindir)
-	  bindir="$arg"
+	  bindir=$arg
 	  prev=
 	  continue
 	  ;;
 	dlfiles|dlprefiles)
-	  if test "$preload" = no; then
+	  $preload || {
 	    # Add the symbol object into the linking commands.
 	    func_append compile_command " @SYMFILE@"
 	    func_append finalize_command " @SYMFILE@"
-	    preload=yes
-	  fi
+	    preload=:
+	  }
 	  case $arg in
 	  *.la | *.lo) ;;  # We handle these cases below.
 	  force)
-	    if test "$dlself" = no; then
+	    if test no = "$dlself"; then
 	      dlself=needless
 	      export_dynamic=yes
 	    fi
@@ -5249,9 +6648,9 @@ func_mode_link ()
 	    continue
 	    ;;
 	  self)
-	    if test "$prev" = dlprefiles; then
+	    if test dlprefiles = "$prev"; then
 	      dlself=yes
-	    elif test "$prev" = dlfiles && test "$dlopen_self" != yes; then
+	    elif test dlfiles = "$prev" && test yes != "$dlopen_self"; then
 	      dlself=yes
 	    else
 	      dlself=needless
@@ -5261,7 +6660,7 @@ func_mode_link ()
 	    continue
 	    ;;
 	  *)
-	    if test "$prev" = dlfiles; then
+	    if test dlfiles = "$prev"; then
 	      func_append dlfiles " $arg"
 	    else
 	      func_append dlprefiles " $arg"
@@ -5272,14 +6671,14 @@ func_mode_link ()
 	  esac
 	  ;;
 	expsyms)
-	  export_symbols="$arg"
+	  export_symbols=$arg
 	  test -f "$arg" \
-	    || func_fatal_error "symbol file \`$arg' does not exist"
+	    || func_fatal_error "symbol file '$arg' does not exist"
 	  prev=
 	  continue
 	  ;;
 	expsyms_regex)
-	  export_symbols_regex="$arg"
+	  export_symbols_regex=$arg
 	  prev=
 	  continue
 	  ;;
@@ -5297,7 +6696,13 @@ func_mode_link ()
 	  continue
 	  ;;
 	inst_prefix)
-	  inst_prefix_dir="$arg"
+	  inst_prefix_dir=$arg
+	  prev=
+	  continue
+	  ;;
+	mllvm)
+	  # Clang does not use LLVM to link, so we can simply discard any
+	  # '-mllvm $arg' options when doing the link step.
 	  prev=
 	  continue
 	  ;;
@@ -5321,21 +6726,21 @@ func_mode_link ()
 
 		if test -z "$pic_object" ||
 		   test -z "$non_pic_object" ||
-		   test "$pic_object" = none &&
-		   test "$non_pic_object" = none; then
-		  func_fatal_error "cannot find name of object for \`$arg'"
+		   test none = "$pic_object" &&
+		   test none = "$non_pic_object"; then
+		  func_fatal_error "cannot find name of object for '$arg'"
 		fi
 
 		# Extract subdirectory from the argument.
 		func_dirname "$arg" "/" ""
-		xdir="$func_dirname_result"
+		xdir=$func_dirname_result
 
-		if test "$pic_object" != none; then
+		if test none != "$pic_object"; then
 		  # Prepend the subdirectory the object is found in.
-		  pic_object="$xdir$pic_object"
+		  pic_object=$xdir$pic_object
 
-		  if test "$prev" = dlfiles; then
-		    if test "$build_libtool_libs" = yes && test "$dlopen_support" = yes; then
+		  if test dlfiles = "$prev"; then
+		    if test yes = "$build_libtool_libs" && test yes = "$dlopen_support"; then
 		      func_append dlfiles " $pic_object"
 		      prev=
 		      continue
@@ -5346,7 +6751,7 @@ func_mode_link ()
 		  fi
 
 		  # CHECK ME:  I think I busted this.  -Ossama
-		  if test "$prev" = dlprefiles; then
+		  if test dlprefiles = "$prev"; then
 		    # Preload the old-style object.
 		    func_append dlprefiles " $pic_object"
 		    prev=
@@ -5354,23 +6759,23 @@ func_mode_link ()
 
 		  # A PIC object.
 		  func_append libobjs " $pic_object"
-		  arg="$pic_object"
+		  arg=$pic_object
 		fi
 
 		# Non-PIC object.
-		if test "$non_pic_object" != none; then
+		if test none != "$non_pic_object"; then
 		  # Prepend the subdirectory the object is found in.
-		  non_pic_object="$xdir$non_pic_object"
+		  non_pic_object=$xdir$non_pic_object
 
 		  # A standard non-PIC object
 		  func_append non_pic_objects " $non_pic_object"
-		  if test -z "$pic_object" || test "$pic_object" = none ; then
-		    arg="$non_pic_object"
+		  if test -z "$pic_object" || test none = "$pic_object"; then
+		    arg=$non_pic_object
 		  fi
 		else
 		  # If the PIC object exists, use it instead.
 		  # $xdir was prepended to $pic_object above.
-		  non_pic_object="$pic_object"
+		  non_pic_object=$pic_object
 		  func_append non_pic_objects " $non_pic_object"
 		fi
 	      else
@@ -5378,7 +6783,7 @@ func_mode_link ()
 		if $opt_dry_run; then
 		  # Extract subdirectory from the argument.
 		  func_dirname "$arg" "/" ""
-		  xdir="$func_dirname_result"
+		  xdir=$func_dirname_result
 
 		  func_lo2o "$arg"
 		  pic_object=$xdir$objdir/$func_lo2o_result
@@ -5386,24 +6791,29 @@ func_mode_link ()
 		  func_append libobjs " $pic_object"
 		  func_append non_pic_objects " $non_pic_object"
 	        else
-		  func_fatal_error "\`$arg' is not a valid libtool object"
+		  func_fatal_error "'$arg' is not a valid libtool object"
 		fi
 	      fi
 	    done
 	  else
-	    func_fatal_error "link input file \`$arg' does not exist"
+	    func_fatal_error "link input file '$arg' does not exist"
 	  fi
 	  arg=$save_arg
 	  prev=
 	  continue
 	  ;;
+	os2dllname)
+	  os2dllname=$arg
+	  prev=
+	  continue
+	  ;;
 	precious_regex)
-	  precious_files_regex="$arg"
+	  precious_files_regex=$arg
 	  prev=
 	  continue
 	  ;;
 	release)
-	  release="-$arg"
+	  release=-$arg
 	  prev=
 	  continue
 	  ;;
@@ -5415,7 +6825,7 @@ func_mode_link ()
 	    func_fatal_error "only absolute run-paths are allowed"
 	    ;;
 	  esac
-	  if test "$prev" = rpath; then
+	  if test rpath = "$prev"; then
 	    case "$rpath " in
 	    *" $arg "*) ;;
 	    *) func_append rpath " $arg" ;;
@@ -5430,7 +6840,7 @@ func_mode_link ()
 	  continue
 	  ;;
 	shrext)
-	  shrext_cmds="$arg"
+	  shrext_cmds=$arg
 	  prev=
 	  continue
 	  ;;
@@ -5470,7 +6880,7 @@ func_mode_link ()
 	esac
       fi # test -n "$prev"
 
-      prevarg="$arg"
+      prevarg=$arg
 
       case $arg in
       -all-static)
@@ -5484,7 +6894,7 @@ func_mode_link ()
 
       -allow-undefined)
 	# FIXME: remove this flag sometime in the future.
-	func_fatal_error "\`-allow-undefined' must not be used because it is the default"
+	func_fatal_error "'-allow-undefined' must not be used because it is the default"
 	;;
 
       -avoid-version)
@@ -5516,7 +6926,7 @@ func_mode_link ()
 	if test -n "$export_symbols" || test -n "$export_symbols_regex"; then
 	  func_fatal_error "more than one -exported-symbols argument is not allowed"
 	fi
-	if test "X$arg" = "X-export-symbols"; then
+	if test X-export-symbols = "X$arg"; then
 	  prev=expsyms
 	else
 	  prev=expsyms_regex
@@ -5550,9 +6960,9 @@ func_mode_link ()
 	func_stripname "-L" '' "$arg"
 	if test -z "$func_stripname_result"; then
 	  if test "$#" -gt 0; then
-	    func_fatal_error "require no space between \`-L' and \`$1'"
+	    func_fatal_error "require no space between '-L' and '$1'"
 	  else
-	    func_fatal_error "need path for \`-L' option"
+	    func_fatal_error "need path for '-L' option"
 	  fi
 	fi
 	func_resolve_sysroot "$func_stripname_result"
@@ -5563,8 +6973,8 @@ func_mode_link ()
 	*)
 	  absdir=`cd "$dir" && pwd`
 	  test -z "$absdir" && \
-	    func_fatal_error "cannot determine absolute directory name of \`$dir'"
-	  dir="$absdir"
+	    func_fatal_error "cannot determine absolute directory name of '$dir'"
+	  dir=$absdir
 	  ;;
 	esac
 	case "$deplibs " in
@@ -5599,7 +7009,7 @@ func_mode_link ()
 	;;
 
       -l*)
-	if test "X$arg" = "X-lc" || test "X$arg" = "X-lm"; then
+	if test X-lc = "X$arg" || test X-lm = "X$arg"; then
 	  case $host in
 	  *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-beos* | *-cegcc* | *-*-haiku*)
 	    # These systems don't actually have a C or math library (as such)
@@ -5607,11 +7017,11 @@ func_mode_link ()
 	    ;;
 	  *-*-os2*)
 	    # These systems don't actually have a C library (as such)
-	    test "X$arg" = "X-lc" && continue
+	    test X-lc = "X$arg" && continue
 	    ;;
-	  *-*-openbsd* | *-*-freebsd* | *-*-dragonfly*)
+	  *-*-openbsd* | *-*-freebsd* | *-*-dragonfly* | *-*-bitrig*)
 	    # Do not include libc due to us having libc/libc_r.
-	    test "X$arg" = "X-lc" && continue
+	    test X-lc = "X$arg" && continue
 	    ;;
 	  *-*-rhapsody* | *-*-darwin1.[012])
 	    # Rhapsody C and math libraries are in the System framework
@@ -5620,16 +7030,16 @@ func_mode_link ()
 	    ;;
 	  *-*-sco3.2v5* | *-*-sco5v6*)
 	    # Causes problems with __ctype
-	    test "X$arg" = "X-lc" && continue
+	    test X-lc = "X$arg" && continue
 	    ;;
 	  *-*-sysv4.2uw2* | *-*-sysv5* | *-*-unixware* | *-*-OpenUNIX*)
 	    # Compiler inserts libc in the correct place for threads to work
-	    test "X$arg" = "X-lc" && continue
+	    test X-lc = "X$arg" && continue
 	    ;;
 	  esac
-	elif test "X$arg" = "X-lc_r"; then
+	elif test X-lc_r = "X$arg"; then
 	 case $host in
-	 *-*-openbsd* | *-*-freebsd* | *-*-dragonfly*)
+	 *-*-openbsd* | *-*-freebsd* | *-*-dragonfly* | *-*-bitrig*)
 	   # Do not include libc_r directly, use -pthread flag.
 	   continue
 	   ;;
@@ -5639,6 +7049,11 @@ func_mode_link ()
 	continue
 	;;
 
+      -mllvm)
+	prev=mllvm
+	continue
+	;;
+
       -module)
 	module=yes
 	continue
@@ -5668,7 +7083,7 @@ func_mode_link ()
 	;;
 
       -multi_module)
-	single_module="${wl}-multi_module"
+	single_module=$wl-multi_module
 	continue
 	;;
 
@@ -5682,8 +7097,8 @@ func_mode_link ()
 	*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-*-darwin* | *-cegcc*)
 	  # The PATH hackery in wrapper scripts is required on Windows
 	  # and Darwin in order for the loader to find any dlls it needs.
-	  func_warning "\`-no-install' is ignored for $host"
-	  func_warning "assuming \`-no-fast-install' instead"
+	  func_warning "'-no-install' is ignored for $host"
+	  func_warning "assuming '-no-fast-install' instead"
 	  fast_install=no
 	  ;;
 	*) no_install=yes ;;
@@ -5701,6 +7116,11 @@ func_mode_link ()
 	continue
 	;;
 
+      -os2dllname)
+	prev=os2dllname
+	continue
+	;;
+
       -o) prev=output ;;
 
       -precious-files-regex)
@@ -5788,14 +7208,14 @@ func_mode_link ()
 	func_stripname '-Wc,' '' "$arg"
 	args=$func_stripname_result
 	arg=
-	save_ifs="$IFS"; IFS=','
+	save_ifs=$IFS; IFS=,
 	for flag in $args; do
-	  IFS="$save_ifs"
+	  IFS=$save_ifs
           func_quote_for_eval "$flag"
 	  func_append arg " $func_quote_for_eval_result"
 	  func_append compiler_flags " $func_quote_for_eval_result"
 	done
-	IFS="$save_ifs"
+	IFS=$save_ifs
 	func_stripname ' ' '' "$arg"
 	arg=$func_stripname_result
 	;;
@@ -5804,15 +7224,15 @@ func_mode_link ()
 	func_stripname '-Wl,' '' "$arg"
 	args=$func_stripname_result
 	arg=
-	save_ifs="$IFS"; IFS=','
+	save_ifs=$IFS; IFS=,
 	for flag in $args; do
-	  IFS="$save_ifs"
+	  IFS=$save_ifs
           func_quote_for_eval "$flag"
 	  func_append arg " $wl$func_quote_for_eval_result"
 	  func_append compiler_flags " $wl$func_quote_for_eval_result"
 	  func_append linker_flags " $func_quote_for_eval_result"
 	done
-	IFS="$save_ifs"
+	IFS=$save_ifs
 	func_stripname ' ' '' "$arg"
 	arg=$func_stripname_result
 	;;
@@ -5835,7 +7255,7 @@ func_mode_link ()
       # -msg_* for osf cc
       -msg_*)
 	func_quote_for_eval "$arg"
-	arg="$func_quote_for_eval_result"
+	arg=$func_quote_for_eval_result
 	;;
 
       # Flags to be passed through unchanged, with rationale:
@@ -5847,25 +7267,46 @@ func_mode_link ()
       # -m*, -t[45]*, -txscale* architecture-specific flags for GCC
       # -F/path              path to uninstalled frameworks, gcc on darwin
       # -p, -pg, --coverage, -fprofile-*  profiling flags for GCC
+      # -fstack-protector*   stack protector flags for GCC
       # @file                GCC response files
       # -tp=*                Portland pgcc target processor selection
       # --sysroot=*          for sysroot support
-      # -O*, -flto*, -fwhopr*, -fuse-linker-plugin GCC link-time optimization
+      # -O*, -g*, -flto*, -fwhopr*, -fuse-linker-plugin GCC link-time optimization
+      # -stdlib=*            select c++ std lib with clang
       -64|-mips[0-9]|-r[0-9][0-9]*|-xarch=*|-xtarget=*|+DA*|+DD*|-q*|-m*| \
       -t[45]*|-txscale*|-p|-pg|--coverage|-fprofile-*|-F*|@*|-tp=*|--sysroot=*| \
-      -O*|-flto*|-fwhopr*|-fuse-linker-plugin)
+      -O*|-g*|-flto*|-fwhopr*|-fuse-linker-plugin|-fstack-protector*|-stdlib=*)
         func_quote_for_eval "$arg"
-	arg="$func_quote_for_eval_result"
+	arg=$func_quote_for_eval_result
         func_append compile_command " $arg"
         func_append finalize_command " $arg"
         func_append compiler_flags " $arg"
         continue
         ;;
 
+      -Z*)
+        if test os2 = "`expr $host : '.*\(os2\)'`"; then
+          # OS/2 uses -Zxxx to specify OS/2-specific options
+	  compiler_flags="$compiler_flags $arg"
+	  func_append compile_command " $arg"
+	  func_append finalize_command " $arg"
+	  case $arg in
+	  -Zlinker | -Zstack)
+	    prev=xcompiler
+	    ;;
+	  esac
+	  continue
+        else
+	  # Otherwise treat like 'Some other compiler flag' below
+	  func_quote_for_eval "$arg"
+	  arg=$func_quote_for_eval_result
+        fi
+	;;
+
       # Some other compiler flag.
       -* | +*)
         func_quote_for_eval "$arg"
-	arg="$func_quote_for_eval_result"
+	arg=$func_quote_for_eval_result
 	;;
 
       *.$objext)
@@ -5886,21 +7327,21 @@ func_mode_link ()
 
 	  if test -z "$pic_object" ||
 	     test -z "$non_pic_object" ||
-	     test "$pic_object" = none &&
-	     test "$non_pic_object" = none; then
-	    func_fatal_error "cannot find name of object for \`$arg'"
+	     test none = "$pic_object" &&
+	     test none = "$non_pic_object"; then
+	    func_fatal_error "cannot find name of object for '$arg'"
 	  fi
 
 	  # Extract subdirectory from the argument.
 	  func_dirname "$arg" "/" ""
-	  xdir="$func_dirname_result"
+	  xdir=$func_dirname_result
 
-	  if test "$pic_object" != none; then
+	  test none = "$pic_object" || {
 	    # Prepend the subdirectory the object is found in.
-	    pic_object="$xdir$pic_object"
+	    pic_object=$xdir$pic_object
 
-	    if test "$prev" = dlfiles; then
-	      if test "$build_libtool_libs" = yes && test "$dlopen_support" = yes; then
+	    if test dlfiles = "$prev"; then
+	      if test yes = "$build_libtool_libs" && test yes = "$dlopen_support"; then
 		func_append dlfiles " $pic_object"
 		prev=
 		continue
@@ -5911,7 +7352,7 @@ func_mode_link ()
 	    fi
 
 	    # CHECK ME:  I think I busted this.  -Ossama
-	    if test "$prev" = dlprefiles; then
+	    if test dlprefiles = "$prev"; then
 	      # Preload the old-style object.
 	      func_append dlprefiles " $pic_object"
 	      prev=
@@ -5919,23 +7360,23 @@ func_mode_link ()
 
 	    # A PIC object.
 	    func_append libobjs " $pic_object"
-	    arg="$pic_object"
-	  fi
+	    arg=$pic_object
+	  }
 
 	  # Non-PIC object.
-	  if test "$non_pic_object" != none; then
+	  if test none != "$non_pic_object"; then
 	    # Prepend the subdirectory the object is found in.
-	    non_pic_object="$xdir$non_pic_object"
+	    non_pic_object=$xdir$non_pic_object
 
 	    # A standard non-PIC object
 	    func_append non_pic_objects " $non_pic_object"
-	    if test -z "$pic_object" || test "$pic_object" = none ; then
-	      arg="$non_pic_object"
+	    if test -z "$pic_object" || test none = "$pic_object"; then
+	      arg=$non_pic_object
 	    fi
 	  else
 	    # If the PIC object exists, use it instead.
 	    # $xdir was prepended to $pic_object above.
-	    non_pic_object="$pic_object"
+	    non_pic_object=$pic_object
 	    func_append non_pic_objects " $non_pic_object"
 	  fi
 	else
@@ -5943,7 +7384,7 @@ func_mode_link ()
 	  if $opt_dry_run; then
 	    # Extract subdirectory from the argument.
 	    func_dirname "$arg" "/" ""
-	    xdir="$func_dirname_result"
+	    xdir=$func_dirname_result
 
 	    func_lo2o "$arg"
 	    pic_object=$xdir$objdir/$func_lo2o_result
@@ -5951,7 +7392,7 @@ func_mode_link ()
 	    func_append libobjs " $pic_object"
 	    func_append non_pic_objects " $non_pic_object"
 	  else
-	    func_fatal_error "\`$arg' is not a valid libtool object"
+	    func_fatal_error "'$arg' is not a valid libtool object"
 	  fi
 	fi
 	;;
@@ -5967,11 +7408,11 @@ func_mode_link ()
 	# A libtool-controlled library.
 
 	func_resolve_sysroot "$arg"
-	if test "$prev" = dlfiles; then
+	if test dlfiles = "$prev"; then
 	  # This library was specified with -dlopen.
 	  func_append dlfiles " $func_resolve_sysroot_result"
 	  prev=
-	elif test "$prev" = dlprefiles; then
+	elif test dlprefiles = "$prev"; then
 	  # The library was specified with -dlpreopen.
 	  func_append dlprefiles " $func_resolve_sysroot_result"
 	  prev=
@@ -5986,7 +7427,7 @@ func_mode_link ()
 	# Unknown arguments in both finalize_command and compile_command need
 	# to be aesthetically quoted because they are evaled later.
 	func_quote_for_eval "$arg"
-	arg="$func_quote_for_eval_result"
+	arg=$func_quote_for_eval_result
 	;;
       esac # arg
 
@@ -5998,9 +7439,9 @@ func_mode_link ()
     done # argument parsing loop
 
     test -n "$prev" && \
-      func_fatal_help "the \`$prevarg' option requires an argument"
+      func_fatal_help "the '$prevarg' option requires an argument"
 
-    if test "$export_dynamic" = yes && test -n "$export_dynamic_flag_spec"; then
+    if test yes = "$export_dynamic" && test -n "$export_dynamic_flag_spec"; then
       eval arg=\"$export_dynamic_flag_spec\"
       func_append compile_command " $arg"
       func_append finalize_command " $arg"
@@ -6009,20 +7450,23 @@ func_mode_link ()
     oldlibs=
     # calculate the name of the file, without its directory
     func_basename "$output"
-    outputname="$func_basename_result"
-    libobjs_save="$libobjs"
+    outputname=$func_basename_result
+    libobjs_save=$libobjs
 
     if test -n "$shlibpath_var"; then
       # get the directories listed in $shlibpath_var
-      eval shlib_search_path=\`\$ECHO \"\${$shlibpath_var}\" \| \$SED \'s/:/ /g\'\`
+      eval shlib_search_path=\`\$ECHO \"\$$shlibpath_var\" \| \$SED \'s/:/ /g\'\`
     else
       shlib_search_path=
     fi
     eval sys_lib_search_path=\"$sys_lib_search_path_spec\"
     eval sys_lib_dlsearch_path=\"$sys_lib_dlsearch_path_spec\"
 
+    # Definition is injected by LT_CONFIG during libtool generation.
+    func_munge_path_list sys_lib_dlsearch_path "$LT_SYS_LIBRARY_PATH"
+
     func_dirname "$output" "/" ""
-    output_objdir="$func_dirname_result$objdir"
+    output_objdir=$func_dirname_result$objdir
     func_to_tool_file "$output_objdir/"
     tool_output_objdir=$func_to_tool_file_result
     # Create the object directory.
@@ -6045,7 +7489,7 @@ func_mode_link ()
     # Find all interdependent deplibs by searching for libraries
     # that are linked more than once (e.g. -la -lb -la)
     for deplib in $deplibs; do
-      if $opt_preserve_dup_deps ; then
+      if $opt_preserve_dup_deps; then
 	case "$libs " in
 	*" $deplib "*) func_append specialdeplibs " $deplib" ;;
 	esac
@@ -6053,7 +7497,7 @@ func_mode_link ()
       func_append libs " $deplib"
     done
 
-    if test "$linkmode" = lib; then
+    if test lib = "$linkmode"; then
       libs="$predeps $libs $compiler_lib_search_path $postdeps"
 
       # Compute libraries that are listed more than once in $predeps
@@ -6085,7 +7529,7 @@ func_mode_link ()
 	  case $file in
 	  *.la) ;;
 	  *)
-	    func_fatal_help "libraries can \`-dlopen' only libtool libraries: $file"
+	    func_fatal_help "libraries can '-dlopen' only libtool libraries: $file"
 	    ;;
 	  esac
 	done
@@ -6093,7 +7537,7 @@ func_mode_link ()
     prog)
 	compile_deplibs=
 	finalize_deplibs=
-	alldeplibs=no
+	alldeplibs=false
 	newdlfiles=
 	newdlprefiles=
 	passes="conv scan dlopen dlpreopen link"
@@ -6105,29 +7549,29 @@ func_mode_link ()
     for pass in $passes; do
       # The preopen pass in lib mode reverses $deplibs; put it back here
       # so that -L comes before libs that need it for instance...
-      if test "$linkmode,$pass" = "lib,link"; then
+      if test lib,link = "$linkmode,$pass"; then
 	## FIXME: Find the place where the list is rebuilt in the wrong
 	##        order, and fix it there properly
         tmp_deplibs=
 	for deplib in $deplibs; do
 	  tmp_deplibs="$deplib $tmp_deplibs"
 	done
-	deplibs="$tmp_deplibs"
+	deplibs=$tmp_deplibs
       fi
 
-      if test "$linkmode,$pass" = "lib,link" ||
-	 test "$linkmode,$pass" = "prog,scan"; then
-	libs="$deplibs"
+      if test lib,link = "$linkmode,$pass" ||
+	 test prog,scan = "$linkmode,$pass"; then
+	libs=$deplibs
 	deplibs=
       fi
-      if test "$linkmode" = prog; then
+      if test prog = "$linkmode"; then
 	case $pass in
-	dlopen) libs="$dlfiles" ;;
-	dlpreopen) libs="$dlprefiles" ;;
+	dlopen) libs=$dlfiles ;;
+	dlpreopen) libs=$dlprefiles ;;
 	link) libs="$deplibs %DEPLIBS% $dependency_libs" ;;
 	esac
       fi
-      if test "$linkmode,$pass" = "lib,dlpreopen"; then
+      if test lib,dlpreopen = "$linkmode,$pass"; then
 	# Collect and forward deplibs of preopened libtool libs
 	for lib in $dlprefiles; do
 	  # Ignore non-libtool-libs
@@ -6148,26 +7592,26 @@ func_mode_link ()
 	    esac
 	  done
 	done
-	libs="$dlprefiles"
+	libs=$dlprefiles
       fi
-      if test "$pass" = dlopen; then
+      if test dlopen = "$pass"; then
 	# Collect dlpreopened libraries
-	save_deplibs="$deplibs"
+	save_deplibs=$deplibs
 	deplibs=
       fi
 
       for deplib in $libs; do
 	lib=
-	found=no
+	found=false
 	case $deplib in
 	-mt|-mthreads|-kthread|-Kthread|-pthread|-pthreads|--thread-safe \
         |-threads|-fopenmp|-openmp|-mp|-xopenmp|-omp|-qsmp=*)
-	  if test "$linkmode,$pass" = "prog,link"; then
+	  if test prog,link = "$linkmode,$pass"; then
 	    compile_deplibs="$deplib $compile_deplibs"
 	    finalize_deplibs="$deplib $finalize_deplibs"
 	  else
 	    func_append compiler_flags " $deplib"
-	    if test "$linkmode" = lib ; then
+	    if test lib = "$linkmode"; then
 		case "$new_inherited_linker_flags " in
 		    *" $deplib "*) ;;
 		    * ) func_append new_inherited_linker_flags " $deplib" ;;
@@ -6177,13 +7621,13 @@ func_mode_link ()
 	  continue
 	  ;;
 	-l*)
-	  if test "$linkmode" != lib && test "$linkmode" != prog; then
-	    func_warning "\`-l' is ignored for archives/objects"
+	  if test lib != "$linkmode" && test prog != "$linkmode"; then
+	    func_warning "'-l' is ignored for archives/objects"
 	    continue
 	  fi
 	  func_stripname '-l' '' "$deplib"
 	  name=$func_stripname_result
-	  if test "$linkmode" = lib; then
+	  if test lib = "$linkmode"; then
 	    searchdirs="$newlib_search_path $lib_search_path $compiler_lib_search_dirs $sys_lib_search_path $shlib_search_path"
 	  else
 	    searchdirs="$newlib_search_path $lib_search_path $sys_lib_search_path $shlib_search_path"
@@ -6191,31 +7635,22 @@ func_mode_link ()
 	  for searchdir in $searchdirs; do
 	    for search_ext in .la $std_shrext .so .a; do
 	      # Search the libtool library
-	      lib="$searchdir/lib${name}${search_ext}"
+	      lib=$searchdir/lib$name$search_ext
 	      if test -f "$lib"; then
-		if test "$search_ext" = ".la"; then
-		  found=yes
+		if test .la = "$search_ext"; then
+		  found=:
 		else
-		  found=no
+		  found=false
 		fi
 		break 2
 	      fi
 	    done
 	  done
-	  if test "$found" != yes; then
-	    # deplib doesn't seem to be a libtool library
-	    if test "$linkmode,$pass" = "prog,link"; then
-	      compile_deplibs="$deplib $compile_deplibs"
-	      finalize_deplibs="$deplib $finalize_deplibs"
-	    else
-	      deplibs="$deplib $deplibs"
-	      test "$linkmode" = lib && newdependency_libs="$deplib $newdependency_libs"
-	    fi
-	    continue
-	  else # deplib is a libtool library
+	  if $found; then
+	    # deplib is a libtool library
 	    # If $allow_libtool_libs_with_static_runtimes && $deplib is a stdlib,
 	    # We need to do some special things here, and not later.
-	    if test "X$allow_libtool_libs_with_static_runtimes" = "Xyes" ; then
+	    if test yes = "$allow_libtool_libs_with_static_runtimes"; then
 	      case " $predeps $postdeps " in
 	      *" $deplib "*)
 		if func_lalib_p "$lib"; then
@@ -6223,19 +7658,19 @@ func_mode_link ()
 		  old_library=
 		  func_source "$lib"
 		  for l in $old_library $library_names; do
-		    ll="$l"
+		    ll=$l
 		  done
-		  if test "X$ll" = "X$old_library" ; then # only static version available
-		    found=no
+		  if test "X$ll" = "X$old_library"; then # only static version available
+		    found=false
 		    func_dirname "$lib" "" "."
-		    ladir="$func_dirname_result"
+		    ladir=$func_dirname_result
 		    lib=$ladir/$old_library
-		    if test "$linkmode,$pass" = "prog,link"; then
+		    if test prog,link = "$linkmode,$pass"; then
 		      compile_deplibs="$deplib $compile_deplibs"
 		      finalize_deplibs="$deplib $finalize_deplibs"
 		    else
 		      deplibs="$deplib $deplibs"
-		      test "$linkmode" = lib && newdependency_libs="$deplib $newdependency_libs"
+		      test lib = "$linkmode" && newdependency_libs="$deplib $newdependency_libs"
 		    fi
 		    continue
 		  fi
@@ -6244,15 +7679,25 @@ func_mode_link ()
 	      *) ;;
 	      esac
 	    fi
+	  else
+	    # deplib doesn't seem to be a libtool library
+	    if test prog,link = "$linkmode,$pass"; then
+	      compile_deplibs="$deplib $compile_deplibs"
+	      finalize_deplibs="$deplib $finalize_deplibs"
+	    else
+	      deplibs="$deplib $deplibs"
+	      test lib = "$linkmode" && newdependency_libs="$deplib $newdependency_libs"
+	    fi
+	    continue
 	  fi
 	  ;; # -l
 	*.ltframework)
-	  if test "$linkmode,$pass" = "prog,link"; then
+	  if test prog,link = "$linkmode,$pass"; then
 	    compile_deplibs="$deplib $compile_deplibs"
 	    finalize_deplibs="$deplib $finalize_deplibs"
 	  else
 	    deplibs="$deplib $deplibs"
-	    if test "$linkmode" = lib ; then
+	    if test lib = "$linkmode"; then
 		case "$new_inherited_linker_flags " in
 		    *" $deplib "*) ;;
 		    * ) func_append new_inherited_linker_flags " $deplib" ;;
@@ -6265,18 +7710,18 @@ func_mode_link ()
 	  case $linkmode in
 	  lib)
 	    deplibs="$deplib $deplibs"
-	    test "$pass" = conv && continue
+	    test conv = "$pass" && continue
 	    newdependency_libs="$deplib $newdependency_libs"
 	    func_stripname '-L' '' "$deplib"
 	    func_resolve_sysroot "$func_stripname_result"
 	    func_append newlib_search_path " $func_resolve_sysroot_result"
 	    ;;
 	  prog)
-	    if test "$pass" = conv; then
+	    if test conv = "$pass"; then
 	      deplibs="$deplib $deplibs"
 	      continue
 	    fi
-	    if test "$pass" = scan; then
+	    if test scan = "$pass"; then
 	      deplibs="$deplib $deplibs"
 	    else
 	      compile_deplibs="$deplib $compile_deplibs"
@@ -6287,13 +7732,13 @@ func_mode_link ()
 	    func_append newlib_search_path " $func_resolve_sysroot_result"
 	    ;;
 	  *)
-	    func_warning "\`-L' is ignored for archives/objects"
+	    func_warning "'-L' is ignored for archives/objects"
 	    ;;
 	  esac # linkmode
 	  continue
 	  ;; # -L
 	-R*)
-	  if test "$pass" = link; then
+	  if test link = "$pass"; then
 	    func_stripname '-R' '' "$deplib"
 	    func_resolve_sysroot "$func_stripname_result"
 	    dir=$func_resolve_sysroot_result
@@ -6311,7 +7756,7 @@ func_mode_link ()
 	  lib=$func_resolve_sysroot_result
 	  ;;
 	*.$libext)
-	  if test "$pass" = conv; then
+	  if test conv = "$pass"; then
 	    deplibs="$deplib $deplibs"
 	    continue
 	  fi
@@ -6322,21 +7767,26 @@ func_mode_link ()
 	    case " $dlpreconveniencelibs " in
 	    *" $deplib "*) ;;
 	    *)
-	      valid_a_lib=no
+	      valid_a_lib=false
 	      case $deplibs_check_method in
 		match_pattern*)
 		  set dummy $deplibs_check_method; shift
 		  match_pattern_regex=`expr "$deplibs_check_method" : "$1 \(.*\)"`
 		  if eval "\$ECHO \"$deplib\"" 2>/dev/null | $SED 10q \
 		    | $EGREP "$match_pattern_regex" > /dev/null; then
-		    valid_a_lib=yes
+		    valid_a_lib=:
 		  fi
 		;;
 		pass_all)
-		  valid_a_lib=yes
+		  valid_a_lib=:
 		;;
 	      esac
-	      if test "$valid_a_lib" != yes; then
+	      if $valid_a_lib; then
+		echo
+		$ECHO "*** Warning: Linking the shared library $output against the"
+		$ECHO "*** static library $deplib is not portable!"
+		deplibs="$deplib $deplibs"
+	      else
 		echo
 		$ECHO "*** Warning: Trying to link with static lib archive $deplib."
 		echo "*** I have the capability to make that library automatically link in when"
@@ -6344,18 +7794,13 @@ func_mode_link ()
 		echo "*** shared version of the library, which you do not appear to have"
 		echo "*** because the file extensions .$libext of this argument makes me believe"
 		echo "*** that it is just a static archive that I should not use here."
-	      else
-		echo
-		$ECHO "*** Warning: Linking the shared library $output against the"
-		$ECHO "*** static library $deplib is not portable!"
-		deplibs="$deplib $deplibs"
 	      fi
 	      ;;
 	    esac
 	    continue
 	    ;;
 	  prog)
-	    if test "$pass" != link; then
+	    if test link != "$pass"; then
 	      deplibs="$deplib $deplibs"
 	    else
 	      compile_deplibs="$deplib $compile_deplibs"
@@ -6366,10 +7811,10 @@ func_mode_link ()
 	  esac # linkmode
 	  ;; # *.$libext
 	*.lo | *.$objext)
-	  if test "$pass" = conv; then
+	  if test conv = "$pass"; then
 	    deplibs="$deplib $deplibs"
-	  elif test "$linkmode" = prog; then
-	    if test "$pass" = dlpreopen || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then
+	  elif test prog = "$linkmode"; then
+	    if test dlpreopen = "$pass" || test yes != "$dlopen_support" || test no = "$build_libtool_libs"; then
 	      # If there is no dlopen support or we're linking statically,
 	      # we need to preload.
 	      func_append newdlprefiles " $deplib"
@@ -6382,22 +7827,20 @@ func_mode_link ()
 	  continue
 	  ;;
 	%DEPLIBS%)
-	  alldeplibs=yes
+	  alldeplibs=:
 	  continue
 	  ;;
 	esac # case $deplib
 
-	if test "$found" = yes || test -f "$lib"; then :
-	else
-	  func_fatal_error "cannot find the library \`$lib' or unhandled argument \`$deplib'"
-	fi
+	$found || test -f "$lib" \
+	  || func_fatal_error "cannot find the library '$lib' or unhandled argument '$deplib'"
 
 	# Check to see that this really is a libtool archive.
 	func_lalib_unsafe_p "$lib" \
-	  || func_fatal_error "\`$lib' is not a valid libtool archive"
+	  || func_fatal_error "'$lib' is not a valid libtool archive"
 
 	func_dirname "$lib" "" "."
-	ladir="$func_dirname_result"
+	ladir=$func_dirname_result
 
 	dlname=
 	dlopen=
@@ -6427,30 +7870,30 @@ func_mode_link ()
 	  done
 	fi
 	dependency_libs=`$ECHO " $dependency_libs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
-	if test "$linkmode,$pass" = "lib,link" ||
-	   test "$linkmode,$pass" = "prog,scan" ||
-	   { test "$linkmode" != prog && test "$linkmode" != lib; }; then
+	if test lib,link = "$linkmode,$pass" ||
+	   test prog,scan = "$linkmode,$pass" ||
+	   { test prog != "$linkmode" && test lib != "$linkmode"; }; then
 	  test -n "$dlopen" && func_append dlfiles " $dlopen"
 	  test -n "$dlpreopen" && func_append dlprefiles " $dlpreopen"
 	fi
 
-	if test "$pass" = conv; then
+	if test conv = "$pass"; then
 	  # Only check for convenience libraries
 	  deplibs="$lib $deplibs"
 	  if test -z "$libdir"; then
 	    if test -z "$old_library"; then
-	      func_fatal_error "cannot find name of link library for \`$lib'"
+	      func_fatal_error "cannot find name of link library for '$lib'"
 	    fi
 	    # It is a libtool convenience library, so add in its objects.
 	    func_append convenience " $ladir/$objdir/$old_library"
 	    func_append old_convenience " $ladir/$objdir/$old_library"
-	  elif test "$linkmode" != prog && test "$linkmode" != lib; then
-	    func_fatal_error "\`$lib' is not a convenience library"
+	  elif test prog != "$linkmode" && test lib != "$linkmode"; then
+	    func_fatal_error "'$lib' is not a convenience library"
 	  fi
 	  tmp_libs=
 	  for deplib in $dependency_libs; do
 	    deplibs="$deplib $deplibs"
-	    if $opt_preserve_dup_deps ; then
+	    if $opt_preserve_dup_deps; then
 	      case "$tmp_libs " in
 	      *" $deplib "*) func_append specialdeplibs " $deplib" ;;
 	      esac
@@ -6464,26 +7907,26 @@ func_mode_link ()
 	# Get the name of the library we link against.
 	linklib=
 	if test -n "$old_library" &&
-	   { test "$prefer_static_libs" = yes ||
-	     test "$prefer_static_libs,$installed" = "built,no"; }; then
+	   { test yes = "$prefer_static_libs" ||
+	     test built,no = "$prefer_static_libs,$installed"; }; then
 	  linklib=$old_library
 	else
 	  for l in $old_library $library_names; do
-	    linklib="$l"
+	    linklib=$l
 	  done
 	fi
 	if test -z "$linklib"; then
-	  func_fatal_error "cannot find name of link library for \`$lib'"
+	  func_fatal_error "cannot find name of link library for '$lib'"
 	fi
 
 	# This library was specified with -dlopen.
-	if test "$pass" = dlopen; then
-	  if test -z "$libdir"; then
-	    func_fatal_error "cannot -dlopen a convenience library: \`$lib'"
-	  fi
+	if test dlopen = "$pass"; then
+	  test -z "$libdir" \
+	    && func_fatal_error "cannot -dlopen a convenience library: '$lib'"
 	  if test -z "$dlname" ||
-	     test "$dlopen_support" != yes ||
-	     test "$build_libtool_libs" = no; then
+	     test yes != "$dlopen_support" ||
+	     test no = "$build_libtool_libs"
+	  then
 	    # If there is no dlname, no dlopen support or we're linking
 	    # statically, we need to preload.  We also need to preload any
 	    # dependent libraries so libltdl's deplib preloader doesn't
@@ -6497,40 +7940,40 @@ func_mode_link ()
 
 	# We need an absolute path.
 	case $ladir in
-	[\\/]* | [A-Za-z]:[\\/]*) abs_ladir="$ladir" ;;
+	[\\/]* | [A-Za-z]:[\\/]*) abs_ladir=$ladir ;;
 	*)
 	  abs_ladir=`cd "$ladir" && pwd`
 	  if test -z "$abs_ladir"; then
-	    func_warning "cannot determine absolute directory name of \`$ladir'"
+	    func_warning "cannot determine absolute directory name of '$ladir'"
 	    func_warning "passing it literally to the linker, although it might fail"
-	    abs_ladir="$ladir"
+	    abs_ladir=$ladir
 	  fi
 	  ;;
 	esac
 	func_basename "$lib"
-	laname="$func_basename_result"
+	laname=$func_basename_result
 
 	# Find the relevant object directory and library name.
-	if test "X$installed" = Xyes; then
+	if test yes = "$installed"; then
 	  if test ! -f "$lt_sysroot$libdir/$linklib" && test -f "$abs_ladir/$linklib"; then
-	    func_warning "library \`$lib' was moved."
-	    dir="$ladir"
-	    absdir="$abs_ladir"
-	    libdir="$abs_ladir"
+	    func_warning "library '$lib' was moved."
+	    dir=$ladir
+	    absdir=$abs_ladir
+	    libdir=$abs_ladir
 	  else
-	    dir="$lt_sysroot$libdir"
-	    absdir="$lt_sysroot$libdir"
+	    dir=$lt_sysroot$libdir
+	    absdir=$lt_sysroot$libdir
 	  fi
-	  test "X$hardcode_automatic" = Xyes && avoidtemprpath=yes
+	  test yes = "$hardcode_automatic" && avoidtemprpath=yes
 	else
 	  if test ! -f "$ladir/$objdir/$linklib" && test -f "$abs_ladir/$linklib"; then
-	    dir="$ladir"
-	    absdir="$abs_ladir"
+	    dir=$ladir
+	    absdir=$abs_ladir
 	    # Remove this search path later
 	    func_append notinst_path " $abs_ladir"
 	  else
-	    dir="$ladir/$objdir"
-	    absdir="$abs_ladir/$objdir"
+	    dir=$ladir/$objdir
+	    absdir=$abs_ladir/$objdir
 	    # Remove this search path later
 	    func_append notinst_path " $abs_ladir"
 	  fi
@@ -6539,11 +7982,11 @@ func_mode_link ()
 	name=$func_stripname_result
 
 	# This library was specified with -dlpreopen.
-	if test "$pass" = dlpreopen; then
-	  if test -z "$libdir" && test "$linkmode" = prog; then
-	    func_fatal_error "only libraries may -dlpreopen a convenience library: \`$lib'"
+	if test dlpreopen = "$pass"; then
+	  if test -z "$libdir" && test prog = "$linkmode"; then
+	    func_fatal_error "only libraries may -dlpreopen a convenience library: '$lib'"
 	  fi
-	  case "$host" in
+	  case $host in
 	    # special handling for platforms with PE-DLLs.
 	    *cygwin* | *mingw* | *cegcc* )
 	      # Linker will automatically link against shared library if both
@@ -6587,9 +8030,9 @@ func_mode_link ()
 
 	if test -z "$libdir"; then
 	  # Link the convenience library
-	  if test "$linkmode" = lib; then
+	  if test lib = "$linkmode"; then
 	    deplibs="$dir/$old_library $deplibs"
-	  elif test "$linkmode,$pass" = "prog,link"; then
+	  elif test prog,link = "$linkmode,$pass"; then
 	    compile_deplibs="$dir/$old_library $compile_deplibs"
 	    finalize_deplibs="$dir/$old_library $finalize_deplibs"
 	  else
@@ -6599,14 +8042,14 @@ func_mode_link ()
 	fi
 
 
-	if test "$linkmode" = prog && test "$pass" != link; then
+	if test prog = "$linkmode" && test link != "$pass"; then
 	  func_append newlib_search_path " $ladir"
 	  deplibs="$lib $deplibs"
 
-	  linkalldeplibs=no
-	  if test "$link_all_deplibs" != no || test -z "$library_names" ||
-	     test "$build_libtool_libs" = no; then
-	    linkalldeplibs=yes
+	  linkalldeplibs=false
+	  if test no != "$link_all_deplibs" || test -z "$library_names" ||
+	     test no = "$build_libtool_libs"; then
+	    linkalldeplibs=:
 	  fi
 
 	  tmp_libs=
@@ -6618,14 +8061,14 @@ func_mode_link ()
 		 ;;
 	    esac
 	    # Need to link against all dependency_libs?
-	    if test "$linkalldeplibs" = yes; then
+	    if $linkalldeplibs; then
 	      deplibs="$deplib $deplibs"
 	    else
 	      # Need to hardcode shared library paths
 	      # or/and link against static libraries
 	      newdependency_libs="$deplib $newdependency_libs"
 	    fi
-	    if $opt_preserve_dup_deps ; then
+	    if $opt_preserve_dup_deps; then
 	      case "$tmp_libs " in
 	      *" $deplib "*) func_append specialdeplibs " $deplib" ;;
 	      esac
@@ -6635,15 +8078,15 @@ func_mode_link ()
 	  continue
 	fi # $linkmode = prog...
 
-	if test "$linkmode,$pass" = "prog,link"; then
+	if test prog,link = "$linkmode,$pass"; then
 	  if test -n "$library_names" &&
-	     { { test "$prefer_static_libs" = no ||
-	         test "$prefer_static_libs,$installed" = "built,yes"; } ||
+	     { { test no = "$prefer_static_libs" ||
+	         test built,yes = "$prefer_static_libs,$installed"; } ||
 	       test -z "$old_library"; }; then
 	    # We need to hardcode the library path
-	    if test -n "$shlibpath_var" && test -z "$avoidtemprpath" ; then
+	    if test -n "$shlibpath_var" && test -z "$avoidtemprpath"; then
 	      # Make sure the rpath contains only unique directories.
-	      case "$temp_rpath:" in
+	      case $temp_rpath: in
 	      *"$absdir:"*) ;;
 	      *) func_append temp_rpath "$absdir:" ;;
 	      esac
@@ -6672,9 +8115,9 @@ func_mode_link ()
 	    esac
 	  fi # $linkmode,$pass = prog,link...
 
-	  if test "$alldeplibs" = yes &&
-	     { test "$deplibs_check_method" = pass_all ||
-	       { test "$build_libtool_libs" = yes &&
+	  if $alldeplibs &&
+	     { test pass_all = "$deplibs_check_method" ||
+	       { test yes = "$build_libtool_libs" &&
 		 test -n "$library_names"; }; }; then
 	    # We only need to search for static libraries
 	    continue
@@ -6683,19 +8126,19 @@ func_mode_link ()
 
 	link_static=no # Whether the deplib will be linked statically
 	use_static_libs=$prefer_static_libs
-	if test "$use_static_libs" = built && test "$installed" = yes; then
+	if test built = "$use_static_libs" && test yes = "$installed"; then
 	  use_static_libs=no
 	fi
 	if test -n "$library_names" &&
-	   { test "$use_static_libs" = no || test -z "$old_library"; }; then
+	   { test no = "$use_static_libs" || test -z "$old_library"; }; then
 	  case $host in
-	  *cygwin* | *mingw* | *cegcc*)
+	  *cygwin* | *mingw* | *cegcc* | *os2*)
 	      # No point in relinking DLLs because paths are not encoded
 	      func_append notinst_deplibs " $lib"
 	      need_relink=no
 	    ;;
 	  *)
-	    if test "$installed" = no; then
+	    if test no = "$installed"; then
 	      func_append notinst_deplibs " $lib"
 	      need_relink=yes
 	    fi
@@ -6705,24 +8148,24 @@ func_mode_link ()
 
 	  # Warn about portability, can't link against -module's on some
 	  # systems (darwin).  Don't bleat about dlopened modules though!
-	  dlopenmodule=""
+	  dlopenmodule=
 	  for dlpremoduletest in $dlprefiles; do
 	    if test "X$dlpremoduletest" = "X$lib"; then
-	      dlopenmodule="$dlpremoduletest"
+	      dlopenmodule=$dlpremoduletest
 	      break
 	    fi
 	  done
-	  if test -z "$dlopenmodule" && test "$shouldnotlink" = yes && test "$pass" = link; then
+	  if test -z "$dlopenmodule" && test yes = "$shouldnotlink" && test link = "$pass"; then
 	    echo
-	    if test "$linkmode" = prog; then
+	    if test prog = "$linkmode"; then
 	      $ECHO "*** Warning: Linking the executable $output against the loadable module"
 	    else
 	      $ECHO "*** Warning: Linking the shared library $output against the loadable module"
 	    fi
 	    $ECHO "*** $linklib is not portable!"
 	  fi
-	  if test "$linkmode" = lib &&
-	     test "$hardcode_into_libs" = yes; then
+	  if test lib = "$linkmode" &&
+	     test yes = "$hardcode_into_libs"; then
 	    # Hardcode the library path.
 	    # Skip directories that are in the system default run-time
 	    # search path.
@@ -6750,43 +8193,43 @@ func_mode_link ()
 	    # figure out the soname
 	    set dummy $library_names
 	    shift
-	    realname="$1"
+	    realname=$1
 	    shift
 	    libname=`eval "\\$ECHO \"$libname_spec\""`
 	    # use dlname if we got it. it's perfectly good, no?
 	    if test -n "$dlname"; then
-	      soname="$dlname"
+	      soname=$dlname
 	    elif test -n "$soname_spec"; then
 	      # bleh windows
 	      case $host in
-	      *cygwin* | mingw* | *cegcc*)
+	      *cygwin* | mingw* | *cegcc* | *os2*)
 	        func_arith $current - $age
 		major=$func_arith_result
-		versuffix="-$major"
+		versuffix=-$major
 		;;
 	      esac
 	      eval soname=\"$soname_spec\"
 	    else
-	      soname="$realname"
+	      soname=$realname
 	    fi
 
 	    # Make a new name for the extract_expsyms_cmds to use
-	    soroot="$soname"
+	    soroot=$soname
 	    func_basename "$soroot"
-	    soname="$func_basename_result"
+	    soname=$func_basename_result
 	    func_stripname 'lib' '.dll' "$soname"
 	    newlib=libimp-$func_stripname_result.a
 
 	    # If the library has no export list, then create one now
 	    if test -f "$output_objdir/$soname-def"; then :
 	    else
-	      func_verbose "extracting exported symbol list from \`$soname'"
+	      func_verbose "extracting exported symbol list from '$soname'"
 	      func_execute_cmds "$extract_expsyms_cmds" 'exit $?'
 	    fi
 
 	    # Create $newlib
 	    if test -f "$output_objdir/$newlib"; then :; else
-	      func_verbose "generating import library for \`$soname'"
+	      func_verbose "generating import library for '$soname'"
 	      func_execute_cmds "$old_archive_from_expsyms_cmds" 'exit $?'
 	    fi
 	    # make sure the library variables are pointing to the new library
@@ -6794,58 +8237,58 @@ func_mode_link ()
 	    linklib=$newlib
 	  fi # test -n "$old_archive_from_expsyms_cmds"
 
-	  if test "$linkmode" = prog || test "$opt_mode" != relink; then
+	  if test prog = "$linkmode" || test relink != "$opt_mode"; then
 	    add_shlibpath=
 	    add_dir=
 	    add=
 	    lib_linked=yes
 	    case $hardcode_action in
 	    immediate | unsupported)
-	      if test "$hardcode_direct" = no; then
-		add="$dir/$linklib"
+	      if test no = "$hardcode_direct"; then
+		add=$dir/$linklib
 		case $host in
-		  *-*-sco3.2v5.0.[024]*) add_dir="-L$dir" ;;
-		  *-*-sysv4*uw2*) add_dir="-L$dir" ;;
+		  *-*-sco3.2v5.0.[024]*) add_dir=-L$dir ;;
+		  *-*-sysv4*uw2*) add_dir=-L$dir ;;
 		  *-*-sysv5OpenUNIX* | *-*-sysv5UnixWare7.[01].[10]* | \
-		    *-*-unixware7*) add_dir="-L$dir" ;;
+		    *-*-unixware7*) add_dir=-L$dir ;;
 		  *-*-darwin* )
-		    # if the lib is a (non-dlopened) module then we can not
+		    # if the lib is a (non-dlopened) module then we cannot
 		    # link against it, someone is ignoring the earlier warnings
 		    if /usr/bin/file -L $add 2> /dev/null |
-			 $GREP ": [^:]* bundle" >/dev/null ; then
+			 $GREP ": [^:]* bundle" >/dev/null; then
 		      if test "X$dlopenmodule" != "X$lib"; then
 			$ECHO "*** Warning: lib $linklib is a module, not a shared library"
-			if test -z "$old_library" ; then
+			if test -z "$old_library"; then
 			  echo
 			  echo "*** And there doesn't seem to be a static archive available"
 			  echo "*** The link will probably fail, sorry"
 			else
-			  add="$dir/$old_library"
+			  add=$dir/$old_library
 			fi
 		      elif test -n "$old_library"; then
-			add="$dir/$old_library"
+			add=$dir/$old_library
 		      fi
 		    fi
 		esac
-	      elif test "$hardcode_minus_L" = no; then
+	      elif test no = "$hardcode_minus_L"; then
 		case $host in
-		*-*-sunos*) add_shlibpath="$dir" ;;
+		*-*-sunos*) add_shlibpath=$dir ;;
 		esac
-		add_dir="-L$dir"
-		add="-l$name"
-	      elif test "$hardcode_shlibpath_var" = no; then
-		add_shlibpath="$dir"
-		add="-l$name"
+		add_dir=-L$dir
+		add=-l$name
+	      elif test no = "$hardcode_shlibpath_var"; then
+		add_shlibpath=$dir
+		add=-l$name
 	      else
 		lib_linked=no
 	      fi
 	      ;;
 	    relink)
-	      if test "$hardcode_direct" = yes &&
-	         test "$hardcode_direct_absolute" = no; then
-		add="$dir/$linklib"
-	      elif test "$hardcode_minus_L" = yes; then
-		add_dir="-L$absdir"
+	      if test yes = "$hardcode_direct" &&
+	         test no = "$hardcode_direct_absolute"; then
+		add=$dir/$linklib
+	      elif test yes = "$hardcode_minus_L"; then
+		add_dir=-L$absdir
 		# Try looking first in the location we're being installed to.
 		if test -n "$inst_prefix_dir"; then
 		  case $libdir in
@@ -6854,10 +8297,10 @@ func_mode_link ()
 		      ;;
 		  esac
 		fi
-		add="-l$name"
-	      elif test "$hardcode_shlibpath_var" = yes; then
-		add_shlibpath="$dir"
-		add="-l$name"
+		add=-l$name
+	      elif test yes = "$hardcode_shlibpath_var"; then
+		add_shlibpath=$dir
+		add=-l$name
 	      else
 		lib_linked=no
 	      fi
@@ -6865,7 +8308,7 @@ func_mode_link ()
 	    *) lib_linked=no ;;
 	    esac
 
-	    if test "$lib_linked" != yes; then
+	    if test yes != "$lib_linked"; then
 	      func_fatal_configuration "unsupported hardcode properties"
 	    fi
 
@@ -6875,15 +8318,15 @@ func_mode_link ()
 	      *) func_append compile_shlibpath "$add_shlibpath:" ;;
 	      esac
 	    fi
-	    if test "$linkmode" = prog; then
+	    if test prog = "$linkmode"; then
 	      test -n "$add_dir" && compile_deplibs="$add_dir $compile_deplibs"
 	      test -n "$add" && compile_deplibs="$add $compile_deplibs"
 	    else
 	      test -n "$add_dir" && deplibs="$add_dir $deplibs"
 	      test -n "$add" && deplibs="$add $deplibs"
-	      if test "$hardcode_direct" != yes &&
-		 test "$hardcode_minus_L" != yes &&
-		 test "$hardcode_shlibpath_var" = yes; then
+	      if test yes != "$hardcode_direct" &&
+		 test yes != "$hardcode_minus_L" &&
+		 test yes = "$hardcode_shlibpath_var"; then
 		case :$finalize_shlibpath: in
 		*":$libdir:"*) ;;
 		*) func_append finalize_shlibpath "$libdir:" ;;
@@ -6892,33 +8335,33 @@ func_mode_link ()
 	    fi
 	  fi
 
-	  if test "$linkmode" = prog || test "$opt_mode" = relink; then
+	  if test prog = "$linkmode" || test relink = "$opt_mode"; then
 	    add_shlibpath=
 	    add_dir=
 	    add=
 	    # Finalize command for both is simple: just hardcode it.
-	    if test "$hardcode_direct" = yes &&
-	       test "$hardcode_direct_absolute" = no; then
-	      add="$libdir/$linklib"
-	    elif test "$hardcode_minus_L" = yes; then
-	      add_dir="-L$libdir"
-	      add="-l$name"
-	    elif test "$hardcode_shlibpath_var" = yes; then
+	    if test yes = "$hardcode_direct" &&
+	       test no = "$hardcode_direct_absolute"; then
+	      add=$libdir/$linklib
+	    elif test yes = "$hardcode_minus_L"; then
+	      add_dir=-L$libdir
+	      add=-l$name
+	    elif test yes = "$hardcode_shlibpath_var"; then
 	      case :$finalize_shlibpath: in
 	      *":$libdir:"*) ;;
 	      *) func_append finalize_shlibpath "$libdir:" ;;
 	      esac
-	      add="-l$name"
-	    elif test "$hardcode_automatic" = yes; then
+	      add=-l$name
+	    elif test yes = "$hardcode_automatic"; then
 	      if test -n "$inst_prefix_dir" &&
-		 test -f "$inst_prefix_dir$libdir/$linklib" ; then
-		add="$inst_prefix_dir$libdir/$linklib"
+		 test -f "$inst_prefix_dir$libdir/$linklib"; then
+		add=$inst_prefix_dir$libdir/$linklib
 	      else
-		add="$libdir/$linklib"
+		add=$libdir/$linklib
 	      fi
 	    else
 	      # We cannot seem to hardcode it, guess we'll fake it.
-	      add_dir="-L$libdir"
+	      add_dir=-L$libdir
 	      # Try looking first in the location we're being installed to.
 	      if test -n "$inst_prefix_dir"; then
 		case $libdir in
@@ -6927,10 +8370,10 @@ func_mode_link ()
 		    ;;
 		esac
 	      fi
-	      add="-l$name"
+	      add=-l$name
 	    fi
 
-	    if test "$linkmode" = prog; then
+	    if test prog = "$linkmode"; then
 	      test -n "$add_dir" && finalize_deplibs="$add_dir $finalize_deplibs"
 	      test -n "$add" && finalize_deplibs="$add $finalize_deplibs"
 	    else
@@ -6938,43 +8381,43 @@ func_mode_link ()
 	      test -n "$add" && deplibs="$add $deplibs"
 	    fi
 	  fi
-	elif test "$linkmode" = prog; then
+	elif test prog = "$linkmode"; then
 	  # Here we assume that one of hardcode_direct or hardcode_minus_L
 	  # is not unsupported.  This is valid on all known static and
 	  # shared platforms.
-	  if test "$hardcode_direct" != unsupported; then
-	    test -n "$old_library" && linklib="$old_library"
+	  if test unsupported != "$hardcode_direct"; then
+	    test -n "$old_library" && linklib=$old_library
 	    compile_deplibs="$dir/$linklib $compile_deplibs"
 	    finalize_deplibs="$dir/$linklib $finalize_deplibs"
 	  else
 	    compile_deplibs="-l$name -L$dir $compile_deplibs"
 	    finalize_deplibs="-l$name -L$dir $finalize_deplibs"
 	  fi
-	elif test "$build_libtool_libs" = yes; then
+	elif test yes = "$build_libtool_libs"; then
 	  # Not a shared library
-	  if test "$deplibs_check_method" != pass_all; then
+	  if test pass_all != "$deplibs_check_method"; then
 	    # We're trying link a shared library against a static one
 	    # but the system doesn't support it.
 
 	    # Just print a warning and add the library to dependency_libs so
 	    # that the program can be linked against the static library.
 	    echo
-	    $ECHO "*** Warning: This system can not link to static lib archive $lib."
+	    $ECHO "*** Warning: This system cannot link to static lib archive $lib."
 	    echo "*** I have the capability to make that library automatically link in when"
 	    echo "*** you link to this library.  But I can only do this if you have a"
 	    echo "*** shared version of the library, which you do not appear to have."
-	    if test "$module" = yes; then
+	    if test yes = "$module"; then
 	      echo "*** But as you try to build a module library, libtool will still create "
 	      echo "*** a static module, that should work as long as the dlopening application"
 	      echo "*** is linked with the -dlopen flag to resolve symbols at runtime."
 	      if test -z "$global_symbol_pipe"; then
 		echo
 		echo "*** However, this would only work if libtool was able to extract symbol"
-		echo "*** lists from a program, using \`nm' or equivalent, but libtool could"
+		echo "*** lists from a program, using 'nm' or equivalent, but libtool could"
 		echo "*** not find such a program.  So, this module is probably useless."
-		echo "*** \`nm' from GNU binutils and a full rebuild may help."
+		echo "*** 'nm' from GNU binutils and a full rebuild may help."
 	      fi
-	      if test "$build_old_libs" = no; then
+	      if test no = "$build_old_libs"; then
 		build_libtool_libs=module
 		build_old_libs=yes
 	      else
@@ -6987,11 +8430,11 @@ func_mode_link ()
 	  fi
 	fi # link shared/static library?
 
-	if test "$linkmode" = lib; then
+	if test lib = "$linkmode"; then
 	  if test -n "$dependency_libs" &&
-	     { test "$hardcode_into_libs" != yes ||
-	       test "$build_old_libs" = yes ||
-	       test "$link_static" = yes; }; then
+	     { test yes != "$hardcode_into_libs" ||
+	       test yes = "$build_old_libs" ||
+	       test yes = "$link_static"; }; then
 	    # Extract -R from dependency_libs
 	    temp_deplibs=
 	    for libdir in $dependency_libs; do
@@ -7005,12 +8448,12 @@ func_mode_link ()
 	      *) func_append temp_deplibs " $libdir";;
 	      esac
 	    done
-	    dependency_libs="$temp_deplibs"
+	    dependency_libs=$temp_deplibs
 	  fi
 
 	  func_append newlib_search_path " $absdir"
 	  # Link against this library
-	  test "$link_static" = no && newdependency_libs="$abs_ladir/$laname $newdependency_libs"
+	  test no = "$link_static" && newdependency_libs="$abs_ladir/$laname $newdependency_libs"
 	  # ... and its dependency_libs
 	  tmp_libs=
 	  for deplib in $dependency_libs; do
@@ -7020,7 +8463,7 @@ func_mode_link ()
                    func_resolve_sysroot "$func_stripname_result";;
               *) func_resolve_sysroot "$deplib" ;;
             esac
-	    if $opt_preserve_dup_deps ; then
+	    if $opt_preserve_dup_deps; then
 	      case "$tmp_libs " in
 	      *" $func_resolve_sysroot_result "*)
                 func_append specialdeplibs " $func_resolve_sysroot_result" ;;
@@ -7029,12 +8472,12 @@ func_mode_link ()
 	    func_append tmp_libs " $func_resolve_sysroot_result"
 	  done
 
-	  if test "$link_all_deplibs" != no; then
+	  if test no != "$link_all_deplibs"; then
 	    # Add the search paths of all dependency libraries
 	    for deplib in $dependency_libs; do
 	      path=
 	      case $deplib in
-	      -L*) path="$deplib" ;;
+	      -L*) path=$deplib ;;
 	      *.la)
 	        func_resolve_sysroot "$deplib"
 	        deplib=$func_resolve_sysroot_result
@@ -7042,12 +8485,12 @@ func_mode_link ()
 		dir=$func_dirname_result
 		# We need an absolute path.
 		case $dir in
-		[\\/]* | [A-Za-z]:[\\/]*) absdir="$dir" ;;
+		[\\/]* | [A-Za-z]:[\\/]*) absdir=$dir ;;
 		*)
 		  absdir=`cd "$dir" && pwd`
 		  if test -z "$absdir"; then
-		    func_warning "cannot determine absolute directory name of \`$dir'"
-		    absdir="$dir"
+		    func_warning "cannot determine absolute directory name of '$dir'"
+		    absdir=$dir
 		  fi
 		  ;;
 		esac
@@ -7055,35 +8498,35 @@ func_mode_link ()
 		case $host in
 		*-*-darwin*)
 		  depdepl=
-		  eval deplibrary_names=`${SED} -n -e 's/^library_names=\(.*\)$/\1/p' $deplib`
-		  if test -n "$deplibrary_names" ; then
-		    for tmp in $deplibrary_names ; do
+		  eval deplibrary_names=`$SED -n -e 's/^library_names=\(.*\)$/\1/p' $deplib`
+		  if test -n "$deplibrary_names"; then
+		    for tmp in $deplibrary_names; do
 		      depdepl=$tmp
 		    done
-		    if test -f "$absdir/$objdir/$depdepl" ; then
-		      depdepl="$absdir/$objdir/$depdepl"
-		      darwin_install_name=`${OTOOL} -L $depdepl | awk '{if (NR == 2) {print $1;exit}}'`
+		    if test -f "$absdir/$objdir/$depdepl"; then
+		      depdepl=$absdir/$objdir/$depdepl
+		      darwin_install_name=`$OTOOL -L $depdepl | awk '{if (NR == 2) {print $1;exit}}'`
                       if test -z "$darwin_install_name"; then
-                          darwin_install_name=`${OTOOL64} -L $depdepl  | awk '{if (NR == 2) {print $1;exit}}'`
+                          darwin_install_name=`$OTOOL64 -L $depdepl  | awk '{if (NR == 2) {print $1;exit}}'`
                       fi
-		      func_append compiler_flags " ${wl}-dylib_file ${wl}${darwin_install_name}:${depdepl}"
-		      func_append linker_flags " -dylib_file ${darwin_install_name}:${depdepl}"
+		      func_append compiler_flags " $wl-dylib_file $wl$darwin_install_name:$depdepl"
+		      func_append linker_flags " -dylib_file $darwin_install_name:$depdepl"
 		      path=
 		    fi
 		  fi
 		  ;;
 		*)
-		  path="-L$absdir/$objdir"
+		  path=-L$absdir/$objdir
 		  ;;
 		esac
 		else
-		  eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
+		  eval libdir=`$SED -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
 		  test -z "$libdir" && \
-		    func_fatal_error "\`$deplib' is not a valid libtool archive"
+		    func_fatal_error "'$deplib' is not a valid libtool archive"
 		  test "$absdir" != "$libdir" && \
-		    func_warning "\`$deplib' seems to be moved"
+		    func_warning "'$deplib' seems to be moved"
 
-		  path="-L$absdir"
+		  path=-L$absdir
 		fi
 		;;
 	      esac
@@ -7095,23 +8538,23 @@ func_mode_link ()
 	  fi # link_all_deplibs != no
 	fi # linkmode = lib
       done # for deplib in $libs
-      if test "$pass" = link; then
-	if test "$linkmode" = "prog"; then
+      if test link = "$pass"; then
+	if test prog = "$linkmode"; then
 	  compile_deplibs="$new_inherited_linker_flags $compile_deplibs"
 	  finalize_deplibs="$new_inherited_linker_flags $finalize_deplibs"
 	else
 	  compiler_flags="$compiler_flags "`$ECHO " $new_inherited_linker_flags" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
 	fi
       fi
-      dependency_libs="$newdependency_libs"
-      if test "$pass" = dlpreopen; then
+      dependency_libs=$newdependency_libs
+      if test dlpreopen = "$pass"; then
 	# Link the dlpreopened libraries before other libraries
 	for deplib in $save_deplibs; do
 	  deplibs="$deplib $deplibs"
 	done
       fi
-      if test "$pass" != dlopen; then
-	if test "$pass" != conv; then
+      if test dlopen != "$pass"; then
+	test conv = "$pass" || {
 	  # Make sure lib_search_path contains only unique directories.
 	  lib_search_path=
 	  for dir in $newlib_search_path; do
@@ -7121,12 +8564,12 @@ func_mode_link ()
 	    esac
 	  done
 	  newlib_search_path=
-	fi
+	}
 
-	if test "$linkmode,$pass" != "prog,link"; then
-	  vars="deplibs"
-	else
+	if test prog,link = "$linkmode,$pass"; then
 	  vars="compile_deplibs finalize_deplibs"
+	else
+	  vars=deplibs
 	fi
 	for var in $vars dependency_libs; do
 	  # Add libraries to $var in reverse order
@@ -7184,62 +8627,93 @@ func_mode_link ()
 	  eval $var=\"$tmp_libs\"
 	done # for var
       fi
+
+      # Add Sun CC postdeps if required:
+      test CXX = "$tagname" && {
+        case $host_os in
+        linux*)
+          case `$CC -V 2>&1 | sed 5q` in
+          *Sun\ C*) # Sun C++ 5.9
+            func_suncc_cstd_abi
+
+            if test no != "$suncc_use_cstd_abi"; then
+              func_append postdeps ' -library=Cstd -library=Crun'
+            fi
+            ;;
+          esac
+          ;;
+
+        solaris*)
+          func_cc_basename "$CC"
+          case $func_cc_basename_result in
+          CC* | sunCC*)
+            func_suncc_cstd_abi
+
+            if test no != "$suncc_use_cstd_abi"; then
+              func_append postdeps ' -library=Cstd -library=Crun'
+            fi
+            ;;
+          esac
+          ;;
+        esac
+      }
+
       # Last step: remove runtime libs from dependency_libs
       # (they stay in deplibs)
       tmp_libs=
-      for i in $dependency_libs ; do
+      for i in $dependency_libs; do
 	case " $predeps $postdeps $compiler_lib_search_path " in
 	*" $i "*)
-	  i=""
+	  i=
 	  ;;
 	esac
-	if test -n "$i" ; then
+	if test -n "$i"; then
 	  func_append tmp_libs " $i"
 	fi
       done
       dependency_libs=$tmp_libs
     done # for pass
-    if test "$linkmode" = prog; then
-      dlfiles="$newdlfiles"
+    if test prog = "$linkmode"; then
+      dlfiles=$newdlfiles
     fi
-    if test "$linkmode" = prog || test "$linkmode" = lib; then
-      dlprefiles="$newdlprefiles"
+    if test prog = "$linkmode" || test lib = "$linkmode"; then
+      dlprefiles=$newdlprefiles
     fi
 
     case $linkmode in
     oldlib)
-      if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
-	func_warning "\`-dlopen' is ignored for archives"
+      if test -n "$dlfiles$dlprefiles" || test no != "$dlself"; then
+	func_warning "'-dlopen' is ignored for archives"
       fi
 
       case " $deplibs" in
       *\ -l* | *\ -L*)
-	func_warning "\`-l' and \`-L' are ignored for archives" ;;
+	func_warning "'-l' and '-L' are ignored for archives" ;;
       esac
 
       test -n "$rpath" && \
-	func_warning "\`-rpath' is ignored for archives"
+	func_warning "'-rpath' is ignored for archives"
 
       test -n "$xrpath" && \
-	func_warning "\`-R' is ignored for archives"
+	func_warning "'-R' is ignored for archives"
 
       test -n "$vinfo" && \
-	func_warning "\`-version-info/-version-number' is ignored for archives"
+	func_warning "'-version-info/-version-number' is ignored for archives"
 
       test -n "$release" && \
-	func_warning "\`-release' is ignored for archives"
+	func_warning "'-release' is ignored for archives"
 
       test -n "$export_symbols$export_symbols_regex" && \
-	func_warning "\`-export-symbols' is ignored for archives"
+	func_warning "'-export-symbols' is ignored for archives"
 
       # Now set the variables for building old libraries.
       build_libtool_libs=no
-      oldlibs="$output"
+      oldlibs=$output
       func_append objs "$old_deplibs"
       ;;
 
     lib)
-      # Make sure we only generate libraries of the form `libNAME.la'.
+      # Make sure we only generate libraries of the form 'libNAME.la'.
       case $outputname in
       lib*)
 	func_stripname 'lib' '.la' "$outputname"
@@ -7248,10 +8722,10 @@ func_mode_link ()
 	eval libname=\"$libname_spec\"
 	;;
       *)
-	test "$module" = no && \
-	  func_fatal_help "libtool library \`$output' must begin with \`lib'"
+	test no = "$module" \
+	  && func_fatal_help "libtool library '$output' must begin with 'lib'"
 
-	if test "$need_lib_prefix" != no; then
+	if test no != "$need_lib_prefix"; then
 	  # Add the "lib" prefix for modules if required
 	  func_stripname '' '.la' "$outputname"
 	  name=$func_stripname_result
@@ -7265,8 +8739,8 @@ func_mode_link ()
       esac
 
       if test -n "$objs"; then
-	if test "$deplibs_check_method" != pass_all; then
-	  func_fatal_error "cannot build libtool library \`$output' from non-libtool objects on this host:$objs"
+	if test pass_all != "$deplibs_check_method"; then
+	  func_fatal_error "cannot build libtool library '$output' from non-libtool objects on this host:$objs"
 	else
 	  echo
 	  $ECHO "*** Warning: Linking the shared library $output against the non-libtool"
@@ -7275,21 +8749,21 @@ func_mode_link ()
 	fi
       fi
 
-      test "$dlself" != no && \
-	func_warning "\`-dlopen self' is ignored for libtool libraries"
+      test no = "$dlself" \
+	|| func_warning "'-dlopen self' is ignored for libtool libraries"
 
       set dummy $rpath
       shift
-      test "$#" -gt 1 && \
-	func_warning "ignoring multiple \`-rpath's for a libtool library"
+      test 1 -lt "$#" \
+	&& func_warning "ignoring multiple '-rpath's for a libtool library"
 
-      install_libdir="$1"
+      install_libdir=$1
 
       oldlibs=
       if test -z "$rpath"; then
-	if test "$build_libtool_libs" = yes; then
+	if test yes = "$build_libtool_libs"; then
 	  # Building a libtool convenience library.
-	  # Some compilers have problems with a `.al' extension so
+	  # Some compilers have problems with a '.al' extension so
 	  # convenience libraries should have the same extension an
 	  # archive normally would.
 	  oldlibs="$output_objdir/$libname.$libext $oldlibs"
@@ -7298,20 +8772,20 @@ func_mode_link ()
 	fi
 
 	test -n "$vinfo" && \
-	  func_warning "\`-version-info/-version-number' is ignored for convenience libraries"
+	  func_warning "'-version-info/-version-number' is ignored for convenience libraries"
 
 	test -n "$release" && \
-	  func_warning "\`-release' is ignored for convenience libraries"
+	  func_warning "'-release' is ignored for convenience libraries"
       else
 
 	# Parse the version information argument.
-	save_ifs="$IFS"; IFS=':'
+	save_ifs=$IFS; IFS=:
 	set dummy $vinfo 0 0 0
 	shift
-	IFS="$save_ifs"
+	IFS=$save_ifs
 
 	test -n "$7" && \
-	  func_fatal_help "too many parameters to \`-version-info'"
+	  func_fatal_help "too many parameters to '-version-info'"
 
 	# convert absolute version numbers to libtool ages
 	# this retains compatibility with .la files and attempts
@@ -7319,42 +8793,42 @@ func_mode_link ()
 
 	case $vinfo_number in
 	yes)
-	  number_major="$1"
-	  number_minor="$2"
-	  number_revision="$3"
+	  number_major=$1
+	  number_minor=$2
+	  number_revision=$3
 	  #
 	  # There are really only two kinds -- those that
 	  # use the current revision as the major version
 	  # and those that subtract age and use age as
 	  # a minor version.  But, then there is irix
-	  # which has an extra 1 added just for fun
+	  # that has an extra 1 added just for fun
 	  #
 	  case $version_type in
 	  # correct linux to gnu/linux during the next big refactor
-	  darwin|linux|osf|windows|none)
+	  darwin|freebsd-elf|linux|osf|windows|none)
 	    func_arith $number_major + $number_minor
 	    current=$func_arith_result
-	    age="$number_minor"
-	    revision="$number_revision"
+	    age=$number_minor
+	    revision=$number_revision
 	    ;;
-	  freebsd-aout|freebsd-elf|qnx|sunos)
-	    current="$number_major"
-	    revision="$number_minor"
-	    age="0"
+	  freebsd-aout|qnx|sunos)
+	    current=$number_major
+	    revision=$number_minor
+	    age=0
 	    ;;
 	  irix|nonstopux)
 	    func_arith $number_major + $number_minor
 	    current=$func_arith_result
-	    age="$number_minor"
-	    revision="$number_minor"
+	    age=$number_minor
+	    revision=$number_minor
 	    lt_irix_increment=no
 	    ;;
 	  esac
 	  ;;
 	no)
-	  current="$1"
-	  revision="$2"
-	  age="$3"
+	  current=$1
+	  revision=$2
+	  age=$3
 	  ;;
 	esac
 
@@ -7362,30 +8836,30 @@ func_mode_link ()
 	case $current in
 	0|[1-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9]) ;;
 	*)
-	  func_error "CURRENT \`$current' must be a nonnegative integer"
-	  func_fatal_error "\`$vinfo' is not valid version information"
+	  func_error "CURRENT '$current' must be a nonnegative integer"
+	  func_fatal_error "'$vinfo' is not valid version information"
 	  ;;
 	esac
 
 	case $revision in
 	0|[1-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9]) ;;
 	*)
-	  func_error "REVISION \`$revision' must be a nonnegative integer"
-	  func_fatal_error "\`$vinfo' is not valid version information"
+	  func_error "REVISION '$revision' must be a nonnegative integer"
+	  func_fatal_error "'$vinfo' is not valid version information"
 	  ;;
 	esac
 
 	case $age in
 	0|[1-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9]) ;;
 	*)
-	  func_error "AGE \`$age' must be a nonnegative integer"
-	  func_fatal_error "\`$vinfo' is not valid version information"
+	  func_error "AGE '$age' must be a nonnegative integer"
+	  func_fatal_error "'$vinfo' is not valid version information"
 	  ;;
 	esac
 
 	if test "$age" -gt "$current"; then
-	  func_error "AGE \`$age' is greater than the current interface number \`$current'"
-	  func_fatal_error "\`$vinfo' is not valid version information"
+	  func_error "AGE '$age' is greater than the current interface number '$current'"
+	  func_fatal_error "'$vinfo' is not valid version information"
 	fi
 
 	# Calculate the version variables.
@@ -7400,26 +8874,36 @@ func_mode_link ()
 	  # verstring for coding it into the library header
 	  func_arith $current - $age
 	  major=.$func_arith_result
-	  versuffix="$major.$age.$revision"
+	  versuffix=$major.$age.$revision
 	  # Darwin ld doesn't like 0 for these options...
 	  func_arith $current + 1
 	  minor_current=$func_arith_result
-	  xlcverstring="${wl}-compatibility_version ${wl}$minor_current ${wl}-current_version ${wl}$minor_current.$revision"
+	  xlcverstring="$wl-compatibility_version $wl$minor_current $wl-current_version $wl$minor_current.$revision"
 	  verstring="-compatibility_version $minor_current -current_version $minor_current.$revision"
+          # On Darwin other compilers
+          case $CC in
+              nagfor*)
+                  verstring="$wl-compatibility_version $wl$minor_current $wl-current_version $wl$minor_current.$revision"
+                  ;;
+              *)
+                  verstring="-compatibility_version $minor_current -current_version $minor_current.$revision"
+                  ;;
+          esac
 	  ;;
 
 	freebsd-aout)
-	  major=".$current"
-	  versuffix=".$current.$revision";
+	  major=.$current
+	  versuffix=.$current.$revision
 	  ;;
 
 	freebsd-elf)
-	  major=".$current"
-	  versuffix=".$current"
+	  func_arith $current - $age
+	  major=.$func_arith_result
+	  versuffix=$major.$age.$revision
 	  ;;
 
 	irix | nonstopux)
-	  if test "X$lt_irix_increment" = "Xno"; then
+	  if test no = "$lt_irix_increment"; then
 	    func_arith $current - $age
 	  else
 	    func_arith $current - $age + 1
@@ -7430,69 +8914,74 @@ func_mode_link ()
 	    nonstopux) verstring_prefix=nonstopux ;;
 	    *)         verstring_prefix=sgi ;;
 	  esac
-	  verstring="$verstring_prefix$major.$revision"
+	  verstring=$verstring_prefix$major.$revision
 
 	  # Add in all the interfaces that we are compatible with.
 	  loop=$revision
-	  while test "$loop" -ne 0; do
+	  while test 0 -ne "$loop"; do
 	    func_arith $revision - $loop
 	    iface=$func_arith_result
 	    func_arith $loop - 1
 	    loop=$func_arith_result
-	    verstring="$verstring_prefix$major.$iface:$verstring"
+	    verstring=$verstring_prefix$major.$iface:$verstring
 	  done
 
-	  # Before this point, $major must not contain `.'.
+	  # Before this point, $major must not contain '.'.
 	  major=.$major
-	  versuffix="$major.$revision"
+	  versuffix=$major.$revision
 	  ;;
 
 	linux) # correct to gnu/linux during the next big refactor
 	  func_arith $current - $age
 	  major=.$func_arith_result
-	  versuffix="$major.$age.$revision"
+	  versuffix=$major.$age.$revision
 	  ;;
 
 	osf)
 	  func_arith $current - $age
 	  major=.$func_arith_result
-	  versuffix=".$current.$age.$revision"
-	  verstring="$current.$age.$revision"
+	  versuffix=.$current.$age.$revision
+	  verstring=$current.$age.$revision
 
 	  # Add in all the interfaces that we are compatible with.
 	  loop=$age
-	  while test "$loop" -ne 0; do
+	  while test 0 -ne "$loop"; do
 	    func_arith $current - $loop
 	    iface=$func_arith_result
 	    func_arith $loop - 1
 	    loop=$func_arith_result
-	    verstring="$verstring:${iface}.0"
+	    verstring=$verstring:$iface.0
 	  done
 
 	  # Make executables depend on our current version.
-	  func_append verstring ":${current}.0"
+	  func_append verstring ":$current.0"
 	  ;;
 
 	qnx)
-	  major=".$current"
-	  versuffix=".$current"
+	  major=.$current
+	  versuffix=.$current
+	  ;;
+
+	sco)
+	  major=.$current
+	  versuffix=.$current
 	  ;;
 
 	sunos)
-	  major=".$current"
-	  versuffix=".$current.$revision"
+	  major=.$current
+	  versuffix=.$current.$revision
 	  ;;
 
 	windows)
 	  # Use '-' rather than '.', since we only want one
-	  # extension on DOS 8.3 filesystems.
+	  # extension on DOS 8.3 file systems.
 	  func_arith $current - $age
 	  major=$func_arith_result
-	  versuffix="-$major"
+	  versuffix=-$major
 	  ;;
 
 	*)
-	  func_fatal_configuration "unknown library version type \`$version_type'"
+	  func_fatal_configuration "unknown library version type '$version_type'"
 	  ;;
 	esac
 
@@ -7506,42 +8995,45 @@ func_mode_link ()
 	    verstring=
 	    ;;
 	  *)
-	    verstring="0.0"
+	    verstring=0.0
 	    ;;
 	  esac
-	  if test "$need_version" = no; then
+	  if test no = "$need_version"; then
 	    versuffix=
 	  else
-	    versuffix=".0.0"
+	    versuffix=.0.0
 	  fi
 	fi
 
 	# Remove version info from name if versioning should be avoided
-	if test "$avoid_version" = yes && test "$need_version" = no; then
+	if test yes,no = "$avoid_version,$need_version"; then
 	  major=
 	  versuffix=
-	  verstring=""
+	  verstring=
 	fi
 
 	# Check to see if the archive will have undefined symbols.
-	if test "$allow_undefined" = yes; then
-	  if test "$allow_undefined_flag" = unsupported; then
-	    func_warning "undefined symbols not allowed in $host shared libraries"
-	    build_libtool_libs=no
-	    build_old_libs=yes
+	if test yes = "$allow_undefined"; then
+	  if test unsupported = "$allow_undefined_flag"; then
+	    if test yes = "$build_old_libs"; then
+	      func_warning "undefined symbols not allowed in $host shared libraries; building static only"
+	      build_libtool_libs=no
+	    else
+	      func_fatal_error "can't build $host shared library unless -no-undefined is specified"
+	    fi
 	  fi
 	else
 	  # Don't allow undefined symbols.
-	  allow_undefined_flag="$no_undefined_flag"
+	  allow_undefined_flag=$no_undefined_flag
 	fi
 
       fi
 
-      func_generate_dlsyms "$libname" "$libname" "yes"
+      func_generate_dlsyms "$libname" "$libname" :
       func_append libobjs " $symfileobj"
-      test "X$libobjs" = "X " && libobjs=
+      test " " = "$libobjs" && libobjs=
 
-      if test "$opt_mode" != relink; then
+      if test relink != "$opt_mode"; then
 	# Remove our outputs, but don't remove object files since they
 	# may have been created when compiling PIC objects.
 	removelist=
@@ -7550,8 +9042,8 @@ func_mode_link ()
 	  case $p in
 	    *.$objext | *.gcno)
 	       ;;
-	    $output_objdir/$outputname | $output_objdir/$libname.* | $output_objdir/${libname}${release}.*)
-	       if test "X$precious_files_regex" != "X"; then
+	    $output_objdir/$outputname | $output_objdir/$libname.* | $output_objdir/$libname$release.*)
+	       if test -n "$precious_files_regex"; then
 		 if $ECHO "$p" | $EGREP -e "$precious_files_regex" >/dev/null 2>&1
 		 then
 		   continue
@@ -7567,11 +9059,11 @@ func_mode_link ()
       fi
 
       # Now set the variables for building old libraries.
-      if test "$build_old_libs" = yes && test "$build_libtool_libs" != convenience ; then
+      if test yes = "$build_old_libs" && test convenience != "$build_libtool_libs"; then
 	func_append oldlibs " $output_objdir/$libname.$libext"
 
 	# Transform .lo files to .o files.
-	oldobjs="$objs "`$ECHO "$libobjs" | $SP2NL | $SED "/\.${libext}$/d; $lo2o" | $NL2SP`
+	oldobjs="$objs "`$ECHO "$libobjs" | $SP2NL | $SED "/\.$libext$/d; $lo2o" | $NL2SP`
       fi
 
       # Eliminate all temporary directories.
@@ -7592,13 +9084,13 @@ func_mode_link ()
 	  *) func_append finalize_rpath " $libdir" ;;
 	  esac
 	done
-	if test "$hardcode_into_libs" != yes || test "$build_old_libs" = yes; then
+	if test yes != "$hardcode_into_libs" || test yes = "$build_old_libs"; then
 	  dependency_libs="$temp_xrpath $dependency_libs"
 	fi
       fi
 
       # Make sure dlfiles contains only unique files that won't be dlpreopened
-      old_dlfiles="$dlfiles"
+      old_dlfiles=$dlfiles
       dlfiles=
       for lib in $old_dlfiles; do
 	case " $dlprefiles $dlfiles " in
@@ -7608,7 +9100,7 @@ func_mode_link ()
       done
 
       # Make sure dlprefiles contains only unique files
-      old_dlprefiles="$dlprefiles"
+      old_dlprefiles=$dlprefiles
       dlprefiles=
       for lib in $old_dlprefiles; do
 	case "$dlprefiles " in
@@ -7617,7 +9109,7 @@ func_mode_link ()
 	esac
       done
 
-      if test "$build_libtool_libs" = yes; then
+      if test yes = "$build_libtool_libs"; then
 	if test -n "$rpath"; then
 	  case $host in
 	  *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-*-beos* | *-cegcc* | *-*-haiku*)
@@ -7641,7 +9133,7 @@ func_mode_link ()
 	    ;;
 	  *)
 	    # Add libc to deplibs on all other systems if necessary.
-	    if test "$build_libtool_need_lc" = "yes"; then
+	    if test yes = "$build_libtool_need_lc"; then
 	      func_append deplibs " -lc"
 	    fi
 	    ;;
@@ -7657,9 +9149,9 @@ func_mode_link ()
 	# I'm not sure if I'm treating the release correctly.  I think
 	# release should show up in the -l (ie -lgmp5) so we don't want to
 	# add it in twice.  Is that correct?
-	release=""
-	versuffix=""
-	major=""
+	release=
+	versuffix=
+	major=
 	newdeplibs=
 	droppeddeps=no
 	case $deplibs_check_method in
@@ -7688,20 +9180,20 @@ EOF
 	      -l*)
 		func_stripname -l '' "$i"
 		name=$func_stripname_result
-		if test "X$allow_libtool_libs_with_static_runtimes" = "Xyes" ; then
+		if test yes = "$allow_libtool_libs_with_static_runtimes"; then
 		  case " $predeps $postdeps " in
 		  *" $i "*)
 		    func_append newdeplibs " $i"
-		    i=""
+		    i=
 		    ;;
 		  esac
 		fi
-		if test -n "$i" ; then
+		if test -n "$i"; then
 		  libname=`eval "\\$ECHO \"$libname_spec\""`
 		  deplib_matches=`eval "\\$ECHO \"$library_names_spec\""`
 		  set dummy $deplib_matches; shift
 		  deplib_match=$1
-		  if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+		  if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0; then
 		    func_append newdeplibs " $i"
 		  else
 		    droppeddeps=yes
@@ -7731,20 +9223,20 @@ EOF
 		$opt_dry_run || $RM conftest
 		if $LTCC $LTCFLAGS -o conftest conftest.c $i; then
 		  ldd_output=`ldd conftest`
-		  if test "X$allow_libtool_libs_with_static_runtimes" = "Xyes" ; then
+		  if test yes = "$allow_libtool_libs_with_static_runtimes"; then
 		    case " $predeps $postdeps " in
 		    *" $i "*)
 		      func_append newdeplibs " $i"
-		      i=""
+		      i=
 		      ;;
 		    esac
 		  fi
-		  if test -n "$i" ; then
+		  if test -n "$i"; then
 		    libname=`eval "\\$ECHO \"$libname_spec\""`
 		    deplib_matches=`eval "\\$ECHO \"$library_names_spec\""`
 		    set dummy $deplib_matches; shift
 		    deplib_match=$1
-		    if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+		    if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0; then
 		      func_append newdeplibs " $i"
 		    else
 		      droppeddeps=yes
@@ -7781,24 +9273,24 @@ EOF
 	    -l*)
 	      func_stripname -l '' "$a_deplib"
 	      name=$func_stripname_result
-	      if test "X$allow_libtool_libs_with_static_runtimes" = "Xyes" ; then
+	      if test yes = "$allow_libtool_libs_with_static_runtimes"; then
 		case " $predeps $postdeps " in
 		*" $a_deplib "*)
 		  func_append newdeplibs " $a_deplib"
-		  a_deplib=""
+		  a_deplib=
 		  ;;
 		esac
 	      fi
-	      if test -n "$a_deplib" ; then
+	      if test -n "$a_deplib"; then
 		libname=`eval "\\$ECHO \"$libname_spec\""`
 		if test -n "$file_magic_glob"; then
 		  libnameglob=`func_echo_all "$libname" | $SED -e $file_magic_glob`
 		else
 		  libnameglob=$libname
 		fi
-		test "$want_nocaseglob" = yes && nocaseglob=`shopt -p nocaseglob`
+		test yes = "$want_nocaseglob" && nocaseglob=`shopt -p nocaseglob`
 		for i in $lib_search_path $sys_lib_search_path $shlib_search_path; do
-		  if test "$want_nocaseglob" = yes; then
+		  if test yes = "$want_nocaseglob"; then
 		    shopt -s nocaseglob
 		    potential_libs=`ls $i/$libnameglob[.-]* 2>/dev/null`
 		    $nocaseglob
@@ -7816,25 +9308,25 @@ EOF
 		      # We might still enter an endless loop, since a link
 		      # loop can be closed while we follow links,
 		      # but so what?
-		      potlib="$potent_lib"
+		      potlib=$potent_lib
 		      while test -h "$potlib" 2>/dev/null; do
-			potliblink=`ls -ld $potlib | ${SED} 's/.* -> //'`
+			potliblink=`ls -ld $potlib | $SED 's/.* -> //'`
 			case $potliblink in
-			[\\/]* | [A-Za-z]:[\\/]*) potlib="$potliblink";;
-			*) potlib=`$ECHO "$potlib" | $SED 's,[^/]*$,,'`"$potliblink";;
+			[\\/]* | [A-Za-z]:[\\/]*) potlib=$potliblink;;
+			*) potlib=`$ECHO "$potlib" | $SED 's|[^/]*$||'`"$potliblink";;
 			esac
 		      done
 		      if eval $file_magic_cmd \"\$potlib\" 2>/dev/null |
 			 $SED -e 10q |
 			 $EGREP "$file_magic_regex" > /dev/null; then
 			func_append newdeplibs " $a_deplib"
-			a_deplib=""
+			a_deplib=
 			break 2
 		      fi
 		  done
 		done
 	      fi
-	      if test -n "$a_deplib" ; then
+	      if test -n "$a_deplib"; then
 		droppeddeps=yes
 		echo
 		$ECHO "*** Warning: linker path does not have real file for library $a_deplib."
@@ -7842,7 +9334,7 @@ EOF
 		echo "*** you link to this library.  But I can only do this if you have a"
 		echo "*** shared version of the library, which you do not appear to have"
 		echo "*** because I did check the linker path looking for a file starting"
-		if test -z "$potlib" ; then
+		if test -z "$potlib"; then
 		  $ECHO "*** with $libname but no candidates were found. (...for file magic test)"
 		else
 		  $ECHO "*** with $libname and none of the candidates passed a file format test"
@@ -7865,30 +9357,30 @@ EOF
 	    -l*)
 	      func_stripname -l '' "$a_deplib"
 	      name=$func_stripname_result
-	      if test "X$allow_libtool_libs_with_static_runtimes" = "Xyes" ; then
+	      if test yes = "$allow_libtool_libs_with_static_runtimes"; then
 		case " $predeps $postdeps " in
 		*" $a_deplib "*)
 		  func_append newdeplibs " $a_deplib"
-		  a_deplib=""
+		  a_deplib=
 		  ;;
 		esac
 	      fi
-	      if test -n "$a_deplib" ; then
+	      if test -n "$a_deplib"; then
 		libname=`eval "\\$ECHO \"$libname_spec\""`
 		for i in $lib_search_path $sys_lib_search_path $shlib_search_path; do
 		  potential_libs=`ls $i/$libname[.-]* 2>/dev/null`
 		  for potent_lib in $potential_libs; do
-		    potlib="$potent_lib" # see symlink-check above in file_magic test
+		    potlib=$potent_lib # see symlink-check above in file_magic test
 		    if eval "\$ECHO \"$potent_lib\"" 2>/dev/null | $SED 10q | \
 		       $EGREP "$match_pattern_regex" > /dev/null; then
 		      func_append newdeplibs " $a_deplib"
-		      a_deplib=""
+		      a_deplib=
 		      break 2
 		    fi
 		  done
 		done
 	      fi
-	      if test -n "$a_deplib" ; then
+	      if test -n "$a_deplib"; then
 		droppeddeps=yes
 		echo
 		$ECHO "*** Warning: linker path does not have real file for library $a_deplib."
@@ -7896,7 +9388,7 @@ EOF
 		echo "*** you link to this library.  But I can only do this if you have a"
 		echo "*** shared version of the library, which you do not appear to have"
 		echo "*** because I did check the linker path looking for a file starting"
-		if test -z "$potlib" ; then
+		if test -z "$potlib"; then
 		  $ECHO "*** with $libname but no candidates were found. (...for regex pattern test)"
 		else
 		  $ECHO "*** with $libname and none of the candidates passed a file format test"
@@ -7912,18 +9404,18 @@ EOF
 	  done # Gone through all deplibs.
 	  ;;
 	none | unknown | *)
-	  newdeplibs=""
+	  newdeplibs=
 	  tmp_deplibs=`$ECHO " $deplibs" | $SED 's/ -lc$//; s/ -[LR][^ ]*//g'`
-	  if test "X$allow_libtool_libs_with_static_runtimes" = "Xyes" ; then
-	    for i in $predeps $postdeps ; do
+	  if test yes = "$allow_libtool_libs_with_static_runtimes"; then
+	    for i in $predeps $postdeps; do
 	      # can't use Xsed below, because $i might contain '/'
-	      tmp_deplibs=`$ECHO " $tmp_deplibs" | $SED "s,$i,,"`
+	      tmp_deplibs=`$ECHO " $tmp_deplibs" | $SED "s|$i||"`
 	    done
 	  fi
 	  case $tmp_deplibs in
 	  *[!\	\ ]*)
 	    echo
-	    if test "X$deplibs_check_method" = "Xnone"; then
+	    if test none = "$deplibs_check_method"; then
 	      echo "*** Warning: inter-library dependencies are not supported in this platform."
 	    else
 	      echo "*** Warning: inter-library dependencies are not known to be supported."
@@ -7947,8 +9439,8 @@ EOF
 	  ;;
 	esac
 
-	if test "$droppeddeps" = yes; then
-	  if test "$module" = yes; then
+	if test yes = "$droppeddeps"; then
+	  if test yes = "$module"; then
 	    echo
 	    echo "*** Warning: libtool could not satisfy all declared inter-library"
 	    $ECHO "*** dependencies of module $libname.  Therefore, libtool will create"
@@ -7957,12 +9449,12 @@ EOF
 	    if test -z "$global_symbol_pipe"; then
 	      echo
 	      echo "*** However, this would only work if libtool was able to extract symbol"
-	      echo "*** lists from a program, using \`nm' or equivalent, but libtool could"
+	      echo "*** lists from a program, using 'nm' or equivalent, but libtool could"
 	      echo "*** not find such a program.  So, this module is probably useless."
-	      echo "*** \`nm' from GNU binutils and a full rebuild may help."
+	      echo "*** 'nm' from GNU binutils and a full rebuild may help."
 	    fi
-	    if test "$build_old_libs" = no; then
-	      oldlibs="$output_objdir/$libname.$libext"
+	    if test no = "$build_old_libs"; then
+	      oldlibs=$output_objdir/$libname.$libext
 	      build_libtool_libs=module
 	      build_old_libs=yes
 	    else
@@ -7973,14 +9465,14 @@ EOF
 	    echo "*** automatically added whenever a program is linked with this library"
 	    echo "*** or is declared to -dlopen it."
 
-	    if test "$allow_undefined" = no; then
+	    if test no = "$allow_undefined"; then
 	      echo
 	      echo "*** Since this library must not contain undefined symbols,"
 	      echo "*** because either the platform does not support them or"
 	      echo "*** it was explicitly requested with -no-undefined,"
 	      echo "*** libtool will only create a static version of it."
-	      if test "$build_old_libs" = no; then
-		oldlibs="$output_objdir/$libname.$libext"
+	      if test no = "$build_old_libs"; then
+		oldlibs=$output_objdir/$libname.$libext
 		build_libtool_libs=module
 		build_old_libs=yes
 	      else
@@ -8026,7 +9518,7 @@ EOF
 	*) func_append new_libs " $deplib" ;;
 	esac
       done
-      deplibs="$new_libs"
+      deplibs=$new_libs
 
       # All the library-specific variables (install_libdir is set above).
       library_names=
@@ -8034,25 +9526,25 @@ EOF
       dlname=
 
       # Test again, we may have decided not to build it any more
-      if test "$build_libtool_libs" = yes; then
-	# Remove ${wl} instances when linking with ld.
+      if test yes = "$build_libtool_libs"; then
+	# Remove $wl instances when linking with ld.
 	# FIXME: should test the right _cmds variable.
 	case $archive_cmds in
 	  *\$LD\ *) wl= ;;
         esac
-	if test "$hardcode_into_libs" = yes; then
+	if test yes = "$hardcode_into_libs"; then
 	  # Hardcode the library paths
 	  hardcode_libdirs=
 	  dep_rpath=
-	  rpath="$finalize_rpath"
-	  test "$opt_mode" != relink && rpath="$compile_rpath$rpath"
+	  rpath=$finalize_rpath
+	  test relink = "$opt_mode" || rpath=$compile_rpath$rpath
 	  for libdir in $rpath; do
 	    if test -n "$hardcode_libdir_flag_spec"; then
 	      if test -n "$hardcode_libdir_separator"; then
 		func_replace_sysroot "$libdir"
 		libdir=$func_replace_sysroot_result
 		if test -z "$hardcode_libdirs"; then
-		  hardcode_libdirs="$libdir"
+		  hardcode_libdirs=$libdir
 		else
 		  # Just accumulate the unique libdirs.
 		  case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
@@ -8077,7 +9569,7 @@ EOF
 	  # Substitute the hardcoded libdirs into the rpath.
 	  if test -n "$hardcode_libdir_separator" &&
 	     test -n "$hardcode_libdirs"; then
-	    libdir="$hardcode_libdirs"
+	    libdir=$hardcode_libdirs
 	    eval "dep_rpath=\"$hardcode_libdir_flag_spec\""
 	  fi
 	  if test -n "$runpath_var" && test -n "$perm_rpath"; then
@@ -8091,8 +9583,8 @@ EOF
 	  test -n "$dep_rpath" && deplibs="$dep_rpath $deplibs"
 	fi
 
-	shlibpath="$finalize_shlibpath"
-	test "$opt_mode" != relink && shlibpath="$compile_shlibpath$shlibpath"
+	shlibpath=$finalize_shlibpath
+	test relink = "$opt_mode" || shlibpath=$compile_shlibpath$shlibpath
 	if test -n "$shlibpath"; then
 	  eval "$shlibpath_var='$shlibpath\$$shlibpath_var'; export $shlibpath_var"
 	fi
@@ -8102,19 +9594,19 @@ EOF
 	eval library_names=\"$library_names_spec\"
 	set dummy $library_names
 	shift
-	realname="$1"
+	realname=$1
 	shift
 
 	if test -n "$soname_spec"; then
 	  eval soname=\"$soname_spec\"
 	else
-	  soname="$realname"
+	  soname=$realname
 	fi
 	if test -z "$dlname"; then
 	  dlname=$soname
 	fi
 
-	lib="$output_objdir/$realname"
+	lib=$output_objdir/$realname
 	linknames=
 	for link
 	do
@@ -8128,7 +9620,7 @@ EOF
 	delfiles=
 	if test -n "$export_symbols" && test -n "$include_expsyms"; then
 	  $opt_dry_run || cp "$export_symbols" "$output_objdir/$libname.uexp"
-	  export_symbols="$output_objdir/$libname.uexp"
+	  export_symbols=$output_objdir/$libname.uexp
 	  func_append delfiles " $export_symbols"
 	fi
 
@@ -8137,31 +9629,31 @@ EOF
 	cygwin* | mingw* | cegcc*)
 	  if test -n "$export_symbols" && test -z "$export_symbols_regex"; then
 	    # exporting using user supplied symfile
-	    if test "x`$SED 1q $export_symbols`" != xEXPORTS; then
+	    func_dll_def_p "$export_symbols" || {
 	      # and it's NOT already a .def file. Must figure out
 	      # which of the given symbols are data symbols and tag
 	      # them as such. So, trigger use of export_symbols_cmds.
 	      # export_symbols gets reassigned inside the "prepare
 	      # the list of exported symbols" if statement, so the
 	      # include_expsyms logic still works.
-	      orig_export_symbols="$export_symbols"
+	      orig_export_symbols=$export_symbols
 	      export_symbols=
 	      always_export_symbols=yes
-	    fi
+	    }
 	  fi
 	  ;;
 	esac
 
 	# Prepare the list of exported symbols
 	if test -z "$export_symbols"; then
-	  if test "$always_export_symbols" = yes || test -n "$export_symbols_regex"; then
-	    func_verbose "generating symbol list for \`$libname.la'"
-	    export_symbols="$output_objdir/$libname.exp"
+	  if test yes = "$always_export_symbols" || test -n "$export_symbols_regex"; then
+	    func_verbose "generating symbol list for '$libname.la'"
+	    export_symbols=$output_objdir/$libname.exp
 	    $opt_dry_run || $RM $export_symbols
 	    cmds=$export_symbols_cmds
-	    save_ifs="$IFS"; IFS='~'
+	    save_ifs=$IFS; IFS='~'
 	    for cmd1 in $cmds; do
-	      IFS="$save_ifs"
+	      IFS=$save_ifs
 	      # Take the normal branch if the nm_file_list_spec branch
 	      # doesn't work or if tool conversion is not needed.
 	      case $nm_file_list_spec~$to_tool_file_cmd in
@@ -8175,7 +9667,7 @@ EOF
 		  try_normal_branch=no
 		  ;;
 	      esac
-	      if test "$try_normal_branch" = yes \
+	      if test yes = "$try_normal_branch" \
 		 && { test "$len" -lt "$max_cmd_len" \
 		      || test "$max_cmd_len" -le -1; }
 	      then
@@ -8186,7 +9678,7 @@ EOF
 		output_la=$func_basename_result
 		save_libobjs=$libobjs
 		save_output=$output
-		output=${output_objdir}/${output_la}.nm
+		output=$output_objdir/$output_la.nm
 		func_to_tool_file "$output"
 		libobjs=$nm_file_list_spec$func_to_tool_file_result
 		func_append delfiles " $output"
@@ -8209,8 +9701,8 @@ EOF
 		break
 	      fi
 	    done
-	    IFS="$save_ifs"
-	    if test -n "$export_symbols_regex" && test "X$skipped_export" != "X:"; then
+	    IFS=$save_ifs
+	    if test -n "$export_symbols_regex" && test : != "$skipped_export"; then
 	      func_show_eval '$EGREP -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"'
 	      func_show_eval '$MV "${export_symbols}T" "$export_symbols"'
 	    fi
@@ -8218,16 +9710,16 @@ EOF
 	fi
 
 	if test -n "$export_symbols" && test -n "$include_expsyms"; then
-	  tmp_export_symbols="$export_symbols"
-	  test -n "$orig_export_symbols" && tmp_export_symbols="$orig_export_symbols"
+	  tmp_export_symbols=$export_symbols
+	  test -n "$orig_export_symbols" && tmp_export_symbols=$orig_export_symbols
 	  $opt_dry_run || eval '$ECHO "$include_expsyms" | $SP2NL >> "$tmp_export_symbols"'
 	fi
 
-	if test "X$skipped_export" != "X:" && test -n "$orig_export_symbols"; then
+	if test : != "$skipped_export" && test -n "$orig_export_symbols"; then
 	  # The given exports_symbols file has to be filtered, so filter it.
-	  func_verbose "filter symbol list for \`$libname.la' to tag DATA exports"
+	  func_verbose "filter symbol list for '$libname.la' to tag DATA exports"
 	  # FIXME: $output_objdir/$libname.filter potentially contains lots of
-	  # 's' commands which not all seds can handle. GNU sed should be fine
+	  # 's' commands, which not all seds can handle. GNU sed should be fine
 	  # though. Also, the filter scales superlinearly with the number of
 	  # global variables. join(1) would be nice here, but unfortunately
 	  # isn't a blessed tool.
@@ -8246,11 +9738,11 @@ EOF
 	    ;;
 	  esac
 	done
-	deplibs="$tmp_deplibs"
+	deplibs=$tmp_deplibs
 
 	if test -n "$convenience"; then
 	  if test -n "$whole_archive_flag_spec" &&
-	    test "$compiler_needs_object" = yes &&
+	    test yes = "$compiler_needs_object" &&
 	    test -z "$libobjs"; then
 	    # extract the archives, so we have objects to list.
 	    # TODO: could optimize this to just extract one archive.
@@ -8261,7 +9753,7 @@ EOF
 	    eval libobjs=\"\$libobjs $whole_archive_flag_spec\"
 	    test "X$libobjs" = "X " && libobjs=
 	  else
-	    gentop="$output_objdir/${outputname}x"
+	    gentop=$output_objdir/${outputname}x
 	    func_append generated " $gentop"
 
 	    func_extract_archives $gentop $convenience
@@ -8270,18 +9762,18 @@ EOF
 	  fi
 	fi
 
-	if test "$thread_safe" = yes && test -n "$thread_safe_flag_spec"; then
+	if test yes = "$thread_safe" && test -n "$thread_safe_flag_spec"; then
 	  eval flag=\"$thread_safe_flag_spec\"
 	  func_append linker_flags " $flag"
 	fi
 
 	# Make a backup of the uninstalled library when relinking
-	if test "$opt_mode" = relink; then
+	if test relink = "$opt_mode"; then
 	  $opt_dry_run || eval '(cd $output_objdir && $RM ${realname}U && $MV $realname ${realname}U)' || exit $?
 	fi
 
 	# Do each of the archive commands.
-	if test "$module" = yes && test -n "$module_cmds" ; then
+	if test yes = "$module" && test -n "$module_cmds"; then
 	  if test -n "$export_symbols" && test -n "$module_expsym_cmds"; then
 	    eval test_cmds=\"$module_expsym_cmds\"
 	    cmds=$module_expsym_cmds
@@ -8299,7 +9791,7 @@ EOF
 	  fi
 	fi
 
-	if test "X$skipped_export" != "X:" &&
+	if test : != "$skipped_export" &&
 	   func_len " $test_cmds" &&
 	   len=$func_len_result &&
 	   test "$len" -lt "$max_cmd_len" || test "$max_cmd_len" -le -1; then
@@ -8332,8 +9824,8 @@ EOF
 	  last_robj=
 	  k=1
 
-	  if test -n "$save_libobjs" && test "X$skipped_export" != "X:" && test "$with_gnu_ld" = yes; then
-	    output=${output_objdir}/${output_la}.lnkscript
+	  if test -n "$save_libobjs" && test : != "$skipped_export" && test yes = "$with_gnu_ld"; then
+	    output=$output_objdir/$output_la.lnkscript
 	    func_verbose "creating GNU ld script: $output"
 	    echo 'INPUT (' > $output
 	    for obj in $save_libobjs
@@ -8345,14 +9837,14 @@ EOF
 	    func_append delfiles " $output"
 	    func_to_tool_file "$output"
 	    output=$func_to_tool_file_result
-	  elif test -n "$save_libobjs" && test "X$skipped_export" != "X:" && test "X$file_list_spec" != X; then
-	    output=${output_objdir}/${output_la}.lnk
+	  elif test -n "$save_libobjs" && test : != "$skipped_export" && test -n "$file_list_spec"; then
+	    output=$output_objdir/$output_la.lnk
 	    func_verbose "creating linker input file list: $output"
 	    : > $output
 	    set x $save_libobjs
 	    shift
 	    firstobj=
-	    if test "$compiler_needs_object" = yes; then
+	    if test yes = "$compiler_needs_object"; then
 	      firstobj="$1 "
 	      shift
 	    fi
@@ -8367,7 +9859,7 @@ EOF
 	  else
 	    if test -n "$save_libobjs"; then
 	      func_verbose "creating reloadable object files..."
-	      output=$output_objdir/$output_la-${k}.$objext
+	      output=$output_objdir/$output_la-$k.$objext
 	      eval test_cmds=\"$reload_cmds\"
 	      func_len " $test_cmds"
 	      len0=$func_len_result
@@ -8379,13 +9871,13 @@ EOF
 		func_len " $obj"
 		func_arith $len + $func_len_result
 		len=$func_arith_result
-		if test "X$objlist" = X ||
+		if test -z "$objlist" ||
 		   test "$len" -lt "$max_cmd_len"; then
 		  func_append objlist " $obj"
 		else
 		  # The command $test_cmds is almost too long, add a
 		  # command to the queue.
-		  if test "$k" -eq 1 ; then
+		  if test 1 -eq "$k"; then
 		    # The first file doesn't have a previous command to add.
 		    reload_objs=$objlist
 		    eval concat_cmds=\"$reload_cmds\"
@@ -8395,10 +9887,10 @@ EOF
 		    reload_objs="$objlist $last_robj"
 		    eval concat_cmds=\"\$concat_cmds~$reload_cmds~\$RM $last_robj\"
 		  fi
-		  last_robj=$output_objdir/$output_la-${k}.$objext
+		  last_robj=$output_objdir/$output_la-$k.$objext
 		  func_arith $k + 1
 		  k=$func_arith_result
-		  output=$output_objdir/$output_la-${k}.$objext
+		  output=$output_objdir/$output_la-$k.$objext
 		  objlist=" $obj"
 		  func_len " $last_robj"
 		  func_arith $len0 + $func_len_result
@@ -8410,9 +9902,9 @@ EOF
 	      # files will link in the last one created.
 	      test -z "$concat_cmds" || concat_cmds=$concat_cmds~
 	      reload_objs="$objlist $last_robj"
-	      eval concat_cmds=\"\${concat_cmds}$reload_cmds\"
+	      eval concat_cmds=\"\$concat_cmds$reload_cmds\"
 	      if test -n "$last_robj"; then
-	        eval concat_cmds=\"\${concat_cmds}~\$RM $last_robj\"
+	        eval concat_cmds=\"\$concat_cmds~\$RM $last_robj\"
 	      fi
 	      func_append delfiles " $output"
 
@@ -8420,9 +9912,9 @@ EOF
 	      output=
 	    fi
 
-	    if ${skipped_export-false}; then
-	      func_verbose "generating symbol list for \`$libname.la'"
-	      export_symbols="$output_objdir/$libname.exp"
+	    ${skipped_export-false} && {
+	      func_verbose "generating symbol list for '$libname.la'"
+	      export_symbols=$output_objdir/$libname.exp
 	      $opt_dry_run || $RM $export_symbols
 	      libobjs=$output
 	      # Append the command to create the export file.
@@ -8431,16 +9923,16 @@ EOF
 	      if test -n "$last_robj"; then
 		eval concat_cmds=\"\$concat_cmds~\$RM $last_robj\"
 	      fi
-	    fi
+	    }
 
 	    test -n "$save_libobjs" &&
 	      func_verbose "creating a temporary reloadable object file: $output"
 
 	    # Loop through the commands generated above and execute them.
-	    save_ifs="$IFS"; IFS='~'
+	    save_ifs=$IFS; IFS='~'
 	    for cmd in $concat_cmds; do
-	      IFS="$save_ifs"
-	      $opt_silent || {
+	      IFS=$save_ifs
+	      $opt_quiet || {
 		  func_quote_for_expand "$cmd"
 		  eval "func_echo $func_quote_for_expand_result"
 	      }
@@ -8448,7 +9940,7 @@ EOF
 		lt_exit=$?
 
 		# Restore the uninstalled library and exit
-		if test "$opt_mode" = relink; then
+		if test relink = "$opt_mode"; then
 		  ( cd "$output_objdir" && \
 		    $RM "${realname}T" && \
 		    $MV "${realname}U" "$realname" )
@@ -8457,7 +9949,7 @@ EOF
 		exit $lt_exit
 	      }
 	    done
-	    IFS="$save_ifs"
+	    IFS=$save_ifs
 
 	    if test -n "$export_symbols_regex" && ${skipped_export-false}; then
 	      func_show_eval '$EGREP -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"'
@@ -8465,18 +9957,18 @@ EOF
 	    fi
 	  fi
 
-          if ${skipped_export-false}; then
+          ${skipped_export-false} && {
 	    if test -n "$export_symbols" && test -n "$include_expsyms"; then
-	      tmp_export_symbols="$export_symbols"
-	      test -n "$orig_export_symbols" && tmp_export_symbols="$orig_export_symbols"
+	      tmp_export_symbols=$export_symbols
+	      test -n "$orig_export_symbols" && tmp_export_symbols=$orig_export_symbols
 	      $opt_dry_run || eval '$ECHO "$include_expsyms" | $SP2NL >> "$tmp_export_symbols"'
 	    fi
 
 	    if test -n "$orig_export_symbols"; then
 	      # The given exports_symbols file has to be filtered, so filter it.
-	      func_verbose "filter symbol list for \`$libname.la' to tag DATA exports"
+	      func_verbose "filter symbol list for '$libname.la' to tag DATA exports"
 	      # FIXME: $output_objdir/$libname.filter potentially contains lots of
-	      # 's' commands which not all seds can handle. GNU sed should be fine
+	      # 's' commands, which not all seds can handle. GNU sed should be fine
 	      # though. Also, the filter scales superlinearly with the number of
 	      # global variables. join(1) would be nice here, but unfortunately
 	      # isn't a blessed tool.
@@ -8485,7 +9977,7 @@ EOF
 	      export_symbols=$output_objdir/$libname.def
 	      $opt_dry_run || $SED -f $output_objdir/$libname.filter < $orig_export_symbols > $export_symbols
 	    fi
-	  fi
+	  }
 
 	  libobjs=$output
 	  # Restore the value of output.
@@ -8499,7 +9991,7 @@ EOF
 	  # value of $libobjs for piecewise linking.
 
 	  # Do each of the archive commands.
-	  if test "$module" = yes && test -n "$module_cmds" ; then
+	  if test yes = "$module" && test -n "$module_cmds"; then
 	    if test -n "$export_symbols" && test -n "$module_expsym_cmds"; then
 	      cmds=$module_expsym_cmds
 	    else
@@ -8521,7 +10013,7 @@ EOF
 
 	# Add any objects from preloaded convenience libraries
 	if test -n "$dlprefiles"; then
-	  gentop="$output_objdir/${outputname}x"
+	  gentop=$output_objdir/${outputname}x
 	  func_append generated " $gentop"
 
 	  func_extract_archives $gentop $dlprefiles
@@ -8529,11 +10021,12 @@ EOF
 	  test "X$libobjs" = "X " && libobjs=
 	fi
 
-	save_ifs="$IFS"; IFS='~'
+	save_ifs=$IFS; IFS='~'
 	for cmd in $cmds; do
-	  IFS="$save_ifs"
+	  IFS=$sp$nl
 	  eval cmd=\"$cmd\"
-	  $opt_silent || {
+	  IFS=$save_ifs
+	  $opt_quiet || {
 	    func_quote_for_expand "$cmd"
 	    eval "func_echo $func_quote_for_expand_result"
 	  }
@@ -8541,7 +10034,7 @@ EOF
 	    lt_exit=$?
 
 	    # Restore the uninstalled library and exit
-	    if test "$opt_mode" = relink; then
+	    if test relink = "$opt_mode"; then
 	      ( cd "$output_objdir" && \
 	        $RM "${realname}T" && \
 		$MV "${realname}U" "$realname" )
@@ -8550,10 +10043,10 @@ EOF
 	    exit $lt_exit
 	  }
 	done
-	IFS="$save_ifs"
+	IFS=$save_ifs
 
 	# Restore the uninstalled library and exit
-	if test "$opt_mode" = relink; then
+	if test relink = "$opt_mode"; then
 	  $opt_dry_run || eval '(cd $output_objdir && $RM ${realname}T && $MV $realname ${realname}T && $MV ${realname}U $realname)' || exit $?
 
 	  if test -n "$convenience"; then
@@ -8573,39 +10066,39 @@ EOF
 	done
 
 	# If -module or -export-dynamic was specified, set the dlname.
-	if test "$module" = yes || test "$export_dynamic" = yes; then
+	if test yes = "$module" || test yes = "$export_dynamic"; then
 	  # On all known operating systems, these are identical.
-	  dlname="$soname"
+	  dlname=$soname
 	fi
       fi
       ;;
 
     obj)
-      if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
-	func_warning "\`-dlopen' is ignored for objects"
+      if test -n "$dlfiles$dlprefiles" || test no != "$dlself"; then
+	func_warning "'-dlopen' is ignored for objects"
       fi
 
       case " $deplibs" in
       *\ -l* | *\ -L*)
-	func_warning "\`-l' and \`-L' are ignored for objects" ;;
+	func_warning "'-l' and '-L' are ignored for objects" ;;
       esac
 
       test -n "$rpath" && \
-	func_warning "\`-rpath' is ignored for objects"
+	func_warning "'-rpath' is ignored for objects"
 
       test -n "$xrpath" && \
-	func_warning "\`-R' is ignored for objects"
+	func_warning "'-R' is ignored for objects"
 
       test -n "$vinfo" && \
-	func_warning "\`-version-info' is ignored for objects"
+	func_warning "'-version-info' is ignored for objects"
 
       test -n "$release" && \
-	func_warning "\`-release' is ignored for objects"
+	func_warning "'-release' is ignored for objects"
 
       case $output in
       *.lo)
 	test -n "$objs$old_deplibs" && \
-	  func_fatal_error "cannot build library object \`$output' from non-libtool objects"
+	  func_fatal_error "cannot build library object '$output' from non-libtool objects"
 
 	libobj=$output
 	func_lo2o "$libobj"
@@ -8613,7 +10106,7 @@ EOF
 	;;
       *)
 	libobj=
-	obj="$output"
+	obj=$output
 	;;
       esac
 
@@ -8626,17 +10119,19 @@ EOF
       # the extraction.
       reload_conv_objs=
       gentop=
-      # reload_cmds runs $LD directly, so let us get rid of
-      # -Wl from whole_archive_flag_spec and hope we can get by with
-      # turning comma into space..
-      wl=
-
+      # if reload_cmds runs $LD directly, get rid of -Wl from
+      # whole_archive_flag_spec and hope we can get by with turning comma
+      # into space.
+      case $reload_cmds in
+        *\$LD[\ \$]*) wl= ;;
+      esac
       if test -n "$convenience"; then
 	if test -n "$whole_archive_flag_spec"; then
 	  eval tmp_whole_archive_flags=\"$whole_archive_flag_spec\"
-	  reload_conv_objs=$reload_objs\ `$ECHO "$tmp_whole_archive_flags" | $SED 's|,| |g'`
+	  test -n "$wl" || tmp_whole_archive_flags=`$ECHO "$tmp_whole_archive_flags" | $SED 's|,| |g'`
+	  reload_conv_objs=$reload_objs\ $tmp_whole_archive_flags
 	else
-	  gentop="$output_objdir/${obj}x"
+	  gentop=$output_objdir/${obj}x
 	  func_append generated " $gentop"
 
 	  func_extract_archives $gentop $convenience
@@ -8645,12 +10140,12 @@ EOF
       fi
 
       # If we're not building shared, we need to use non_pic_objs
-      test "$build_libtool_libs" != yes && libobjs="$non_pic_objects"
+      test yes = "$build_libtool_libs" || libobjs=$non_pic_objects
 
       # Create the old-style object.
-      reload_objs="$objs$old_deplibs "`$ECHO "$libobjs" | $SP2NL | $SED "/\.${libext}$/d; /\.lib$/d; $lo2o" | $NL2SP`" $reload_conv_objs" ### testsuite: skip nested quoting test
+      reload_objs=$objs$old_deplibs' '`$ECHO "$libobjs" | $SP2NL | $SED "/\.$libext$/d; /\.lib$/d; $lo2o" | $NL2SP`' '$reload_conv_objs
 
-      output="$obj"
+      output=$obj
       func_execute_cmds "$reload_cmds" 'exit $?'
 
       # Exit if we aren't doing a library object file.
@@ -8662,7 +10157,7 @@ EOF
 	exit $EXIT_SUCCESS
       fi
 
-      if test "$build_libtool_libs" != yes; then
+      test yes = "$build_libtool_libs" || {
 	if test -n "$gentop"; then
 	  func_show_eval '${RM}r "$gentop"'
 	fi
@@ -8672,12 +10167,12 @@ EOF
 	# $show "echo timestamp > $libobj"
 	# $opt_dry_run || eval "echo timestamp > $libobj" || exit $?
 	exit $EXIT_SUCCESS
-      fi
+      }
 
-      if test -n "$pic_flag" || test "$pic_mode" != default; then
+      if test -n "$pic_flag" || test default != "$pic_mode"; then
 	# Only do commands if we really have different PIC objects.
 	reload_objs="$libobjs $reload_conv_objs"
-	output="$libobj"
+	output=$libobj
 	func_execute_cmds "$reload_cmds" 'exit $?'
       fi
 
@@ -8694,16 +10189,14 @@ EOF
 	          output=$func_stripname_result.exe;;
       esac
       test -n "$vinfo" && \
-	func_warning "\`-version-info' is ignored for programs"
+	func_warning "'-version-info' is ignored for programs"
 
       test -n "$release" && \
-	func_warning "\`-release' is ignored for programs"
+	func_warning "'-release' is ignored for programs"
 
-      test "$preload" = yes \
-        && test "$dlopen_support" = unknown \
-	&& test "$dlopen_self" = unknown \
-	&& test "$dlopen_self_static" = unknown && \
-	  func_warning "\`LT_INIT([dlopen])' not used. Assuming no dlopen support."
+      $preload \
+	&& test unknown,unknown,unknown = "$dlopen_support,$dlopen_self,$dlopen_self_static" \
+	&& func_warning "'LT_INIT([dlopen])' not used. Assuming no dlopen support."
 
       case $host in
       *-*-rhapsody* | *-*-darwin1.[012])
@@ -8717,11 +10210,11 @@ EOF
       *-*-darwin*)
 	# Don't allow lazy linking, it breaks C++ global constructors
 	# But is supposedly fixed on 10.4 or later (yay!).
-	if test "$tagname" = CXX ; then
+	if test CXX = "$tagname"; then
 	  case ${MACOSX_DEPLOYMENT_TARGET-10.0} in
 	    10.[0123])
-	      func_append compile_command " ${wl}-bind_at_load"
-	      func_append finalize_command " ${wl}-bind_at_load"
+	      func_append compile_command " $wl-bind_at_load"
+	      func_append finalize_command " $wl-bind_at_load"
 	    ;;
 	  esac
 	fi
@@ -8757,7 +10250,7 @@ EOF
 	*) func_append new_libs " $deplib" ;;
 	esac
       done
-      compile_deplibs="$new_libs"
+      compile_deplibs=$new_libs
 
 
       func_append compile_command " $compile_deplibs"
@@ -8781,7 +10274,7 @@ EOF
 	if test -n "$hardcode_libdir_flag_spec"; then
 	  if test -n "$hardcode_libdir_separator"; then
 	    if test -z "$hardcode_libdirs"; then
-	      hardcode_libdirs="$libdir"
+	      hardcode_libdirs=$libdir
 	    else
 	      # Just accumulate the unique libdirs.
 	      case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
@@ -8804,7 +10297,7 @@ EOF
 	fi
 	case $host in
 	*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-cegcc*)
-	  testbindir=`${ECHO} "$libdir" | ${SED} -e 's*/lib$*/bin*'`
+	  testbindir=`$ECHO "$libdir" | $SED -e 's*/lib$*/bin*'`
 	  case :$dllsearchpath: in
 	  *":$libdir:"*) ;;
 	  ::) dllsearchpath=$libdir;;
@@ -8821,10 +10314,10 @@ EOF
       # Substitute the hardcoded libdirs into the rpath.
       if test -n "$hardcode_libdir_separator" &&
 	 test -n "$hardcode_libdirs"; then
-	libdir="$hardcode_libdirs"
+	libdir=$hardcode_libdirs
 	eval rpath=\" $hardcode_libdir_flag_spec\"
       fi
-      compile_rpath="$rpath"
+      compile_rpath=$rpath
 
       rpath=
       hardcode_libdirs=
@@ -8832,7 +10325,7 @@ EOF
 	if test -n "$hardcode_libdir_flag_spec"; then
 	  if test -n "$hardcode_libdir_separator"; then
 	    if test -z "$hardcode_libdirs"; then
-	      hardcode_libdirs="$libdir"
+	      hardcode_libdirs=$libdir
 	    else
 	      # Just accumulate the unique libdirs.
 	      case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
@@ -8857,45 +10350,43 @@ EOF
       # Substitute the hardcoded libdirs into the rpath.
       if test -n "$hardcode_libdir_separator" &&
 	 test -n "$hardcode_libdirs"; then
-	libdir="$hardcode_libdirs"
+	libdir=$hardcode_libdirs
 	eval rpath=\" $hardcode_libdir_flag_spec\"
       fi
-      finalize_rpath="$rpath"
+      finalize_rpath=$rpath
 
-      if test -n "$libobjs" && test "$build_old_libs" = yes; then
+      if test -n "$libobjs" && test yes = "$build_old_libs"; then
 	# Transform all the library objects into standard objects.
 	compile_command=`$ECHO "$compile_command" | $SP2NL | $SED "$lo2o" | $NL2SP`
 	finalize_command=`$ECHO "$finalize_command" | $SP2NL | $SED "$lo2o" | $NL2SP`
       fi
 
-      func_generate_dlsyms "$outputname" "@PROGRAM@" "no"
+      func_generate_dlsyms "$outputname" "@PROGRAM@" false
 
       # template prelinking step
       if test -n "$prelink_cmds"; then
 	func_execute_cmds "$prelink_cmds" 'exit $?'
       fi
 
-      wrappers_required=yes
+      wrappers_required=:
       case $host in
       *cegcc* | *mingw32ce*)
         # Disable wrappers for cegcc and mingw32ce hosts, we are cross compiling anyway.
-        wrappers_required=no
+        wrappers_required=false
         ;;
       *cygwin* | *mingw* )
-        if test "$build_libtool_libs" != yes; then
-          wrappers_required=no
-        fi
+        test yes = "$build_libtool_libs" || wrappers_required=false
         ;;
       *)
-        if test "$need_relink" = no || test "$build_libtool_libs" != yes; then
-          wrappers_required=no
+        if test no = "$need_relink" || test yes != "$build_libtool_libs"; then
+          wrappers_required=false
         fi
         ;;
       esac
-      if test "$wrappers_required" = no; then
+      $wrappers_required || {
 	# Replace the output file specification.
 	compile_command=`$ECHO "$compile_command" | $SED 's%@OUTPUT@%'"$output"'%g'`
-	link_command="$compile_command$compile_rpath"
+	link_command=$compile_command$compile_rpath
 
 	# We have no uninstalled library dependencies, so finalize right now.
 	exit_status=0
@@ -8908,12 +10399,12 @@ EOF
 	fi
 
 	# Delete the generated files.
-	if test -f "$output_objdir/${outputname}S.${objext}"; then
-	  func_show_eval '$RM "$output_objdir/${outputname}S.${objext}"'
+	if test -f "$output_objdir/${outputname}S.$objext"; then
+	  func_show_eval '$RM "$output_objdir/${outputname}S.$objext"'
 	fi
 
 	exit $exit_status
-      fi
+      }
 
       if test -n "$compile_shlibpath$finalize_shlibpath"; then
 	compile_command="$shlibpath_var=\"$compile_shlibpath$finalize_shlibpath\$$shlibpath_var\" $compile_command"
@@ -8943,9 +10434,9 @@ EOF
 	fi
       fi
 
-      if test "$no_install" = yes; then
+      if test yes = "$no_install"; then
 	# We don't need to create a wrapper script.
-	link_command="$compile_var$compile_command$compile_rpath"
+	link_command=$compile_var$compile_command$compile_rpath
 	# Replace the output file specification.
 	link_command=`$ECHO "$link_command" | $SED 's%@OUTPUT@%'"$output"'%g'`
 	# Delete the old output file.
@@ -8962,27 +10453,28 @@ EOF
 	exit $EXIT_SUCCESS
       fi
 
-      if test "$hardcode_action" = relink; then
-	# Fast installation is not supported
-	link_command="$compile_var$compile_command$compile_rpath"
-	relink_command="$finalize_var$finalize_command$finalize_rpath"
+      case $hardcode_action,$fast_install in
+        relink,*)
+	  # Fast installation is not supported
+	  link_command=$compile_var$compile_command$compile_rpath
+	  relink_command=$finalize_var$finalize_command$finalize_rpath
 
-	func_warning "this platform does not like uninstalled shared libraries"
-	func_warning "\`$output' will be relinked during installation"
-      else
-	if test "$fast_install" != no; then
-	  link_command="$finalize_var$compile_command$finalize_rpath"
-	  if test "$fast_install" = yes; then
-	    relink_command=`$ECHO "$compile_var$compile_command$compile_rpath" | $SED 's%@OUTPUT@%\$progdir/\$file%g'`
-	  else
-	    # fast_install is set to needless
-	    relink_command=
-	  fi
-	else
-	  link_command="$compile_var$compile_command$compile_rpath"
-	  relink_command="$finalize_var$finalize_command$finalize_rpath"
-	fi
-      fi
+	  func_warning "this platform does not like uninstalled shared libraries"
+	  func_warning "'$output' will be relinked during installation"
+	  ;;
+        *,yes)
+	  link_command=$finalize_var$compile_command$finalize_rpath
+	  relink_command=`$ECHO "$compile_var$compile_command$compile_rpath" | $SED 's%@OUTPUT@%\$progdir/\$file%g'`
+          ;;
+	*,no)
+	  link_command=$compile_var$compile_command$compile_rpath
+	  relink_command=$finalize_var$finalize_command$finalize_rpath
+          ;;
+	*,needless)
+	  link_command=$finalize_var$compile_command$finalize_rpath
+	  relink_command=
+          ;;
+      esac
 
       # Replace the output file specification.
       link_command=`$ECHO "$link_command" | $SED 's%@OUTPUT@%'"$output_objdir/$outputname"'%g'`
@@ -9039,8 +10531,8 @@ EOF
 	    func_dirname_and_basename "$output" "" "."
 	    output_name=$func_basename_result
 	    output_path=$func_dirname_result
-	    cwrappersource="$output_path/$objdir/lt-$output_name.c"
-	    cwrapper="$output_path/$output_name.exe"
+	    cwrappersource=$output_path/$objdir/lt-$output_name.c
+	    cwrapper=$output_path/$output_name.exe
 	    $RM $cwrappersource $cwrapper
 	    trap "$RM $cwrappersource $cwrapper; exit $EXIT_FAILURE" 1 2 15
 
@@ -9061,7 +10553,7 @@ EOF
 	    trap "$RM $func_ltwrapper_scriptname_result; exit $EXIT_FAILURE" 1 2 15
 	    $opt_dry_run || {
 	      # note: this script will not be executed, so do not chmod.
-	      if test "x$build" = "x$host" ; then
+	      if test "x$build" = "x$host"; then
 		$cwrapper --lt-dump-script > $func_ltwrapper_scriptname_result
 	      else
 		func_emit_wrapper no > $func_ltwrapper_scriptname_result
@@ -9084,25 +10576,27 @@ EOF
     # See if we need to build an old-fashioned archive.
     for oldlib in $oldlibs; do
 
-      if test "$build_libtool_libs" = convenience; then
-	oldobjs="$libobjs_save $symfileobj"
-	addlibs="$convenience"
-	build_libtool_libs=no
-      else
-	if test "$build_libtool_libs" = module; then
-	  oldobjs="$libobjs_save"
+      case $build_libtool_libs in
+        convenience)
+	  oldobjs="$libobjs_save $symfileobj"
+	  addlibs=$convenience
 	  build_libtool_libs=no
-	else
+	  ;;
+	module)
+	  oldobjs=$libobjs_save
+	  addlibs=$old_convenience
+	  build_libtool_libs=no
+          ;;
+	*)
 	  oldobjs="$old_deplibs $non_pic_objects"
-	  if test "$preload" = yes && test -f "$symfileobj"; then
-	    func_append oldobjs " $symfileobj"
-	  fi
-	fi
-	addlibs="$old_convenience"
-      fi
+	  $preload && test -f "$symfileobj" \
+	    && func_append oldobjs " $symfileobj"
+	  addlibs=$old_convenience
+	  ;;
+      esac
 
       if test -n "$addlibs"; then
-	gentop="$output_objdir/${outputname}x"
+	gentop=$output_objdir/${outputname}x
 	func_append generated " $gentop"
 
 	func_extract_archives $gentop $addlibs
@@ -9110,13 +10604,13 @@ EOF
       fi
 
       # Do each command in the archive commands.
-      if test -n "$old_archive_from_new_cmds" && test "$build_libtool_libs" = yes; then
+      if test -n "$old_archive_from_new_cmds" && test yes = "$build_libtool_libs"; then
 	cmds=$old_archive_from_new_cmds
       else
 
 	# Add any objects from preloaded convenience libraries
 	if test -n "$dlprefiles"; then
-	  gentop="$output_objdir/${outputname}x"
+	  gentop=$output_objdir/${outputname}x
 	  func_append generated " $gentop"
 
 	  func_extract_archives $gentop $dlprefiles
@@ -9137,7 +10631,7 @@ EOF
 	  :
 	else
 	  echo "copying selected object files to avoid basename conflicts..."
-	  gentop="$output_objdir/${outputname}x"
+	  gentop=$output_objdir/${outputname}x
 	  func_append generated " $gentop"
 	  func_mkdir_p "$gentop"
 	  save_oldobjs=$oldobjs
@@ -9146,7 +10640,7 @@ EOF
 	  for obj in $save_oldobjs
 	  do
 	    func_basename "$obj"
-	    objbase="$func_basename_result"
+	    objbase=$func_basename_result
 	    case " $oldobjs " in
 	    " ") oldobjs=$obj ;;
 	    *[\ /]"$objbase "*)
@@ -9215,18 +10709,18 @@ EOF
 	    else
 	      # the above command should be used before it gets too long
 	      oldobjs=$objlist
-	      if test "$obj" = "$last_oldobj" ; then
+	      if test "$obj" = "$last_oldobj"; then
 		RANLIB=$save_RANLIB
 	      fi
 	      test -z "$concat_cmds" || concat_cmds=$concat_cmds~
-	      eval concat_cmds=\"\${concat_cmds}$old_archive_cmds\"
+	      eval concat_cmds=\"\$concat_cmds$old_archive_cmds\"
 	      objlist=
 	      len=$len0
 	    fi
 	  done
 	  RANLIB=$save_RANLIB
 	  oldobjs=$objlist
-	  if test "X$oldobjs" = "X" ; then
+	  if test -z "$oldobjs"; then
 	    eval cmds=\"\$concat_cmds\"
 	  else
 	    eval cmds=\"\$concat_cmds~\$old_archive_cmds\"
@@ -9243,7 +10737,7 @@ EOF
     case $output in
     *.la)
       old_library=
-      test "$build_old_libs" = yes && old_library="$libname.$libext"
+      test yes = "$build_old_libs" && old_library=$libname.$libext
       func_verbose "creating $output"
 
       # Preserve any variables that may affect compiler behavior
@@ -9258,31 +10752,31 @@ EOF
 	fi
       done
       # Quote the link command for shipping.
-      relink_command="(cd `pwd`; $SHELL $progpath $preserve_args --mode=relink $libtool_args @inst_prefix_dir@)"
+      relink_command="(cd `pwd`; $SHELL \"$progpath\" $preserve_args --mode=relink $libtool_args @inst_prefix_dir@)"
       relink_command=`$ECHO "$relink_command" | $SED "$sed_quote_subst"`
-      if test "$hardcode_automatic" = yes ; then
+      if test yes = "$hardcode_automatic"; then
 	relink_command=
       fi
 
       # Only create the output if not a dry run.
       $opt_dry_run || {
 	for installed in no yes; do
-	  if test "$installed" = yes; then
+	  if test yes = "$installed"; then
 	    if test -z "$install_libdir"; then
 	      break
 	    fi
-	    output="$output_objdir/$outputname"i
+	    output=$output_objdir/${outputname}i
 	    # Replace all uninstalled libtool libraries with the installed ones
 	    newdependency_libs=
 	    for deplib in $dependency_libs; do
 	      case $deplib in
 	      *.la)
 		func_basename "$deplib"
-		name="$func_basename_result"
+		name=$func_basename_result
 		func_resolve_sysroot "$deplib"
-		eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $func_resolve_sysroot_result`
+		eval libdir=`$SED -n -e 's/^libdir=\(.*\)$/\1/p' $func_resolve_sysroot_result`
 		test -z "$libdir" && \
-		  func_fatal_error "\`$deplib' is not a valid libtool archive"
+		  func_fatal_error "'$deplib' is not a valid libtool archive"
 		func_append newdependency_libs " ${lt_sysroot:+=}$libdir/$name"
 		;;
 	      -L*)
@@ -9298,23 +10792,23 @@ EOF
 	      *) func_append newdependency_libs " $deplib" ;;
 	      esac
 	    done
-	    dependency_libs="$newdependency_libs"
+	    dependency_libs=$newdependency_libs
 	    newdlfiles=
 
 	    for lib in $dlfiles; do
 	      case $lib in
 	      *.la)
 	        func_basename "$lib"
-		name="$func_basename_result"
-		eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+		name=$func_basename_result
+		eval libdir=`$SED -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
 		test -z "$libdir" && \
-		  func_fatal_error "\`$lib' is not a valid libtool archive"
+		  func_fatal_error "'$lib' is not a valid libtool archive"
 		func_append newdlfiles " ${lt_sysroot:+=}$libdir/$name"
 		;;
 	      *) func_append newdlfiles " $lib" ;;
 	      esac
 	    done
-	    dlfiles="$newdlfiles"
+	    dlfiles=$newdlfiles
 	    newdlprefiles=
 	    for lib in $dlprefiles; do
 	      case $lib in
@@ -9324,34 +10818,34 @@ EOF
 		# didn't already link the preopened objects directly into
 		# the library:
 		func_basename "$lib"
-		name="$func_basename_result"
-		eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+		name=$func_basename_result
+		eval libdir=`$SED -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
 		test -z "$libdir" && \
-		  func_fatal_error "\`$lib' is not a valid libtool archive"
+		  func_fatal_error "'$lib' is not a valid libtool archive"
 		func_append newdlprefiles " ${lt_sysroot:+=}$libdir/$name"
 		;;
 	      esac
 	    done
-	    dlprefiles="$newdlprefiles"
+	    dlprefiles=$newdlprefiles
 	  else
 	    newdlfiles=
 	    for lib in $dlfiles; do
 	      case $lib in
-		[\\/]* | [A-Za-z]:[\\/]*) abs="$lib" ;;
+		[\\/]* | [A-Za-z]:[\\/]*) abs=$lib ;;
 		*) abs=`pwd`"/$lib" ;;
 	      esac
 	      func_append newdlfiles " $abs"
 	    done
-	    dlfiles="$newdlfiles"
+	    dlfiles=$newdlfiles
 	    newdlprefiles=
 	    for lib in $dlprefiles; do
 	      case $lib in
-		[\\/]* | [A-Za-z]:[\\/]*) abs="$lib" ;;
+		[\\/]* | [A-Za-z]:[\\/]*) abs=$lib ;;
 		*) abs=`pwd`"/$lib" ;;
 	      esac
 	      func_append newdlprefiles " $abs"
 	    done
-	    dlprefiles="$newdlprefiles"
+	    dlprefiles=$newdlprefiles
 	  fi
 	  $RM $output
 	  # place dlname in correct position for cygwin
@@ -9367,10 +10861,9 @@ EOF
 	  case $host,$output,$installed,$module,$dlname in
 	    *cygwin*,*lai,yes,no,*.dll | *mingw*,*lai,yes,no,*.dll | *cegcc*,*lai,yes,no,*.dll)
 	      # If a -bindir argument was supplied, place the dll there.
-	      if test "x$bindir" != x ;
-	      then
+	      if test -n "$bindir"; then
 		func_relative_path "$install_libdir" "$bindir"
-		tdlname=$func_relative_path_result$dlname
+		tdlname=$func_relative_path_result/$dlname
 	      else
 		# Otherwise fall back on heuristic.
 		tdlname=../bin/$dlname
@@ -9379,7 +10872,7 @@ EOF
 	  esac
 	  $ECHO > $output "\
 # $outputname - a libtool library file
-# Generated by $PROGRAM (GNU $PACKAGE$TIMESTAMP) $VERSION
+# Generated by $PROGRAM (GNU $PACKAGE) $VERSION
 #
 # Please DO NOT delete this file!
 # It is necessary for linking the library.
@@ -9393,7 +10886,7 @@ library_names='$library_names'
 # The name of the static archive.
 old_library='$old_library'
 
-# Linker flags that can not go in dependency_libs.
+# Linker flags that cannot go in dependency_libs.
 inherited_linker_flags='$new_inherited_linker_flags'
 
 # Libraries that this one depends upon.
@@ -9419,7 +10912,7 @@ dlpreopen='$dlprefiles'
 
 # Directory that this library needs to be installed in:
 libdir='$install_libdir'"
-	  if test "$installed" = no && test "$need_relink" = yes; then
+	  if test no,yes = "$installed,$need_relink"; then
 	    $ECHO >> $output "\
 relink_command=\"$relink_command\""
 	  fi
@@ -9434,27 +10927,29 @@ relink_command=\"$relink_command\""
     exit $EXIT_SUCCESS
 }
 
-{ test "$opt_mode" = link || test "$opt_mode" = relink; } &&
-    func_mode_link ${1+"$@"}
+if test link = "$opt_mode" || test relink = "$opt_mode"; then
+  func_mode_link ${1+"$@"}
+fi
 
 
 # func_mode_uninstall arg...
 func_mode_uninstall ()
 {
-    $opt_debug
-    RM="$nonopt"
+    $debug_cmd
+
+    RM=$nonopt
     files=
-    rmforce=
+    rmforce=false
     exit_status=0
 
     # This variable tells wrapper scripts just to set variables rather
     # than running their programs.
-    libtool_install_magic="$magic"
+    libtool_install_magic=$magic
 
     for arg
     do
       case $arg in
-      -f) func_append RM " $arg"; rmforce=yes ;;
+      -f) func_append RM " $arg"; rmforce=: ;;
       -*) func_append RM " $arg" ;;
       *) func_append files " $arg" ;;
       esac
@@ -9467,18 +10962,18 @@ func_mode_uninstall ()
 
     for file in $files; do
       func_dirname "$file" "" "."
-      dir="$func_dirname_result"
-      if test "X$dir" = X.; then
-	odir="$objdir"
+      dir=$func_dirname_result
+      if test . = "$dir"; then
+	odir=$objdir
       else
-	odir="$dir/$objdir"
+	odir=$dir/$objdir
       fi
       func_basename "$file"
-      name="$func_basename_result"
-      test "$opt_mode" = uninstall && odir="$dir"
+      name=$func_basename_result
+      test uninstall = "$opt_mode" && odir=$dir
 
       # Remember odir for removal later, being careful to avoid duplicates
-      if test "$opt_mode" = clean; then
+      if test clean = "$opt_mode"; then
 	case " $rmdirs " in
 	  *" $odir "*) ;;
 	  *) func_append rmdirs " $odir" ;;
@@ -9493,11 +10988,11 @@ func_mode_uninstall ()
       elif test -d "$file"; then
 	exit_status=1
 	continue
-      elif test "$rmforce" = yes; then
+      elif $rmforce; then
 	continue
       fi
 
-      rmfiles="$file"
+      rmfiles=$file
 
       case $name in
       *.la)
@@ -9511,7 +11006,7 @@ func_mode_uninstall ()
 	  done
 	  test -n "$old_library" && func_append rmfiles " $odir/$old_library"
 
-	  case "$opt_mode" in
+	  case $opt_mode in
 	  clean)
 	    case " $library_names " in
 	    *" $dlname "*) ;;
@@ -9522,12 +11017,12 @@ func_mode_uninstall ()
 	  uninstall)
 	    if test -n "$library_names"; then
 	      # Do each command in the postuninstall commands.
-	      func_execute_cmds "$postuninstall_cmds" 'test "$rmforce" = yes || exit_status=1'
+	      func_execute_cmds "$postuninstall_cmds" '$rmforce || exit_status=1'
 	    fi
 
 	    if test -n "$old_library"; then
 	      # Do each command in the old_postuninstall commands.
-	      func_execute_cmds "$old_postuninstall_cmds" 'test "$rmforce" = yes || exit_status=1'
+	      func_execute_cmds "$old_postuninstall_cmds" '$rmforce || exit_status=1'
 	    fi
 	    # FIXME: should reinstall the best remaining shared library.
 	    ;;
@@ -9543,21 +11038,19 @@ func_mode_uninstall ()
 	  func_source $dir/$name
 
 	  # Add PIC object to the list of files to remove.
-	  if test -n "$pic_object" &&
-	     test "$pic_object" != none; then
+	  if test -n "$pic_object" && test none != "$pic_object"; then
 	    func_append rmfiles " $dir/$pic_object"
 	  fi
 
 	  # Add non-PIC object to the list of files to remove.
-	  if test -n "$non_pic_object" &&
-	     test "$non_pic_object" != none; then
+	  if test -n "$non_pic_object" && test none != "$non_pic_object"; then
 	    func_append rmfiles " $dir/$non_pic_object"
 	  fi
 	fi
 	;;
 
       *)
-	if test "$opt_mode" = clean ; then
+	if test clean = "$opt_mode"; then
 	  noexename=$name
 	  case $file in
 	  *.exe)
@@ -9584,12 +11077,12 @@ func_mode_uninstall ()
 
 	    # note $name still contains .exe if it was in $file originally
 	    # as does the version of $file that was added into $rmfiles
-	    func_append rmfiles " $odir/$name $odir/${name}S.${objext}"
-	    if test "$fast_install" = yes && test -n "$relink_command"; then
+	    func_append rmfiles " $odir/$name $odir/${name}S.$objext"
+	    if test yes = "$fast_install" && test -n "$relink_command"; then
 	      func_append rmfiles " $odir/lt-$name"
 	    fi
-	    if test "X$noexename" != "X$name" ; then
-	      func_append rmfiles " $odir/lt-${noexename}.c"
+	    if test "X$noexename" != "X$name"; then
+	      func_append rmfiles " $odir/lt-$noexename.c"
 	    fi
 	  fi
 	fi
@@ -9598,7 +11091,7 @@ func_mode_uninstall ()
       func_show_eval "$RM $rmfiles" 'exit_status=1'
     done
 
-    # Try to remove the ${objdir}s in the directories where we deleted files
+    # Try to remove the $objdir's in the directories where we deleted files
     for dir in $rmdirs; do
       if test -d "$dir"; then
 	func_show_eval "rmdir $dir >/dev/null 2>&1"
@@ -9608,16 +11101,17 @@ func_mode_uninstall ()
     exit $exit_status
 }
 
-{ test "$opt_mode" = uninstall || test "$opt_mode" = clean; } &&
-    func_mode_uninstall ${1+"$@"}
+if test uninstall = "$opt_mode" || test clean = "$opt_mode"; then
+  func_mode_uninstall ${1+"$@"}
+fi
 
 test -z "$opt_mode" && {
-  help="$generic_help"
+  help=$generic_help
   func_fatal_help "you must specify a MODE"
 }
 
 test -z "$exec_cmd" && \
-  func_fatal_help "invalid operation mode \`$opt_mode'"
+  func_fatal_help "invalid operation mode '$opt_mode'"
 
 if test -n "$exec_cmd"; then
   eval exec "$exec_cmd"
@@ -9628,7 +11122,7 @@ exit $exit_status
 
 
 # The TAGs below are defined such that we never get into a situation
-# in which we disable both kinds of libraries.  Given conflicting
+# where we disable both kinds of libraries.  Given conflicting
 # choices, we go for a static library, that is the most portable,
 # since we can't tell whether shared libraries were disabled because
 # the user asked for that or because the platform doesn't support
@@ -9651,5 +11145,3 @@ build_old_libs=`case $build_libtool_libs in yes) echo no;; *) echo yes;; esac`
 # mode:shell-script
 # sh-indentation:2
 # End:
-# vi:sw=2
-
diff --git a/m4/acx_pthread.m4 b/m4/acx_pthread.m4
new file mode 100644
index 0000000000000000000000000000000000000000..2cf20de144a11be2aa603b04ea511244191037b7
--- /dev/null
+++ b/m4/acx_pthread.m4
@@ -0,0 +1,363 @@
+# This was retrieved from
+#    http://svn.0pointer.de/viewvc/trunk/common/acx_pthread.m4?revision=1277&root=avahi
+# See also (perhaps for new versions?)
+#    http://svn.0pointer.de/viewvc/trunk/common/acx_pthread.m4?root=avahi
+#
+# We've rewritten the inconsistency check code (from avahi), to work
+# more broadly.  In particular, it no longer assumes ld accepts -zdefs.
+# This caused a restructuring of the code, but the functionality has only
+# changed a little.
+
+dnl @synopsis ACX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
+dnl
+dnl @summary figure out how to build C programs using POSIX threads
+dnl
+dnl This macro figures out how to build C programs using POSIX threads.
+dnl It sets the PTHREAD_LIBS output variable to the threads library and
+dnl linker flags, and the PTHREAD_CFLAGS output variable to any special
+dnl C compiler flags that are needed. (The user can also force certain
+dnl compiler flags/libs to be tested by setting these environment
+dnl variables.)
+dnl
+dnl Also sets PTHREAD_CC to any special C compiler that is needed for
+dnl multi-threaded programs (defaults to the value of CC otherwise).
+dnl (This is necessary on AIX to use the special cc_r compiler alias.)
+dnl
+dnl NOTE: You are assumed to not only compile your program with these
+dnl flags, but also link it with them as well. e.g. you should link
+dnl with $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS
+dnl $LIBS
+dnl
+dnl If you are only building threads programs, you may wish to use
+dnl these variables in your default LIBS, CFLAGS, and CC:
+dnl
+dnl        LIBS="$PTHREAD_LIBS $LIBS"
+dnl        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+dnl        CC="$PTHREAD_CC"
+dnl
+dnl In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute
+dnl constant has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to
+dnl that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
+dnl
+dnl ACTION-IF-FOUND is a list of shell commands to run if a threads
+dnl library is found, and ACTION-IF-NOT-FOUND is a list of commands to
+dnl run if it is not found. If ACTION-IF-FOUND is not specified, the
+dnl default action will define HAVE_PTHREAD.
+dnl
+dnl Please let the authors know if this macro fails on any platform, or
+dnl if you have any other suggestions or comments. This macro was based
+dnl on work by SGJ on autoconf scripts for FFTW (www.fftw.org) (with
+dnl help from M. Frigo), as well as ac_pthread and hb_pthread macros
+dnl posted by Alejandro Forero Cuervo to the autoconf macro repository.
+dnl We are also grateful for the helpful feedback of numerous users.
+dnl
+dnl @category InstalledPackages
+dnl @author Steven G. Johnson <stevenj@alum.mit.edu>
+dnl @version 2006-05-29
+dnl @license GPLWithACException
+dnl 
+dnl Checks for GCC shared/pthread inconsistency based on work by
+dnl Marcin Owsiany <marcin@owsiany.pl>
+
+
+AC_DEFUN([ACX_PTHREAD], [
+AC_REQUIRE([AC_CANONICAL_HOST])
+AC_LANG_SAVE
+AC_LANG_C
+acx_pthread_ok=no
+
+# We used to check for pthread.h first, but this fails if pthread.h
+# requires special compiler flags (e.g. on Tru64 or Sequent).
+# It gets checked for in the link test anyway.
+
+# First of all, check if the user has set any of the PTHREAD_LIBS,
+# etcetera environment variables, and if threads linking works using
+# them:
+if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS])
+        AC_TRY_LINK_FUNC(pthread_join, acx_pthread_ok=yes)
+        AC_MSG_RESULT($acx_pthread_ok)
+        if test x"$acx_pthread_ok" = xno; then
+                PTHREAD_LIBS=""
+                PTHREAD_CFLAGS=""
+        fi
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+fi
+
+# We must check for the threads library under a number of different
+# names; the ordering is very important because some systems
+# (e.g. DEC) have both -lpthread and -lpthreads, where one of the
+# libraries is broken (non-POSIX).
+
+# Create a list of thread flags to try.  Items starting with a "-" are
+# C compiler flags, and other items are library names, except for "none"
+# which indicates that we try without any flags at all, and "pthread-config"
+# which is a program returning the flags for the Pth emulation library.
+
+acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"
+
+# The ordering *is* (sometimes) important.  Some notes on the
+# individual items follow:
+
+# pthreads: AIX (must check this before -lpthread)
+# none: in case threads are in libc; should be tried before -Kthread and
+#       other compiler flags to prevent continual compiler warnings
+# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
+# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads)
+# -pthreads: Solaris/gcc
+# -mthreads: Mingw32/gcc, Lynx/gcc
+# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
+#      doesn't hurt to check since this sometimes defines pthreads too;
+#      also defines -D_REENTRANT)
+#      ... -mt is also the pthreads flag for HP/aCC
+# pthread: Linux, etcetera
+# --thread-safe: KAI C++
+# pthread-config: use pthread-config program (for GNU Pth library)
+
+case "${host_cpu}-${host_os}" in
+        *solaris*)
+
+        # On Solaris (at least, for some versions), libc contains stubbed
+        # (non-functional) versions of the pthreads routines, so link-based
+        # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
+        # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
+        # a function called by this macro, so we could check for that, but
+        # who knows whether they'll stub that too in a future libc.)  So,
+        # we'll just look for -pthreads and -lpthread first:
+
+        acx_pthread_flags="-pthreads pthread -mt -pthread $acx_pthread_flags"
+        ;;
+esac
+
+if test x"$acx_pthread_ok" = xno; then
+for flag in $acx_pthread_flags; do
+
+        case $flag in
+                none)
+                AC_MSG_CHECKING([whether pthreads work without any flags])
+                ;;
+
+                -*)
+                AC_MSG_CHECKING([whether pthreads work with $flag])
+                PTHREAD_CFLAGS="$flag"
+                ;;
+
+		pthread-config)
+		AC_CHECK_PROG(acx_pthread_config, pthread-config, yes, no)
+		if test x"$acx_pthread_config" = xno; then continue; fi
+		PTHREAD_CFLAGS="`pthread-config --cflags`"
+		PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
+		;;
+
+                *)
+                AC_MSG_CHECKING([for the pthreads library -l$flag])
+                PTHREAD_LIBS="-l$flag"
+                ;;
+        esac
+
+        save_LIBS="$LIBS"
+        save_CFLAGS="$CFLAGS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Check for various functions.  We must include pthread.h,
+        # since some functions may be macros.  (On the Sequent, we
+        # need a special flag -Kthread to make this header compile.)
+        # We check for pthread_join because it is in -lpthread on IRIX
+        # while pthread_create is in libc.  We check for pthread_attr_init
+        # due to DEC craziness with -lpthreads.  We check for
+        # pthread_cleanup_push because it is one of the few pthread
+        # functions on Solaris that doesn't have a non-functional libc stub.
+        # We try pthread_create on general principles.
+        AC_TRY_LINK([#include <pthread.h>],
+                    [pthread_t th; pthread_join(th, 0);
+                     pthread_attr_init(0); pthread_cleanup_push(0, 0);
+                     pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
+                    [acx_pthread_ok=yes])
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        AC_MSG_RESULT($acx_pthread_ok)
+        if test "x$acx_pthread_ok" = xyes; then
+                break;
+        fi
+
+        PTHREAD_LIBS=""
+        PTHREAD_CFLAGS=""
+done
+fi
+
+# Various other checks:
+if test "x$acx_pthread_ok" = xyes; then
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
+	AC_MSG_CHECKING([for joinable pthread attribute])
+	attr_name=unknown
+	for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
+	    AC_TRY_LINK([#include <pthread.h>], [int attr=$attr; return attr;],
+                        [attr_name=$attr; break])
+	done
+        AC_MSG_RESULT($attr_name)
+        if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then
+            AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $attr_name,
+                               [Define to necessary symbol if this constant
+                                uses a non-standard name on your system.])
+        fi
+
+        AC_MSG_CHECKING([if more special flags are required for pthreads])
+        flag=no
+        case "${host_cpu}-${host_os}" in
+            *-aix* | *-freebsd* | *-darwin*) flag="-D_THREAD_SAFE";;
+            *solaris* | *-osf* | *-hpux*) flag="-D_REENTRANT";;
+        esac
+        AC_MSG_RESULT(${flag})
+        if test "x$flag" != xno; then
+            PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS"
+        fi
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+        # More AIX lossage: must compile with xlc_r or cc_r
+	if test x"$GCC" != xyes; then
+          AC_CHECK_PROGS(PTHREAD_CC, xlc_r cc_r, ${CC})
+        else
+          PTHREAD_CC=$CC
+	fi
+
+	# The next part tries to detect GCC inconsistency with -shared on some
+	# architectures and systems. The problem is that in certain
+	# configurations, when -shared is specified, GCC "forgets" to
+	# internally use various flags which are still necessary.
+	
+	#
+	# Prepare the flags
+	#
+	save_CFLAGS="$CFLAGS"
+	save_LIBS="$LIBS"
+	save_CC="$CC"
+	
+	# Try with the flags determined by the earlier checks.
+	#
+	# -Wl,-z,defs forces link-time symbol resolution, so that the
+	# linking checks with -shared actually have any value
+	#
+	# FIXME: -fPIC is required for -shared on many architectures,
+	# so we specify it here, but the right way would probably be to
+	# properly detect whether it is actually required.
+	CFLAGS="-shared -fPIC -Wl,-z,defs $CFLAGS $PTHREAD_CFLAGS"
+	LIBS="$PTHREAD_LIBS $LIBS"
+	CC="$PTHREAD_CC"
+	
+	# In order not to create several levels of indentation, we test
+	# the value of "$done" until we find the cure or run out of ideas.
+	done="no"
+	
+	# First, make sure the CFLAGS we added are actually accepted by our
+	# compiler.  If not (and OS X's ld, for instance, does not accept -z),
+	# then we can't do this test.
+	if test x"$done" = xno; then
+	   AC_MSG_CHECKING([whether to check for GCC pthread/shared inconsistencies])
+	   AC_TRY_LINK(,, , [done=yes])
+	
+	   if test "x$done" = xyes ; then
+	      AC_MSG_RESULT([no])
+	   else
+	      AC_MSG_RESULT([yes])
+	   fi
+	fi
+	
+	if test x"$done" = xno; then
+	   AC_MSG_CHECKING([whether -pthread is sufficient with -shared])
+	   AC_TRY_LINK([#include <pthread.h>],
+	      [pthread_t th; pthread_join(th, 0);
+	      pthread_attr_init(0); pthread_cleanup_push(0, 0);
+	      pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
+	      [done=yes])
+	   
+	   if test "x$done" = xyes; then
+	      AC_MSG_RESULT([yes])
+	   else
+	      AC_MSG_RESULT([no])
+	   fi
+	fi
+	
+	#
+	# Linux gcc on some architectures such as mips/mipsel forgets
+	# about -lpthread
+	#
+	if test x"$done" = xno; then
+	   AC_MSG_CHECKING([whether -lpthread fixes that])
+	   LIBS="-lpthread $PTHREAD_LIBS $save_LIBS"
+	   AC_TRY_LINK([#include <pthread.h>],
+	      [pthread_t th; pthread_join(th, 0);
+	      pthread_attr_init(0); pthread_cleanup_push(0, 0);
+	      pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
+	      [done=yes])
+	
+	   if test "x$done" = xyes; then
+	      AC_MSG_RESULT([yes])
+	      PTHREAD_LIBS="-lpthread $PTHREAD_LIBS"
+	   else
+	      AC_MSG_RESULT([no])
+	   fi
+	fi
+	#
+	# FreeBSD 4.10 gcc forgets to use -lc_r instead of -lc
+	#
+	if test x"$done" = xno; then
+	   AC_MSG_CHECKING([whether -lc_r fixes that])
+	   LIBS="-lc_r $PTHREAD_LIBS $save_LIBS"
+	   AC_TRY_LINK([#include <pthread.h>],
+	       [pthread_t th; pthread_join(th, 0);
+	        pthread_attr_init(0); pthread_cleanup_push(0, 0);
+	        pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
+	       [done=yes])
+	
+	   if test "x$done" = xyes; then
+	      AC_MSG_RESULT([yes])
+	      PTHREAD_LIBS="-lc_r $PTHREAD_LIBS"
+	   else
+	      AC_MSG_RESULT([no])
+	   fi
+	fi
+	if test x"$done" = xno; then
+	   # OK, we have run out of ideas
+	   AC_MSG_WARN([Impossible to determine how to use pthreads with shared libraries])
+	
+	   # so it's not safe to assume that we may use pthreads
+	   acx_pthread_ok=no
+	fi
+	
+	CFLAGS="$save_CFLAGS"
+	LIBS="$save_LIBS"
+	CC="$save_CC"
+else
+        PTHREAD_CC="$CC"
+fi
+
+AC_SUBST(PTHREAD_LIBS)
+AC_SUBST(PTHREAD_CFLAGS)
+AC_SUBST(PTHREAD_CC)
+
+# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
+if test x"$acx_pthread_ok" = xyes; then
+        ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1])
+        :
+else
+        acx_pthread_ok=no
+        $2
+fi
+AC_LANG_RESTORE
+])dnl ACX_PTHREAD
diff --git a/m4/ax_cuda.m4 b/m4/ax_cuda.m4
index 78f9b982e68b525bc4bff3bd778f8e5c17a97516..d69205a3f410f34e913116cf2e76806c39d55db9 100644
--- a/m4/ax_cuda.m4
+++ b/m4/ax_cuda.m4
@@ -86,7 +86,8 @@ then
 	AC_MSG_RESULT([nvcc version : $NVCC_VERSION])
 	
 	# test if architecture is 64 bits and NVCC version >= 2.3
-        libdir=lib
+        # NOTE: default was lib; changed to lib64 for CUDA 8.0 (all hosts, not just x86_64).
+        libdir=lib64
 	if test "x$host_cpu" = xx86_64 ; then
 	   if test "x$NVCC_VERSION" \> "x2.2" ; then
               libdir=lib64
@@ -143,7 +144,7 @@ then
 			cuModuleLoad(&cuModule, "myModule.cubin");
 			CUdeviceptr devPtr;
 			CUfunction cuFunction;
-			unsigned pitch, width = 250, height = 500;
+			size_t pitch, width = 250, height = 500;
 			cuMemAllocPitch(&devPtr, &pitch,width * sizeof(float), height, 4);
 			cuModuleGetFunction(&cuFunction, cuModule, "myKernel");
 			cuFuncSetBlockShape(cuFunction, 512, 1, 1);
@@ -215,23 +216,23 @@ then
     NVCCFLAGS=" -deviceemu"
 fi
 #
-AS_IF([test "x$want_cuda" = xyes],
-    [AS_IF([test "x$NVCCFLAGS" = x],
-        [dnl generate CUDA code for broad spectrum of devices
-         dnl Note: cc 13 for Tesla
-         dnl Note: cc 20 for Fermi
-	 dnl Note: cc 30 for Kepler K10
-	 dnl Note: cc 35 for Kepler K20
-         NVCCFLAGS=["-gencode arch=compute_10,code=sm_10 \
- -gencode arch=compute_11,code=sm_11 \
- -gencode arch=compute_13,code=sm_13 \
- -gencode arch=compute_20,code=sm_20 \
- -gencode arch=compute_30,code=sm_30 \
- -gencode arch=compute_35,code=sm_35"]
-                ]
-             )
-            ]
-        )
+#AS_IF([test "x$want_cuda" = xyes],
+#    [AS_IF([test "x$NVCCFLAGS" = x],
+#        [dnl generate CUDA code for broad spectrum of devices
+#         dnl Note: cc 13 for Tesla
+#         dnl Note: cc 20 for Fermi
+#	 dnl Note: cc 30 for Kepler K10
+#	 dnl Note: cc 35 for Kepler K20
+#         NVCCFLAGS=["-gencode arch=compute_10,code=sm_10 \
+# -gencode arch=compute_11,code=sm_11 \
+# -gencode arch=compute_13,code=sm_13 \
+# -gencode arch=compute_20,code=sm_20 \
+# -gencode arch=compute_30,code=sm_30 \
+# -gencode arch=compute_35,code=sm_35"]
+#                ]
+#             )
+#            ]
+#        )
 if test x$want_fast_math = xyes
 then
 	NVCCFLAGS+=" -use_fast_math"
diff --git a/m4/libtool.m4 b/m4/libtool.m4
index f12cfdf0b48ee2153b5e1747dc0315e9eca9d169..a644432f431895cff03337b765a7389c7fd0c221 100644
--- a/m4/libtool.m4
+++ b/m4/libtool.m4
@@ -1,8 +1,6 @@
 # libtool.m4 - Configure libtool for the host system. -*-Autoconf-*-
 #
-#   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005,
-#                 2006, 2007, 2008, 2009, 2010, 2011 Free Software
-#                 Foundation, Inc.
+#   Copyright (C) 1996-2001, 2003-2015 Free Software Foundation, Inc.
 #   Written by Gordon Matzigkeit, 1996
 #
 # This file is free software; the Free Software Foundation gives
@@ -10,36 +8,30 @@
 # modifications, as long as this notice is preserved.
 
 m4_define([_LT_COPYING], [dnl
-#   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005,
-#                 2006, 2007, 2008, 2009, 2010, 2011 Free Software
-#                 Foundation, Inc.
-#   Written by Gordon Matzigkeit, 1996
-#
-#   This file is part of GNU Libtool.
-#
-# GNU Libtool is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License as
-# published by the Free Software Foundation; either version 2 of
-# the License, or (at your option) any later version.
+# Copyright (C) 2014 Free Software Foundation, Inc.
+# This is free software; see the source for copying conditions.  There is NO
+# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+# GNU Libtool is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
 #
-# As a special exception to the GNU General Public License,
-# if you distribute this file as part of a program or library that
-# is built using GNU Libtool, you may include this file under the
-# same distribution terms that you use for the rest of that program.
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program or library that is built
+# using GNU Libtool, you may include this file under the  same
+# distribution terms that you use for the rest of that program.
 #
-# GNU Libtool is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# GNU Libtool is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
-# along with GNU Libtool; see the file COPYING.  If not, a copy
-# can be downloaded from http://www.gnu.org/licenses/gpl.html, or
-# obtained by writing to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ])
 
-# serial 57 LT_INIT
+# serial 58 LT_INIT
 
 
 # LT_PREREQ(VERSION)
@@ -67,7 +59,7 @@ esac
 # LT_INIT([OPTIONS])
 # ------------------
 AC_DEFUN([LT_INIT],
-[AC_PREREQ([2.58])dnl We use AC_INCLUDES_DEFAULT
+[AC_PREREQ([2.62])dnl We use AC_PATH_PROGS_FEATURE_CHECK
 AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
 AC_BEFORE([$0], [LT_LANG])dnl
 AC_BEFORE([$0], [LT_OUTPUT])dnl
@@ -91,7 +83,7 @@ dnl Parse OPTIONS
 _LT_SET_OPTIONS([$0], [$1])
 
 # This can be used to rebuild libtool when needed
-LIBTOOL_DEPS="$ltmain"
+LIBTOOL_DEPS=$ltmain
 
 # Always use our own libtool.
 LIBTOOL='$(SHELL) $(top_builddir)/libtool'
@@ -111,26 +103,43 @@ dnl AC_DEFUN([AC_PROG_LIBTOOL], [])
 dnl AC_DEFUN([AM_PROG_LIBTOOL], [])
 
 
+# _LT_PREPARE_CC_BASENAME
+# -----------------------
+m4_defun([_LT_PREPARE_CC_BASENAME], [
+# Calculate cc_basename.  Skip known compiler wrappers and cross-prefix.
+func_cc_basename ()
+{
+    for cc_temp in @S|@*""; do
+      case $cc_temp in
+        compile | *[[\\/]]compile | ccache | *[[\\/]]ccache ) ;;
+        distcc | *[[\\/]]distcc | purify | *[[\\/]]purify ) ;;
+        \-*) ;;
+        *) break;;
+      esac
+    done
+    func_cc_basename_result=`$ECHO "$cc_temp" | $SED "s%.*/%%; s%^$host_alias-%%"`
+}
+])# _LT_PREPARE_CC_BASENAME
+
+
 # _LT_CC_BASENAME(CC)
 # -------------------
-# Calculate cc_basename.  Skip known compiler wrappers and cross-prefix.
+# It would be clearer to call AC_REQUIREs from _LT_PREPARE_CC_BASENAME,
+# but that macro is also expanded into the generated libtool script, which
+# arranges for $SED and $ECHO to be set by different means.
 m4_defun([_LT_CC_BASENAME],
-[for cc_temp in $1""; do
-  case $cc_temp in
-    compile | *[[\\/]]compile | ccache | *[[\\/]]ccache ) ;;
-    distcc | *[[\\/]]distcc | purify | *[[\\/]]purify ) ;;
-    \-*) ;;
-    *) break;;
-  esac
-done
-cc_basename=`$ECHO "$cc_temp" | $SED "s%.*/%%; s%^$host_alias-%%"`
+[m4_require([_LT_PREPARE_CC_BASENAME])dnl
+AC_REQUIRE([_LT_DECL_SED])dnl
+AC_REQUIRE([_LT_PROG_ECHO_BACKSLASH])dnl
+func_cc_basename $1
+cc_basename=$func_cc_basename_result
 ])
 
 
 # _LT_FILEUTILS_DEFAULTS
 # ----------------------
 # It is okay to use these file commands and assume they have been set
-# sensibly after `m4_require([_LT_FILEUTILS_DEFAULTS])'.
+# sensibly after 'm4_require([_LT_FILEUTILS_DEFAULTS])'.
 m4_defun([_LT_FILEUTILS_DEFAULTS],
 [: ${CP="cp -f"}
 : ${MV="mv -f"}
@@ -177,15 +186,16 @@ m4_require([_LT_CHECK_SHAREDLIB_FROM_LINKLIB])dnl
 m4_require([_LT_CMD_OLD_ARCHIVE])dnl
 m4_require([_LT_CMD_GLOBAL_SYMBOLS])dnl
 m4_require([_LT_WITH_SYSROOT])dnl
+m4_require([_LT_CMD_TRUNCATE])dnl
 
 _LT_CONFIG_LIBTOOL_INIT([
-# See if we are running on zsh, and set the options which allow our
+# See if we are running on zsh, and set the options that allow our
 # commands through without removal of \ escapes INIT.
-if test -n "\${ZSH_VERSION+set}" ; then
+if test -n "\${ZSH_VERSION+set}"; then
    setopt NO_GLOB_SUBST
 fi
 ])
-if test -n "${ZSH_VERSION+set}" ; then
+if test -n "${ZSH_VERSION+set}"; then
    setopt NO_GLOB_SUBST
 fi
 
@@ -198,7 +208,7 @@ aix3*)
   # AIX sometimes has problems with the GCC collect2 program.  For some
   # reason, if we set the COLLECT_NAMES environment variable, the problems
   # vanish in a puff of smoke.
-  if test "X${COLLECT_NAMES+set}" != Xset; then
+  if test set != "${COLLECT_NAMES+set}"; then
     COLLECT_NAMES=
     export COLLECT_NAMES
   fi
@@ -209,14 +219,14 @@ esac
 ofile=libtool
 can_build_shared=yes
 
-# All known linkers require a `.a' archive for static linking (except MSVC,
+# All known linkers require a '.a' archive for static linking (except MSVC,
 # which needs '.lib').
 libext=a
 
-with_gnu_ld="$lt_cv_prog_gnu_ld"
+with_gnu_ld=$lt_cv_prog_gnu_ld
 
-old_CC="$CC"
-old_CFLAGS="$CFLAGS"
+old_CC=$CC
+old_CFLAGS=$CFLAGS
 
 # Set sane defaults for various variables
 test -z "$CC" && CC=cc
@@ -269,14 +279,14 @@ no_glob_subst='s/\*/\\\*/g'
 
 # _LT_PROG_LTMAIN
 # ---------------
-# Note that this code is called both from `configure', and `config.status'
+# Note that this code is called both from 'configure', and 'config.status'
 # now that we use AC_CONFIG_COMMANDS to generate libtool.  Notably,
-# `config.status' has no value for ac_aux_dir unless we are using Automake,
+# 'config.status' has no value for ac_aux_dir unless we are using Automake,
 # so we pass a copy along to make sure it has a sensible value anyway.
 m4_defun([_LT_PROG_LTMAIN],
 [m4_ifdef([AC_REQUIRE_AUX_FILE], [AC_REQUIRE_AUX_FILE([ltmain.sh])])dnl
 _LT_CONFIG_LIBTOOL_INIT([ac_aux_dir='$ac_aux_dir'])
-ltmain="$ac_aux_dir/ltmain.sh"
+ltmain=$ac_aux_dir/ltmain.sh
 ])# _LT_PROG_LTMAIN
 
 
@@ -286,7 +296,7 @@ ltmain="$ac_aux_dir/ltmain.sh"
 
 # So that we can recreate a full libtool script including additional
 # tags, we accumulate the chunks of code to send to AC_CONFIG_COMMANDS
-# in macros and then make a single call at the end using the `libtool'
+# in macros and then make a single call at the end using the 'libtool'
 # label.
 
 
@@ -421,8 +431,8 @@ m4_define([_lt_decl_all_varnames],
 
 # _LT_CONFIG_STATUS_DECLARE([VARNAME])
 # ------------------------------------
-# Quote a variable value, and forward it to `config.status' so that its
-# declaration there will have the same value as in `configure'.  VARNAME
+# Quote a variable value, and forward it to 'config.status' so that its
+# declaration there will have the same value as in 'configure'.  VARNAME
 # must have a single quote delimited value for this to work.
 m4_define([_LT_CONFIG_STATUS_DECLARE],
 [$1='`$ECHO "$][$1" | $SED "$delay_single_quote_subst"`'])
@@ -446,7 +456,7 @@ m4_defun([_LT_CONFIG_STATUS_DECLARATIONS],
 # Output comment and list of tags supported by the script
 m4_defun([_LT_LIBTOOL_TAGS],
 [_LT_FORMAT_COMMENT([The names of the tagged configurations supported by this script])dnl
-available_tags="_LT_TAGS"dnl
+available_tags='_LT_TAGS'dnl
 ])
 
 
@@ -474,7 +484,7 @@ m4_ifval([$2], [_$2])[]m4_popdef([_libtool_name])[]dnl
 # _LT_LIBTOOL_CONFIG_VARS
 # -----------------------
 # Produce commented declarations of non-tagged libtool config variables
-# suitable for insertion in the LIBTOOL CONFIG section of the `libtool'
+# suitable for insertion in the LIBTOOL CONFIG section of the 'libtool'
 # script.  Tagged libtool config variables (even for the LIBTOOL CONFIG
 # section) are produced by _LT_LIBTOOL_TAG_VARS.
 m4_defun([_LT_LIBTOOL_CONFIG_VARS],
@@ -500,8 +510,8 @@ m4_define([_LT_TAGVAR], [m4_ifval([$2], [$1_$2], [$1])])
 # Send accumulated output to $CONFIG_STATUS.  Thanks to the lists of
 # variables for single and double quote escaping we saved from calls
 # to _LT_DECL, we can put quote escaped variables declarations
-# into `config.status', and then the shell code to quote escape them in
-# for loops in `config.status'.  Finally, any additional code accumulated
+# into 'config.status', and then the shell code to quote escape them in
+# for loops in 'config.status'.  Finally, any additional code accumulated
 # from calls to _LT_CONFIG_LIBTOOL_INIT is expanded.
 m4_defun([_LT_CONFIG_COMMANDS],
 [AC_PROVIDE_IFELSE([LT_OUTPUT],
@@ -547,7 +557,7 @@ for var in lt_decl_all_varnames([[ \
 ]], lt_decl_quote_varnames); do
     case \`eval \\\\\$ECHO \\\\""\\\\\$\$var"\\\\"\` in
     *[[\\\\\\\`\\"\\\$]]*)
-      eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED \\"\\\$sed_quote_subst\\"\\\`\\\\\\""
+      eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED \\"\\\$sed_quote_subst\\"\\\`\\\\\\"" ## exclude from sc_prohibit_nested_quotes
       ;;
     *)
       eval "lt_\$var=\\\\\\"\\\$\$var\\\\\\""
@@ -560,7 +570,7 @@ for var in lt_decl_all_varnames([[ \
 ]], lt_decl_dquote_varnames); do
     case \`eval \\\\\$ECHO \\\\""\\\\\$\$var"\\\\"\` in
     *[[\\\\\\\`\\"\\\$]]*)
-      eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED -e \\"\\\$double_quote_subst\\" -e \\"\\\$sed_quote_subst\\" -e \\"\\\$delay_variable_subst\\"\\\`\\\\\\""
+      eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED -e \\"\\\$double_quote_subst\\" -e \\"\\\$sed_quote_subst\\" -e \\"\\\$delay_variable_subst\\"\\\`\\\\\\"" ## exclude from sc_prohibit_nested_quotes
       ;;
     *)
       eval "lt_\$var=\\\\\\"\\\$\$var\\\\\\""
@@ -576,7 +586,7 @@ _LT_OUTPUT_LIBTOOL_INIT
 # Generate a child script FILE with all initialization necessary to
 # reuse the environment learned by the parent script, and make the
 # file executable.  If COMMENT is supplied, it is inserted after the
-# `#!' sequence but before initialization text begins.  After this
+# '#!' sequence but before initialization text begins.  After this
 # macro, additional text can be appended to FILE to form the body of
 # the child script.  The macro ends with non-zero status if the
 # file could not be fully written (such as if the disk is full).
@@ -598,7 +608,7 @@ AS_SHELL_SANITIZE
 _AS_PREPARE
 exec AS_MESSAGE_FD>&1
 _ASEOF
-test $lt_write_fail = 0 && chmod +x $1[]dnl
+test 0 = "$lt_write_fail" && chmod +x $1[]dnl
 m4_popdef([AS_MESSAGE_LOG_FD])])])# _LT_GENERATED_FILE_INIT
 
 # LT_OUTPUT
@@ -621,7 +631,7 @@ exec AS_MESSAGE_LOG_FD>>config.log
 } >&AS_MESSAGE_LOG_FD
 
 lt_cl_help="\
-\`$as_me' creates a local libtool stub from the current configuration,
+'$as_me' creates a local libtool stub from the current configuration,
 for use in further configure time tests before the real libtool is
 generated.
 
@@ -643,7 +653,7 @@ Copyright (C) 2011 Free Software Foundation, Inc.
 This config.lt script is free software; the Free Software Foundation
 gives unlimited permision to copy, distribute and modify it."
 
-while test $[#] != 0
+while test 0 != $[#]
 do
   case $[1] in
     --version | --v* | -V )
@@ -656,10 +666,10 @@ do
       lt_cl_silent=: ;;
 
     -*) AC_MSG_ERROR([unrecognized option: $[1]
-Try \`$[0] --help' for more information.]) ;;
+Try '$[0] --help' for more information.]) ;;
 
     *) AC_MSG_ERROR([unrecognized argument: $[1]
-Try \`$[0] --help' for more information.]) ;;
+Try '$[0] --help' for more information.]) ;;
   esac
   shift
 done
@@ -685,7 +695,7 @@ chmod +x "$CONFIG_LT"
 # open by configure.  Here we exec the FD to /dev/null, effectively closing
 # config.log, so it can be properly (re)opened and appended to by config.lt.
 lt_cl_success=:
-test "$silent" = yes &&
+test yes = "$silent" &&
   lt_config_lt_args="$lt_config_lt_args --quiet"
 exec AS_MESSAGE_LOG_FD>/dev/null
 $SHELL "$CONFIG_LT" $lt_config_lt_args || lt_cl_success=false
@@ -705,32 +715,47 @@ m4_defun([_LT_CONFIG],
 _LT_CONFIG_SAVE_COMMANDS([
   m4_define([_LT_TAG], m4_if([$1], [], [C], [$1]))dnl
   m4_if(_LT_TAG, [C], [
-    # See if we are running on zsh, and set the options which allow our
+    # See if we are running on zsh, and set the options that allow our
     # commands through without removal of \ escapes.
-    if test -n "${ZSH_VERSION+set}" ; then
+    if test -n "${ZSH_VERSION+set}"; then
       setopt NO_GLOB_SUBST
     fi
 
-    cfgfile="${ofile}T"
+    cfgfile=${ofile}T
     trap "$RM \"$cfgfile\"; exit 1" 1 2 15
     $RM "$cfgfile"
 
     cat <<_LT_EOF >> "$cfgfile"
 #! $SHELL
-
-# `$ECHO "$ofile" | sed 's%^.*/%%'` - Provide generalized library-building support services.
-# Generated automatically by $as_me ($PACKAGE$TIMESTAMP) $VERSION
+# Generated automatically by $as_me ($PACKAGE) $VERSION
 # Libtool was configured on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
 # NOTE: Changes made to this file will be lost: look at ltmain.sh.
-#
+
+# Provide generalized library-building support services.
+# Written by Gordon Matzigkeit, 1996
+
 _LT_COPYING
 _LT_LIBTOOL_TAGS
 
+# Configured defaults for sys_lib_dlsearch_path munging.
+: \${LT_SYS_LIBRARY_PATH="$configure_time_lt_sys_library_path"}
+
 # ### BEGIN LIBTOOL CONFIG
 _LT_LIBTOOL_CONFIG_VARS
 _LT_LIBTOOL_TAG_VARS
 # ### END LIBTOOL CONFIG
 
+_LT_EOF
+
+    cat <<'_LT_EOF' >> "$cfgfile"
+
+# ### BEGIN FUNCTIONS SHARED WITH CONFIGURE
+
+_LT_PREPARE_MUNGE_PATH_LIST
+_LT_PREPARE_CC_BASENAME
+
+# ### END FUNCTIONS SHARED WITH CONFIGURE
+
 _LT_EOF
 
   case $host_os in
@@ -739,7 +764,7 @@ _LT_EOF
 # AIX sometimes has problems with the GCC collect2 program.  For some
 # reason, if we set the COLLECT_NAMES environment variable, the problems
 # vanish in a puff of smoke.
-if test "X${COLLECT_NAMES+set}" != Xset; then
+if test set != "${COLLECT_NAMES+set}"; then
   COLLECT_NAMES=
   export COLLECT_NAMES
 fi
@@ -756,8 +781,6 @@ _LT_EOF
   sed '$q' "$ltmain" >> "$cfgfile" \
      || (rm -f "$cfgfile"; exit 1)
 
-  _LT_PROG_REPLACE_SHELLFNS
-
    mv -f "$cfgfile" "$ofile" ||
     (rm -f "$ofile" && cp "$cfgfile" "$ofile" && rm -f "$cfgfile")
   chmod +x "$ofile"
@@ -775,7 +798,6 @@ _LT_EOF
 [m4_if([$1], [], [
     PACKAGE='$PACKAGE'
     VERSION='$VERSION'
-    TIMESTAMP='$TIMESTAMP'
     RM='$RM'
     ofile='$ofile'], [])
 ])dnl /_LT_CONFIG_SAVE_COMMANDS
@@ -974,7 +996,7 @@ m4_defun_once([_LT_REQUIRED_DARWIN_CHECKS],[
 
     AC_CACHE_CHECK([for -single_module linker flag],[lt_cv_apple_cc_single_mod],
       [lt_cv_apple_cc_single_mod=no
-      if test -z "${LT_MULTI_MODULE}"; then
+      if test -z "$LT_MULTI_MODULE"; then
 	# By default we will add the -single_module flag. You can override
 	# by either setting the environment variable LT_MULTI_MODULE
 	# non-empty at configure time, or by adding -multi_module to the
@@ -992,7 +1014,7 @@ m4_defun_once([_LT_REQUIRED_DARWIN_CHECKS],[
 	  cat conftest.err >&AS_MESSAGE_LOG_FD
 	# Otherwise, if the output was created with a 0 exit code from
 	# the compiler, it worked.
-	elif test -f libconftest.dylib && test $_lt_result -eq 0; then
+	elif test -f libconftest.dylib && test 0 = "$_lt_result"; then
 	  lt_cv_apple_cc_single_mod=yes
 	else
 	  cat conftest.err >&AS_MESSAGE_LOG_FD
@@ -1010,7 +1032,7 @@ m4_defun_once([_LT_REQUIRED_DARWIN_CHECKS],[
       AC_LINK_IFELSE([AC_LANG_PROGRAM([],[])],
 	[lt_cv_ld_exported_symbols_list=yes],
 	[lt_cv_ld_exported_symbols_list=no])
-	LDFLAGS="$save_LDFLAGS"
+	LDFLAGS=$save_LDFLAGS
     ])
 
     AC_CACHE_CHECK([for -force_load linker flag],[lt_cv_ld_force_load],
@@ -1032,7 +1054,7 @@ _LT_EOF
       _lt_result=$?
       if test -s conftest.err && $GREP force_load conftest.err; then
 	cat conftest.err >&AS_MESSAGE_LOG_FD
-      elif test -f conftest && test $_lt_result -eq 0 && $GREP forced_load conftest >/dev/null 2>&1 ; then
+      elif test -f conftest && test 0 = "$_lt_result" && $GREP forced_load conftest >/dev/null 2>&1; then
 	lt_cv_ld_force_load=yes
       else
 	cat conftest.err >&AS_MESSAGE_LOG_FD
@@ -1042,32 +1064,32 @@ _LT_EOF
     ])
     case $host_os in
     rhapsody* | darwin1.[[012]])
-      _lt_dar_allow_undefined='${wl}-undefined ${wl}suppress' ;;
+      _lt_dar_allow_undefined='$wl-undefined ${wl}suppress' ;;
     darwin1.*)
-      _lt_dar_allow_undefined='${wl}-flat_namespace ${wl}-undefined ${wl}suppress' ;;
+      _lt_dar_allow_undefined='$wl-flat_namespace $wl-undefined ${wl}suppress' ;;
     darwin*) # darwin 5.x on
       # if running on 10.5 or later, the deployment target defaults
       # to the OS version, if on x86, and 10.4, the deployment
       # target defaults to 10.4. Don't you love it?
       case ${MACOSX_DEPLOYMENT_TARGET-10.0},$host in
 	10.0,*86*-darwin8*|10.0,*-darwin[[91]]*)
-	  _lt_dar_allow_undefined='${wl}-undefined ${wl}dynamic_lookup' ;;
-	10.[[012]]*)
-	  _lt_dar_allow_undefined='${wl}-flat_namespace ${wl}-undefined ${wl}suppress' ;;
+	  _lt_dar_allow_undefined='$wl-undefined ${wl}dynamic_lookup' ;;
+	10.[[012]][[,.]]*)
+	  _lt_dar_allow_undefined='$wl-flat_namespace $wl-undefined ${wl}suppress' ;;
 	10.*)
-	  _lt_dar_allow_undefined='${wl}-undefined ${wl}dynamic_lookup' ;;
+	  _lt_dar_allow_undefined='$wl-undefined ${wl}dynamic_lookup' ;;
       esac
     ;;
   esac
-    if test "$lt_cv_apple_cc_single_mod" = "yes"; then
+    if test yes = "$lt_cv_apple_cc_single_mod"; then
       _lt_dar_single_mod='$single_module'
     fi
-    if test "$lt_cv_ld_exported_symbols_list" = "yes"; then
-      _lt_dar_export_syms=' ${wl}-exported_symbols_list,$output_objdir/${libname}-symbols.expsym'
+    if test yes = "$lt_cv_ld_exported_symbols_list"; then
+      _lt_dar_export_syms=' $wl-exported_symbols_list,$output_objdir/$libname-symbols.expsym'
     else
-      _lt_dar_export_syms='~$NMEDIT -s $output_objdir/${libname}-symbols.expsym ${lib}'
+      _lt_dar_export_syms='~$NMEDIT -s $output_objdir/$libname-symbols.expsym $lib'
     fi
-    if test "$DSYMUTIL" != ":" && test "$lt_cv_ld_force_load" = "no"; then
+    if test : != "$DSYMUTIL" && test no = "$lt_cv_ld_force_load"; then
       _lt_dsymutil='~$DSYMUTIL $lib || :'
     else
       _lt_dsymutil=
@@ -1087,29 +1109,29 @@ m4_defun([_LT_DARWIN_LINKER_FEATURES],
   _LT_TAGVAR(hardcode_direct, $1)=no
   _LT_TAGVAR(hardcode_automatic, $1)=yes
   _LT_TAGVAR(hardcode_shlibpath_var, $1)=unsupported
-  if test "$lt_cv_ld_force_load" = "yes"; then
-    _LT_TAGVAR(whole_archive_flag_spec, $1)='`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience ${wl}-force_load,$conv\"; done; func_echo_all \"$new_convenience\"`'
+  if test yes = "$lt_cv_ld_force_load"; then
+    _LT_TAGVAR(whole_archive_flag_spec, $1)='`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience $wl-force_load,$conv\"; done; func_echo_all \"$new_convenience\"`'
     m4_case([$1], [F77], [_LT_TAGVAR(compiler_needs_object, $1)=yes],
                   [FC],  [_LT_TAGVAR(compiler_needs_object, $1)=yes])
   else
     _LT_TAGVAR(whole_archive_flag_spec, $1)=''
   fi
   _LT_TAGVAR(link_all_deplibs, $1)=yes
-  _LT_TAGVAR(allow_undefined_flag, $1)="$_lt_dar_allow_undefined"
+  _LT_TAGVAR(allow_undefined_flag, $1)=$_lt_dar_allow_undefined
   case $cc_basename in
-     ifort*) _lt_dar_can_shared=yes ;;
+     ifort*|nagfor*) _lt_dar_can_shared=yes ;;
      *) _lt_dar_can_shared=$GCC ;;
   esac
-  if test "$_lt_dar_can_shared" = "yes"; then
+  if test yes = "$_lt_dar_can_shared"; then
     output_verbose_link_cmd=func_echo_all
-    _LT_TAGVAR(archive_cmds, $1)="\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring $_lt_dar_single_mod${_lt_dsymutil}"
-    _LT_TAGVAR(module_cmds, $1)="\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dsymutil}"
-    _LT_TAGVAR(archive_expsym_cmds, $1)="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring ${_lt_dar_single_mod}${_lt_dar_export_syms}${_lt_dsymutil}"
-    _LT_TAGVAR(module_expsym_cmds, $1)="sed -e 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dar_export_syms}${_lt_dsymutil}"
+    _LT_TAGVAR(archive_cmds, $1)="\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring $_lt_dar_single_mod$_lt_dsymutil"
+    _LT_TAGVAR(module_cmds, $1)="\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags$_lt_dsymutil"
+    _LT_TAGVAR(archive_expsym_cmds, $1)="sed 's|^|_|' < \$export_symbols > \$output_objdir/\$libname-symbols.expsym~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring $_lt_dar_single_mod$_lt_dar_export_syms$_lt_dsymutil"
+    _LT_TAGVAR(module_expsym_cmds, $1)="sed -e 's|^|_|' < \$export_symbols > \$output_objdir/\$libname-symbols.expsym~\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags$_lt_dar_export_syms$_lt_dsymutil"
     m4_if([$1], [CXX],
-[   if test "$lt_cv_apple_cc_single_mod" != "yes"; then
-      _LT_TAGVAR(archive_cmds, $1)="\$CC -r -keep_private_externs -nostdlib -o \${lib}-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \${lib}-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring${_lt_dsymutil}"
-      _LT_TAGVAR(archive_expsym_cmds, $1)="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -r -keep_private_externs -nostdlib -o \${lib}-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \${lib}-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring${_lt_dar_export_syms}${_lt_dsymutil}"
+[   if test yes != "$lt_cv_apple_cc_single_mod"; then
+      _LT_TAGVAR(archive_cmds, $1)="\$CC -r -keep_private_externs -nostdlib -o \$lib-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$lib-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring$_lt_dsymutil"
+      _LT_TAGVAR(archive_expsym_cmds, $1)="sed 's|^|_|' < \$export_symbols > \$output_objdir/\$libname-symbols.expsym~\$CC -r -keep_private_externs -nostdlib -o \$lib-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$lib-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring$_lt_dar_export_syms$_lt_dsymutil"
     fi
 ],[])
   else
@@ -1129,7 +1151,7 @@ m4_defun([_LT_DARWIN_LINKER_FEATURES],
 # Allow to override them for all tags through lt_cv_aix_libpath.
 m4_defun([_LT_SYS_MODULE_PATH_AIX],
 [m4_require([_LT_DECL_SED])dnl
-if test "${lt_cv_aix_libpath+set}" = set; then
+if test set = "${lt_cv_aix_libpath+set}"; then
   aix_libpath=$lt_cv_aix_libpath
 else
   AC_CACHE_VAL([_LT_TAGVAR([lt_cv_aix_libpath_], [$1])],
@@ -1147,7 +1169,7 @@ else
     _LT_TAGVAR([lt_cv_aix_libpath_], [$1])=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
   fi],[])
   if test -z "$_LT_TAGVAR([lt_cv_aix_libpath_], [$1])"; then
-    _LT_TAGVAR([lt_cv_aix_libpath_], [$1])="/usr/lib:/lib"
+    _LT_TAGVAR([lt_cv_aix_libpath_], [$1])=/usr/lib:/lib
   fi
   ])
   aix_libpath=$_LT_TAGVAR([lt_cv_aix_libpath_], [$1])
@@ -1167,8 +1189,8 @@ m4_define([_LT_SHELL_INIT],
 # -----------------------
 # Find how we can fake an echo command that does not interpret backslash.
 # In particular, with Autoconf 2.60 or later we add some code to the start
-# of the generated configure script which will find a shell with a builtin
-# printf (which we can use as an echo command).
+# of the generated configure script that will find a shell with a builtin
+# printf (that we can use as an echo command).
 m4_defun([_LT_PROG_ECHO_BACKSLASH],
 [ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
 ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO
@@ -1196,10 +1218,10 @@ fi
 # Invoke $ECHO with all args, space-separated.
 func_echo_all ()
 {
-    $ECHO "$*" 
+    $ECHO "$*"
 }
 
-case "$ECHO" in
+case $ECHO in
   printf*) AC_MSG_RESULT([printf]) ;;
   print*) AC_MSG_RESULT([print -r]) ;;
   *) AC_MSG_RESULT([cat]) ;;
@@ -1225,16 +1247,17 @@ _LT_DECL([], [ECHO], [1], [An echo program that protects backslashes])
 AC_DEFUN([_LT_WITH_SYSROOT],
 [AC_MSG_CHECKING([for sysroot])
 AC_ARG_WITH([sysroot],
-[  --with-sysroot[=DIR] Search for dependent libraries within DIR
-                        (or the compiler's sysroot if not specified).],
+[AS_HELP_STRING([--with-sysroot@<:@=DIR@:>@],
+  [Search for dependent libraries within DIR (or the compiler's sysroot
+   if not specified).])],
 [], [with_sysroot=no])
 
 dnl lt_sysroot will always be passed unquoted.  We quote it here
 dnl in case the user passed a directory name.
 lt_sysroot=
-case ${with_sysroot} in #(
+case $with_sysroot in #(
  yes)
-   if test "$GCC" = yes; then
+   if test yes = "$GCC"; then
      lt_sysroot=`$CC --print-sysroot 2>/dev/null`
    fi
    ;; #(
@@ -1244,14 +1267,14 @@ case ${with_sysroot} in #(
  no|'')
    ;; #(
  *)
-   AC_MSG_RESULT([${with_sysroot}])
+   AC_MSG_RESULT([$with_sysroot])
    AC_MSG_ERROR([The sysroot must be an absolute path.])
    ;;
 esac
 
  AC_MSG_RESULT([${lt_sysroot:-no}])
 _LT_DECL([], [lt_sysroot], [0], [The root where to search for ]dnl
-[dependent libraries, and in which our libraries should be installed.])])
+[dependent libraries, and where our libraries should be installed.])])
 
 # _LT_ENABLE_LOCK
 # ---------------
@@ -1259,31 +1282,33 @@ m4_defun([_LT_ENABLE_LOCK],
 [AC_ARG_ENABLE([libtool-lock],
   [AS_HELP_STRING([--disable-libtool-lock],
     [avoid locking (might break parallel builds)])])
-test "x$enable_libtool_lock" != xno && enable_libtool_lock=yes
+test no = "$enable_libtool_lock" || enable_libtool_lock=yes
 
 # Some flags need to be propagated to the compiler or linker for good
 # libtool support.
 case $host in
 ia64-*-hpux*)
-  # Find out which ABI we are using.
+  # Find out what ABI is being produced by ac_compile, and set mode
+  # options accordingly.
   echo 'int i;' > conftest.$ac_ext
   if AC_TRY_EVAL(ac_compile); then
     case `/usr/bin/file conftest.$ac_objext` in
       *ELF-32*)
-	HPUX_IA64_MODE="32"
+	HPUX_IA64_MODE=32
 	;;
       *ELF-64*)
-	HPUX_IA64_MODE="64"
+	HPUX_IA64_MODE=64
 	;;
     esac
   fi
   rm -rf conftest*
   ;;
 *-*-irix6*)
-  # Find out which ABI we are using.
+  # Find out what ABI is being produced by ac_compile, and set linker
+  # options accordingly.
   echo '[#]line '$LINENO' "configure"' > conftest.$ac_ext
   if AC_TRY_EVAL(ac_compile); then
-    if test "$lt_cv_prog_gnu_ld" = yes; then
+    if test yes = "$lt_cv_prog_gnu_ld"; then
       case `/usr/bin/file conftest.$ac_objext` in
 	*32-bit*)
 	  LD="${LD-ld} -melf32bsmip"
@@ -1312,9 +1337,46 @@ ia64-*-hpux*)
   rm -rf conftest*
   ;;
 
+mips64*-*linux*)
+  # Find out what ABI is being produced by ac_compile, and set linker
+  # options accordingly.
+  echo '[#]line '$LINENO' "configure"' > conftest.$ac_ext
+  if AC_TRY_EVAL(ac_compile); then
+    emul=elf
+    case `/usr/bin/file conftest.$ac_objext` in
+      *32-bit*)
+	emul="${emul}32"
+	;;
+      *64-bit*)
+	emul="${emul}64"
+	;;
+    esac
+    case `/usr/bin/file conftest.$ac_objext` in
+      *MSB*)
+	emul="${emul}btsmip"
+	;;
+      *LSB*)
+	emul="${emul}ltsmip"
+	;;
+    esac
+    case `/usr/bin/file conftest.$ac_objext` in
+      *N32*)
+	emul="${emul}n32"
+	;;
+    esac
+    LD="${LD-ld} -m $emul"
+  fi
+  rm -rf conftest*
+  ;;
+
 x86_64-*kfreebsd*-gnu|x86_64-*linux*|powerpc*-*linux*| \
 s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
-  # Find out which ABI we are using.
+  # Find out what ABI is being produced by ac_compile, and set linker
+  # options accordingly.  Note that the listed cases only cover the
+  # situations where additional linker options are needed (such as when
+  # doing 32-bit compilation for a host where ld defaults to 64-bit, or
+  # vice versa); the common cases where no linker options are needed do
+  # not appear in the list.
   echo 'int i;' > conftest.$ac_ext
   if AC_TRY_EVAL(ac_compile); then
     case `/usr/bin/file conftest.o` in
@@ -1324,7 +1386,14 @@ s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
 	    LD="${LD-ld} -m elf_i386_fbsd"
 	    ;;
 	  x86_64-*linux*)
-	    LD="${LD-ld} -m elf_i386"
+	    case `/usr/bin/file conftest.o` in
+	      *x86-64*)
+		LD="${LD-ld} -m elf32_x86_64"
+		;;
+	      *)
+		LD="${LD-ld} -m elf_i386"
+		;;
+	    esac
 	    ;;
 	  powerpc64le-*linux*)
 	    LD="${LD-ld} -m elf32lppclinux"
@@ -1369,19 +1438,20 @@ s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
 
 *-*-sco3.2v5*)
   # On SCO OpenServer 5, we need -belf to get full-featured binaries.
-  SAVE_CFLAGS="$CFLAGS"
+  SAVE_CFLAGS=$CFLAGS
   CFLAGS="$CFLAGS -belf"
   AC_CACHE_CHECK([whether the C compiler needs -belf], lt_cv_cc_needs_belf,
     [AC_LANG_PUSH(C)
      AC_LINK_IFELSE([AC_LANG_PROGRAM([[]],[[]])],[lt_cv_cc_needs_belf=yes],[lt_cv_cc_needs_belf=no])
      AC_LANG_POP])
-  if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+  if test yes != "$lt_cv_cc_needs_belf"; then
     # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
-    CFLAGS="$SAVE_CFLAGS"
+    CFLAGS=$SAVE_CFLAGS
   fi
   ;;
 *-*solaris*)
-  # Find out which ABI we are using.
+  # Find out what ABI is being produced by ac_compile, and set linker
+  # options accordingly.
   echo 'int i;' > conftest.$ac_ext
   if AC_TRY_EVAL(ac_compile); then
     case `/usr/bin/file conftest.o` in
@@ -1389,7 +1459,7 @@ s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
       case $lt_cv_prog_gnu_ld in
       yes*)
         case $host in
-        i?86-*-solaris*)
+        i?86-*-solaris*|x86_64-*-solaris*)
           LD="${LD-ld} -m elf_x86_64"
           ;;
         sparc*-*-solaris*)
@@ -1398,7 +1468,7 @@ s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
         esac
         # GNU ld 2.21 introduced _sol2 emulations.  Use them if available.
         if ${LD-ld} -V | grep _sol2 >/dev/null 2>&1; then
-          LD="${LD-ld}_sol2"
+          LD=${LD-ld}_sol2
         fi
         ;;
       *)
@@ -1414,7 +1484,7 @@ s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
   ;;
 esac
 
-need_locks="$enable_libtool_lock"
+need_locks=$enable_libtool_lock
 ])# _LT_ENABLE_LOCK
 
 
@@ -1433,11 +1503,11 @@ AC_CACHE_CHECK([for archiver @FILE support], [lt_cv_ar_at_file],
      [echo conftest.$ac_objext > conftest.lst
       lt_ar_try='$AR $AR_FLAGS libconftest.a @conftest.lst >&AS_MESSAGE_LOG_FD'
       AC_TRY_EVAL([lt_ar_try])
-      if test "$ac_status" -eq 0; then
+      if test 0 -eq "$ac_status"; then
 	# Ensure the archiver fails upon bogus file names.
 	rm -f conftest.$ac_objext libconftest.a
 	AC_TRY_EVAL([lt_ar_try])
-	if test "$ac_status" -ne 0; then
+	if test 0 -ne "$ac_status"; then
           lt_cv_ar_at_file=@
         fi
       fi
@@ -1445,7 +1515,7 @@ AC_CACHE_CHECK([for archiver @FILE support], [lt_cv_ar_at_file],
      ])
   ])
 
-if test "x$lt_cv_ar_at_file" = xno; then
+if test no = "$lt_cv_ar_at_file"; then
   archiver_list_spec=
 else
   archiver_list_spec=$lt_cv_ar_at_file
@@ -1476,7 +1546,7 @@ old_postuninstall_cmds=
 
 if test -n "$RANLIB"; then
   case $host_os in
-  openbsd*)
+  bitrig* | openbsd*)
     old_postinstall_cmds="$old_postinstall_cmds~\$RANLIB -t \$tool_oldlib"
     ;;
   *)
@@ -1512,7 +1582,7 @@ AC_CACHE_CHECK([$1], [$2],
   [$2=no
    m4_if([$4], , [ac_outfile=conftest.$ac_objext], [ac_outfile=$4])
    echo "$lt_simple_compile_test_code" > conftest.$ac_ext
-   lt_compiler_flag="$3"
+   lt_compiler_flag="$3"  ## exclude from sc_useless_quotes_in_assignment
    # Insert the option either (1) after the last *FLAGS variable, or
    # (2) before a word containing "conftest.", or (3) at the end.
    # Note that $ac_compile itself does not contain backslashes and begins
@@ -1539,7 +1609,7 @@ AC_CACHE_CHECK([$1], [$2],
    $RM conftest*
 ])
 
-if test x"[$]$2" = xyes; then
+if test yes = "[$]$2"; then
     m4_if([$5], , :, [$5])
 else
     m4_if([$6], , :, [$6])
@@ -1561,7 +1631,7 @@ AC_DEFUN([_LT_LINKER_OPTION],
 m4_require([_LT_DECL_SED])dnl
 AC_CACHE_CHECK([$1], [$2],
   [$2=no
-   save_LDFLAGS="$LDFLAGS"
+   save_LDFLAGS=$LDFLAGS
    LDFLAGS="$LDFLAGS $3"
    echo "$lt_simple_link_test_code" > conftest.$ac_ext
    if (eval $ac_link 2>conftest.err) && test -s conftest$ac_exeext; then
@@ -1580,10 +1650,10 @@ AC_CACHE_CHECK([$1], [$2],
      fi
    fi
    $RM -r conftest*
-   LDFLAGS="$save_LDFLAGS"
+   LDFLAGS=$save_LDFLAGS
 ])
 
-if test x"[$]$2" = xyes; then
+if test yes = "[$]$2"; then
     m4_if([$4], , :, [$4])
 else
     m4_if([$5], , :, [$5])
@@ -1604,7 +1674,7 @@ AC_DEFUN([LT_CMD_MAX_LEN],
 AC_MSG_CHECKING([the maximum length of command line arguments])
 AC_CACHE_VAL([lt_cv_sys_max_cmd_len], [dnl
   i=0
-  teststring="ABCD"
+  teststring=ABCD
 
   case $build_os in
   msdosdjgpp*)
@@ -1644,7 +1714,7 @@ AC_CACHE_VAL([lt_cv_sys_max_cmd_len], [dnl
     lt_cv_sys_max_cmd_len=8192;
     ;;
 
-  netbsd* | freebsd* | openbsd* | darwin* | dragonfly*)
+  bitrig* | darwin* | dragonfly* | freebsd* | netbsd* | openbsd*)
     # This has been around since 386BSD, at least.  Likely further.
     if test -x /sbin/sysctl; then
       lt_cv_sys_max_cmd_len=`/sbin/sysctl -n kern.argmax`
@@ -1694,22 +1764,23 @@ AC_CACHE_VAL([lt_cv_sys_max_cmd_len], [dnl
     ;;
   *)
     lt_cv_sys_max_cmd_len=`(getconf ARG_MAX) 2> /dev/null`
-    if test -n "$lt_cv_sys_max_cmd_len"; then
+    if test -n "$lt_cv_sys_max_cmd_len" && \
+       test undefined != "$lt_cv_sys_max_cmd_len"; then
       lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 4`
       lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \* 3`
     else
       # Make teststring a little bigger before we do anything with it.
       # a 1K string should be a reasonable start.
-      for i in 1 2 3 4 5 6 7 8 ; do
+      for i in 1 2 3 4 5 6 7 8; do
         teststring=$teststring$teststring
       done
       SHELL=${SHELL-${CONFIG_SHELL-/bin/sh}}
       # If test is not a shell built-in, we'll probably end up computing a
       # maximum length that is only half of the actual maximum length, but
       # we can't tell.
-      while { test "X"`env echo "$teststring$teststring" 2>/dev/null` \
+      while { test X`env echo "$teststring$teststring" 2>/dev/null` \
 	         = "X$teststring$teststring"; } >/dev/null 2>&1 &&
-	      test $i != 17 # 1/2 MB should be enough
+	      test 17 != "$i" # 1/2 MB should be enough
       do
         i=`expr $i + 1`
         teststring=$teststring$teststring
@@ -1725,7 +1796,7 @@ AC_CACHE_VAL([lt_cv_sys_max_cmd_len], [dnl
     ;;
   esac
 ])
-if test -n $lt_cv_sys_max_cmd_len ; then
+if test -n "$lt_cv_sys_max_cmd_len"; then
   AC_MSG_RESULT($lt_cv_sys_max_cmd_len)
 else
   AC_MSG_RESULT(none)
@@ -1753,7 +1824,7 @@ m4_defun([_LT_HEADER_DLFCN],
 # ----------------------------------------------------------------
 m4_defun([_LT_TRY_DLOPEN_SELF],
 [m4_require([_LT_HEADER_DLFCN])dnl
-if test "$cross_compiling" = yes; then :
+if test yes = "$cross_compiling"; then :
   [$4]
 else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
@@ -1800,9 +1871,9 @@ else
 #  endif
 #endif
 
-/* When -fvisbility=hidden is used, assume the code has been annotated
+/* When -fvisibility=hidden is used, assume the code has been annotated
    correspondingly for the symbols needed.  */
-#if defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3))
+#if defined __GNUC__ && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3))
 int fnord () __attribute__((visibility("default")));
 #endif
 
@@ -1828,7 +1899,7 @@ int main ()
   return status;
 }]
 _LT_EOF
-  if AC_TRY_EVAL(ac_link) && test -s conftest${ac_exeext} 2>/dev/null; then
+  if AC_TRY_EVAL(ac_link) && test -s "conftest$ac_exeext" 2>/dev/null; then
     (./conftest; exit; ) >&AS_MESSAGE_LOG_FD 2>/dev/null
     lt_status=$?
     case x$lt_status in
@@ -1849,7 +1920,7 @@ rm -fr conftest*
 # ------------------
 AC_DEFUN([LT_SYS_DLOPEN_SELF],
 [m4_require([_LT_HEADER_DLFCN])dnl
-if test "x$enable_dlopen" != xyes; then
+if test yes != "$enable_dlopen"; then
   enable_dlopen=unknown
   enable_dlopen_self=unknown
   enable_dlopen_self_static=unknown
@@ -1859,44 +1930,52 @@ else
 
   case $host_os in
   beos*)
-    lt_cv_dlopen="load_add_on"
+    lt_cv_dlopen=load_add_on
     lt_cv_dlopen_libs=
     lt_cv_dlopen_self=yes
     ;;
 
   mingw* | pw32* | cegcc*)
-    lt_cv_dlopen="LoadLibrary"
+    lt_cv_dlopen=LoadLibrary
     lt_cv_dlopen_libs=
     ;;
 
   cygwin*)
-    lt_cv_dlopen="dlopen"
+    lt_cv_dlopen=dlopen
     lt_cv_dlopen_libs=
     ;;
 
   darwin*)
-  # if libdl is installed we need to link against it
+    # if libdl is installed we need to link against it
     AC_CHECK_LIB([dl], [dlopen],
-		[lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"],[
-    lt_cv_dlopen="dyld"
+		[lt_cv_dlopen=dlopen lt_cv_dlopen_libs=-ldl],[
+    lt_cv_dlopen=dyld
     lt_cv_dlopen_libs=
     lt_cv_dlopen_self=yes
     ])
     ;;
 
+  tpf*)
+    # Don't try to run any link tests for TPF.  We know it's impossible
+    # because TPF is a cross-compiler, and we know how we open DSOs.
+    lt_cv_dlopen=dlopen
+    lt_cv_dlopen_libs=
+    lt_cv_dlopen_self=no
+    ;;
+
   *)
     AC_CHECK_FUNC([shl_load],
-	  [lt_cv_dlopen="shl_load"],
+	  [lt_cv_dlopen=shl_load],
       [AC_CHECK_LIB([dld], [shl_load],
-	    [lt_cv_dlopen="shl_load" lt_cv_dlopen_libs="-ldld"],
+	    [lt_cv_dlopen=shl_load lt_cv_dlopen_libs=-ldld],
 	[AC_CHECK_FUNC([dlopen],
-	      [lt_cv_dlopen="dlopen"],
+	      [lt_cv_dlopen=dlopen],
 	  [AC_CHECK_LIB([dl], [dlopen],
-		[lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"],
+		[lt_cv_dlopen=dlopen lt_cv_dlopen_libs=-ldl],
 	    [AC_CHECK_LIB([svld], [dlopen],
-		  [lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-lsvld"],
+		  [lt_cv_dlopen=dlopen lt_cv_dlopen_libs=-lsvld],
 	      [AC_CHECK_LIB([dld], [dld_link],
-		    [lt_cv_dlopen="dld_link" lt_cv_dlopen_libs="-ldld"])
+		    [lt_cv_dlopen=dld_link lt_cv_dlopen_libs=-ldld])
 	      ])
 	    ])
 	  ])
@@ -1905,21 +1984,21 @@ else
     ;;
   esac
 
-  if test "x$lt_cv_dlopen" != xno; then
-    enable_dlopen=yes
-  else
+  if test no = "$lt_cv_dlopen"; then
     enable_dlopen=no
+  else
+    enable_dlopen=yes
   fi
 
   case $lt_cv_dlopen in
   dlopen)
-    save_CPPFLAGS="$CPPFLAGS"
-    test "x$ac_cv_header_dlfcn_h" = xyes && CPPFLAGS="$CPPFLAGS -DHAVE_DLFCN_H"
+    save_CPPFLAGS=$CPPFLAGS
+    test yes = "$ac_cv_header_dlfcn_h" && CPPFLAGS="$CPPFLAGS -DHAVE_DLFCN_H"
 
-    save_LDFLAGS="$LDFLAGS"
+    save_LDFLAGS=$LDFLAGS
     wl=$lt_prog_compiler_wl eval LDFLAGS=\"\$LDFLAGS $export_dynamic_flag_spec\"
 
-    save_LIBS="$LIBS"
+    save_LIBS=$LIBS
     LIBS="$lt_cv_dlopen_libs $LIBS"
 
     AC_CACHE_CHECK([whether a program can dlopen itself],
@@ -1929,7 +2008,7 @@ else
 	    lt_cv_dlopen_self=no, lt_cv_dlopen_self=cross)
     ])
 
-    if test "x$lt_cv_dlopen_self" = xyes; then
+    if test yes = "$lt_cv_dlopen_self"; then
       wl=$lt_prog_compiler_wl eval LDFLAGS=\"\$LDFLAGS $lt_prog_compiler_static\"
       AC_CACHE_CHECK([whether a statically linked program can dlopen itself],
 	  lt_cv_dlopen_self_static, [dnl
@@ -1939,9 +2018,9 @@ else
       ])
     fi
 
-    CPPFLAGS="$save_CPPFLAGS"
-    LDFLAGS="$save_LDFLAGS"
-    LIBS="$save_LIBS"
+    CPPFLAGS=$save_CPPFLAGS
+    LDFLAGS=$save_LDFLAGS
+    LIBS=$save_LIBS
     ;;
   esac
 
@@ -2033,8 +2112,8 @@ m4_defun([_LT_COMPILER_FILE_LOCKS],
 m4_require([_LT_FILEUTILS_DEFAULTS])dnl
 _LT_COMPILER_C_O([$1])
 
-hard_links="nottested"
-if test "$_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)" = no && test "$need_locks" != no; then
+hard_links=nottested
+if test no = "$_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)" && test no != "$need_locks"; then
   # do not overwrite the value of need_locks provided by the user
   AC_MSG_CHECKING([if we can lock with hard links])
   hard_links=yes
@@ -2044,8 +2123,8 @@ if test "$_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)" = no && test "$need_locks" !=
   ln conftest.a conftest.b 2>&5 || hard_links=no
   ln conftest.a conftest.b 2>/dev/null && hard_links=no
   AC_MSG_RESULT([$hard_links])
-  if test "$hard_links" = no; then
-    AC_MSG_WARN([`$CC' does not support `-c -o', so `make -j' may be unsafe])
+  if test no = "$hard_links"; then
+    AC_MSG_WARN(['$CC' does not support '-c -o', so 'make -j' may be unsafe])
     need_locks=warn
   fi
 else
@@ -2072,8 +2151,8 @@ objdir=$lt_cv_objdir
 _LT_DECL([], [objdir], [0],
          [The name of the directory that contains temporary libtool files])dnl
 m4_pattern_allow([LT_OBJDIR])dnl
-AC_DEFINE_UNQUOTED(LT_OBJDIR, "$lt_cv_objdir/",
-  [Define to the sub-directory in which libtool stores uninstalled libraries.])
+AC_DEFINE_UNQUOTED([LT_OBJDIR], "$lt_cv_objdir/",
+  [Define to the sub-directory where libtool stores uninstalled libraries.])
 ])# _LT_CHECK_OBJDIR
 
 
@@ -2085,15 +2164,15 @@ m4_defun([_LT_LINKER_HARDCODE_LIBPATH],
 _LT_TAGVAR(hardcode_action, $1)=
 if test -n "$_LT_TAGVAR(hardcode_libdir_flag_spec, $1)" ||
    test -n "$_LT_TAGVAR(runpath_var, $1)" ||
-   test "X$_LT_TAGVAR(hardcode_automatic, $1)" = "Xyes" ; then
+   test yes = "$_LT_TAGVAR(hardcode_automatic, $1)"; then
 
   # We can hardcode non-existent directories.
-  if test "$_LT_TAGVAR(hardcode_direct, $1)" != no &&
+  if test no != "$_LT_TAGVAR(hardcode_direct, $1)" &&
      # If the only mechanism to avoid hardcoding is shlibpath_var, we
      # have to relink, otherwise we might link with an installed library
      # when we should be linking with a yet-to-be-installed one
-     ## test "$_LT_TAGVAR(hardcode_shlibpath_var, $1)" != no &&
-     test "$_LT_TAGVAR(hardcode_minus_L, $1)" != no; then
+     ## test no != "$_LT_TAGVAR(hardcode_shlibpath_var, $1)" &&
+     test no != "$_LT_TAGVAR(hardcode_minus_L, $1)"; then
     # Linking always hardcodes the temporary library directory.
     _LT_TAGVAR(hardcode_action, $1)=relink
   else
@@ -2107,12 +2186,12 @@ else
 fi
 AC_MSG_RESULT([$_LT_TAGVAR(hardcode_action, $1)])
 
-if test "$_LT_TAGVAR(hardcode_action, $1)" = relink ||
-   test "$_LT_TAGVAR(inherit_rpath, $1)" = yes; then
+if test relink = "$_LT_TAGVAR(hardcode_action, $1)" ||
+   test yes = "$_LT_TAGVAR(inherit_rpath, $1)"; then
   # Fast installation is not supported
   enable_fast_install=no
-elif test "$shlibpath_overrides_runpath" = yes ||
-     test "$enable_shared" = no; then
+elif test yes = "$shlibpath_overrides_runpath" ||
+     test no = "$enable_shared"; then
   # Fast installation is not necessary
   enable_fast_install=needless
 fi
@@ -2136,7 +2215,7 @@ else
 # FIXME - insert some real tests, host_os isn't really good enough
   case $host_os in
   darwin*)
-    if test -n "$STRIP" ; then
+    if test -n "$STRIP"; then
       striplib="$STRIP -x"
       old_striplib="$STRIP -S"
       AC_MSG_RESULT([yes])
@@ -2154,6 +2233,47 @@ _LT_DECL([], [striplib], [1])
 ])# _LT_CMD_STRIPLIB
 
 
+# _LT_PREPARE_MUNGE_PATH_LIST
+# ---------------------------
+# Make sure func_munge_path_list() is defined correctly.
+m4_defun([_LT_PREPARE_MUNGE_PATH_LIST],
+[[# func_munge_path_list VARIABLE PATH
+# -----------------------------------
+# VARIABLE is the name of a variable containing a _space_ separated list
+# of directories to be munged by the contents of PATH, which is a string
+# having one of the following formats:
+# "DIR[:DIR]:"
+#       string "DIR[ DIR]" will be prepended to VARIABLE
+# ":DIR[:DIR]"
+#       string "DIR[ DIR]" will be appended to VARIABLE
+# "DIRP[:DIRP]::[DIRA:]DIRA"
+#       string "DIRP[ DIRP]" will be prepended to VARIABLE and string
+#       "DIRA[ DIRA]" will be appended to VARIABLE
+# "DIR[:DIR]"
+#       VARIABLE will be replaced by "DIR[ DIR]"
+func_munge_path_list ()
+{
+    case x@S|@2 in
+    x)
+        ;;
+    *:)
+        eval @S|@1=\"`$ECHO @S|@2 | $SED 's/:/ /g'` \@S|@@S|@1\"
+        ;;
+    x:*)
+        eval @S|@1=\"\@S|@@S|@1 `$ECHO @S|@2 | $SED 's/:/ /g'`\"
+        ;;
+    *::*)
+        eval @S|@1=\"\@S|@@S|@1\ `$ECHO @S|@2 | $SED -e 's/.*:://' -e 's/:/ /g'`\"
+        eval @S|@1=\"`$ECHO @S|@2 | $SED -e 's/::.*//' -e 's/:/ /g'`\ \@S|@@S|@1\"
+        ;;
+    *)
+        eval @S|@1=\"`$ECHO @S|@2 | $SED 's/:/ /g'`\"
+        ;;
+    esac
+}
+]])# _LT_PREPARE_MUNGE_PATH_LIST
+
+
 # _LT_SYS_DYNAMIC_LINKER([TAG])
 # -----------------------------
 # PORTME Fill in your ld.so characteristics
@@ -2164,17 +2284,18 @@ m4_require([_LT_FILEUTILS_DEFAULTS])dnl
 m4_require([_LT_DECL_OBJDUMP])dnl
 m4_require([_LT_DECL_SED])dnl
 m4_require([_LT_CHECK_SHELL_FEATURES])dnl
+m4_require([_LT_PREPARE_MUNGE_PATH_LIST])dnl
 AC_MSG_CHECKING([dynamic linker characteristics])
 m4_if([$1],
 	[], [
-if test "$GCC" = yes; then
+if test yes = "$GCC"; then
   case $host_os in
-    darwin*) lt_awk_arg="/^libraries:/,/LR/" ;;
-    *) lt_awk_arg="/^libraries:/" ;;
+    darwin*) lt_awk_arg='/^libraries:/,/LR/' ;;
+    *) lt_awk_arg='/^libraries:/' ;;
   esac
   case $host_os in
-    mingw* | cegcc*) lt_sed_strip_eq="s,=\([[A-Za-z]]:\),\1,g" ;;
-    *) lt_sed_strip_eq="s,=/,/,g" ;;
+    mingw* | cegcc*) lt_sed_strip_eq='s|=\([[A-Za-z]]:\)|\1|g' ;;
+    *) lt_sed_strip_eq='s|=/|/|g' ;;
   esac
   lt_search_path_spec=`$CC -print-search-dirs | awk $lt_awk_arg | $SED -e "s/^libraries://" -e $lt_sed_strip_eq`
   case $lt_search_path_spec in
@@ -2190,28 +2311,35 @@ if test "$GCC" = yes; then
     ;;
   esac
   # Ok, now we have the path, separated by spaces, we can step through it
-  # and add multilib dir if necessary.
+  # and add multilib dir if necessary...
   lt_tmp_lt_search_path_spec=
-  lt_multi_os_dir=`$CC $CPPFLAGS $CFLAGS $LDFLAGS -print-multi-os-directory 2>/dev/null`
+  lt_multi_os_dir=/`$CC $CPPFLAGS $CFLAGS $LDFLAGS -print-multi-os-directory 2>/dev/null`
+  # ...but if some path component already ends with the multilib dir we assume
+  # that all is fine and trust -print-search-dirs as is (GCC 4.2? or newer).
+  case "$lt_multi_os_dir; $lt_search_path_spec " in
+  "/; "* | "/.; "* | "/./; "* | *"$lt_multi_os_dir "* | *"$lt_multi_os_dir/ "*)
+    lt_multi_os_dir=
+    ;;
+  esac
   for lt_sys_path in $lt_search_path_spec; do
-    if test -d "$lt_sys_path/$lt_multi_os_dir"; then
-      lt_tmp_lt_search_path_spec="$lt_tmp_lt_search_path_spec $lt_sys_path/$lt_multi_os_dir"
-    else
+    if test -d "$lt_sys_path$lt_multi_os_dir"; then
+      lt_tmp_lt_search_path_spec="$lt_tmp_lt_search_path_spec $lt_sys_path$lt_multi_os_dir"
+    elif test -n "$lt_multi_os_dir"; then
       test -d "$lt_sys_path" && \
 	lt_tmp_lt_search_path_spec="$lt_tmp_lt_search_path_spec $lt_sys_path"
     fi
   done
   lt_search_path_spec=`$ECHO "$lt_tmp_lt_search_path_spec" | awk '
-BEGIN {RS=" "; FS="/|\n";} {
-  lt_foo="";
-  lt_count=0;
+BEGIN {RS = " "; FS = "/|\n";} {
+  lt_foo = "";
+  lt_count = 0;
   for (lt_i = NF; lt_i > 0; lt_i--) {
     if ($lt_i != "" && $lt_i != ".") {
       if ($lt_i == "..") {
         lt_count++;
       } else {
         if (lt_count == 0) {
-          lt_foo="/" $lt_i lt_foo;
+          lt_foo = "/" $lt_i lt_foo;
         } else {
           lt_count--;
         }
@@ -2225,7 +2353,7 @@ BEGIN {RS=" "; FS="/|\n";} {
   # for these hosts.
   case $host_os in
     mingw* | cegcc*) lt_search_path_spec=`$ECHO "$lt_search_path_spec" |\
-      $SED 's,/\([[A-Za-z]]:\),\1,g'` ;;
+      $SED 's|/\([[A-Za-z]]:\)|\1|g'` ;;
   esac
   sys_lib_search_path_spec=`$ECHO "$lt_search_path_spec" | $lt_NL2SP`
 else
@@ -2234,7 +2362,7 @@ fi])
 library_names_spec=
 libname_spec='lib$name'
 soname_spec=
-shrext_cmds=".so"
+shrext_cmds=.so
 postinstall_cmds=
 postuninstall_cmds=
 finish_cmds=
@@ -2251,14 +2379,17 @@ hardcode_into_libs=no
 # flags to be left without arguments
 need_version=unknown
 
+AC_ARG_VAR([LT_SYS_LIBRARY_PATH],
+[User-defined run-time library search path.])
+
 case $host_os in
 aix3*)
   version_type=linux # correct to gnu/linux during the next big refactor
-  library_names_spec='${libname}${release}${shared_ext}$versuffix $libname.a'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname.a'
   shlibpath_var=LIBPATH
 
   # AIX 3 has no versioning support, so we append a major version to the name.
-  soname_spec='${libname}${release}${shared_ext}$major'
+  soname_spec='$libname$release$shared_ext$major'
   ;;
 
 aix[[4-9]]*)
@@ -2266,41 +2397,91 @@ aix[[4-9]]*)
   need_lib_prefix=no
   need_version=no
   hardcode_into_libs=yes
-  if test "$host_cpu" = ia64; then
+  if test ia64 = "$host_cpu"; then
     # AIX 5 supports IA64
-    library_names_spec='${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext}$versuffix $libname${shared_ext}'
+    library_names_spec='$libname$release$shared_ext$major $libname$release$shared_ext$versuffix $libname$shared_ext'
     shlibpath_var=LD_LIBRARY_PATH
   else
     # With GCC up to 2.95.x, collect2 would create an import file
     # for dependence libraries.  The import file would start with
-    # the line `#! .'.  This would cause the generated library to
-    # depend on `.', always an invalid library.  This was fixed in
+    # the line '#! .'.  This would cause the generated library to
+    # depend on '.', always an invalid library.  This was fixed in
     # development snapshots of GCC prior to 3.0.
     case $host_os in
       aix4 | aix4.[[01]] | aix4.[[01]].*)
       if { echo '#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 97)'
 	   echo ' yes '
-	   echo '#endif'; } | ${CC} -E - | $GREP yes > /dev/null; then
+	   echo '#endif'; } | $CC -E - | $GREP yes > /dev/null; then
 	:
       else
 	can_build_shared=no
       fi
       ;;
     esac
-    # AIX (on Power*) has no versioning support, so currently we can not hardcode correct
+    # Using Import Files as archive members, it is possible to support
+    # filename-based versioning of shared library archives on AIX. While
+    # this would work both with and without runtime linking, it would
+    # prevent static linking of such archives. So we do filename-based
+    # shared library versioning with .so extension only, which is used
+    # when both runtime linking and shared linking are enabled.
+    # Unfortunately, runtime linking may impact performance, so we do
+    # not want this to be the default eventually. Also, we use the
+    # versioned .so libs for executables only if there is the -brtl
+    # linker flag in LDFLAGS as well, or --with-aix-soname=svr4 only.
+    # To allow for filename-based versioning support, we need to create
+    # libNAME.so.V as an archive file, containing:
+    # *) an Import File, referring to the versioned filename of the
+    #    archive as well as the shared archive member, telling the
+    #    bitwidth (32 or 64) of that shared object, and providing the
+    #    list of exported symbols of that shared object, optionally
+    #    decorated with the 'weak' keyword
+    # *) the shared object with the F_LOADONLY flag set, to really avoid
+    #    it being seen by the linker.
+    # At run time we had better use the real file rather than another symlink,
+    # but for link time we create the symlink libNAME.so -> libNAME.so.V
+
+    case $with_aix_soname,$aix_use_runtimelinking in
+    # AIX (on Power*) has no versioning support, so currently we cannot hardcode correct
     # soname into executable. Probably we can add versioning support to
     # collect2, so additional links can be useful in future.
-    if test "$aix_use_runtimelinking" = yes; then
+    aix,yes) # traditional libtool
+      dynamic_linker='AIX unversionable lib.so'
       # If using run time linking (on AIX 4.2 or later) use lib<name>.so
       # instead of lib<name>.a to let people know that these are not
       # typical AIX shared libraries.
-      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    else
+      library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+      ;;
+    aix,no) # traditional AIX only
+      dynamic_linker='AIX lib.a[(]lib.so.V[)]'
       # We preserve .a as extension for shared libraries through AIX4.2
       # and later when we are not doing run time linking.
-      library_names_spec='${libname}${release}.a $libname.a'
-      soname_spec='${libname}${release}${shared_ext}$major'
-    fi
+      library_names_spec='$libname$release.a $libname.a'
+      soname_spec='$libname$release$shared_ext$major'
+      ;;
+    svr4,*) # full svr4 only
+      dynamic_linker="AIX lib.so.V[(]$shared_archive_member_spec.o[)]"
+      library_names_spec='$libname$release$shared_ext$major $libname$shared_ext'
+      # We do not specify a path in Import Files, so LIBPATH fires.
+      shlibpath_overrides_runpath=yes
+      ;;
+    *,yes) # both, prefer svr4
+      dynamic_linker="AIX lib.so.V[(]$shared_archive_member_spec.o[)], lib.a[(]lib.so.V[)]"
+      library_names_spec='$libname$release$shared_ext$major $libname$shared_ext'
+      # unpreferred sharedlib libNAME.a needs extra handling
+      postinstall_cmds='test -n "$linkname" || linkname="$realname"~func_stripname "" ".so" "$linkname"~$install_shared_prog "$dir/$func_stripname_result.$libext" "$destdir/$func_stripname_result.$libext"~test -z "$tstripme" || test -z "$striplib" || $striplib "$destdir/$func_stripname_result.$libext"'
+      postuninstall_cmds='for n in $library_names $old_library; do :; done~func_stripname "" ".so" "$n"~test "$func_stripname_result" = "$n" || func_append rmfiles " $odir/$func_stripname_result.$libext"'
+      # We do not specify a path in Import Files, so LIBPATH fires.
+      shlibpath_overrides_runpath=yes
+      ;;
+    *,no) # both, prefer aix
+      dynamic_linker="AIX lib.a[(]lib.so.V[)], lib.so.V[(]$shared_archive_member_spec.o[)]"
+      library_names_spec='$libname$release.a $libname.a'
+      soname_spec='$libname$release$shared_ext$major'
+      # unpreferred sharedlib libNAME.so.V and symlink libNAME.so need extra handling
+      postinstall_cmds='test -z "$dlname" || $install_shared_prog $dir/$dlname $destdir/$dlname~test -z "$tstripme" || test -z "$striplib" || $striplib $destdir/$dlname~test -n "$linkname" || linkname=$realname~func_stripname "" ".a" "$linkname"~(cd "$destdir" && $LN_S -f $dlname $func_stripname_result.so)'
+      postuninstall_cmds='test -z "$dlname" || func_append rmfiles " $odir/$dlname"~for n in $old_library $library_names; do :; done~func_stripname "" ".a" "$n"~func_append rmfiles " $odir/$func_stripname_result.so"'
+      ;;
+    esac
     shlibpath_var=LIBPATH
   fi
   ;;
@@ -2310,18 +2491,18 @@ amigaos*)
   powerpc)
     # Since July 2007 AmigaOS4 officially supports .so libraries.
     # When compiling the executable, add -use-dynld -Lsobjs: to the compileline.
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+    library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
     ;;
   m68k)
     library_names_spec='$libname.ixlibrary $libname.a'
     # Create ${libname}_ixlibrary.a entries in /sys/libs.
-    finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`func_echo_all "$lib" | $SED '\''s%^.*/\([[^/]]*\)\.ixlibrary$%\1%'\''`; test $RM /sys/libs/${libname}_ixlibrary.a; $show "cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a"; cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a || exit 1; done'
+    finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`func_echo_all "$lib" | $SED '\''s%^.*/\([[^/]]*\)\.ixlibrary$%\1%'\''`; $RM /sys/libs/${libname}_ixlibrary.a; $show "cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a"; cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a || exit 1; done'
     ;;
   esac
   ;;
 
 beos*)
-  library_names_spec='${libname}${shared_ext}'
+  library_names_spec='$libname$shared_ext'
   dynamic_linker="$host_os ld.so"
   shlibpath_var=LIBRARY_PATH
   ;;
@@ -2329,8 +2510,8 @@ beos*)
 bsdi[[45]]*)
   version_type=linux # correct to gnu/linux during the next big refactor
   need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+  soname_spec='$libname$release$shared_ext$major'
   finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
   shlibpath_var=LD_LIBRARY_PATH
   sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
@@ -2342,7 +2523,7 @@ bsdi[[45]]*)
 
 cygwin* | mingw* | pw32* | cegcc*)
   version_type=windows
-  shrext_cmds=".dll"
+  shrext_cmds=.dll
   need_version=no
   need_lib_prefix=no
 
@@ -2351,8 +2532,8 @@ cygwin* | mingw* | pw32* | cegcc*)
     # gcc
     library_names_spec='$libname.dll.a'
     # DLL is installed to $(libdir)/../bin by postinstall_cmds
-    postinstall_cmds='base_file=`basename \${file}`~
-      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
+    postinstall_cmds='base_file=`basename \$file`~
+      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\$base_file'\''i; echo \$dlname'\''`~
       dldir=$destdir/`dirname \$dlpath`~
       test -d \$dldir || mkdir -p \$dldir~
       $install_prog $dir/$dlname \$dldir/$dlname~
@@ -2368,17 +2549,17 @@ cygwin* | mingw* | pw32* | cegcc*)
     case $host_os in
     cygwin*)
       # Cygwin DLLs use 'cyg' prefix rather than 'lib'
-      soname_spec='`echo ${libname} | sed -e 's/^lib/cyg/'``echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext}'
+      soname_spec='`echo $libname | sed -e 's/^lib/cyg/'``echo $release | $SED -e 's/[[.]]/-/g'`$versuffix$shared_ext'
 m4_if([$1], [],[
       sys_lib_search_path_spec="$sys_lib_search_path_spec /usr/lib/w32api"])
       ;;
     mingw* | cegcc*)
       # MinGW DLLs use traditional 'lib' prefix
-      soname_spec='${libname}`echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext}'
+      soname_spec='$libname`echo $release | $SED -e 's/[[.]]/-/g'`$versuffix$shared_ext'
       ;;
     pw32*)
       # pw32 DLLs use 'pw' prefix rather than 'lib'
-      library_names_spec='`echo ${libname} | sed -e 's/^lib/pw/'``echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext}'
+      library_names_spec='`echo $libname | sed -e 's/^lib/pw/'``echo $release | $SED -e 's/[[.]]/-/g'`$versuffix$shared_ext'
       ;;
     esac
     dynamic_linker='Win32 ld.exe'
@@ -2387,8 +2568,8 @@ m4_if([$1], [],[
   *,cl*)
     # Native MSVC
     libname_spec='$name'
-    soname_spec='${libname}`echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext}'
-    library_names_spec='${libname}.dll.lib'
+    soname_spec='$libname`echo $release | $SED -e 's/[[.]]/-/g'`$versuffix$shared_ext'
+    library_names_spec='$libname.dll.lib'
 
     case $build_os in
     mingw*)
@@ -2415,7 +2596,7 @@ m4_if([$1], [],[
       sys_lib_search_path_spec=`cygpath --path --unix "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"`
       ;;
     *)
-      sys_lib_search_path_spec="$LIB"
+      sys_lib_search_path_spec=$LIB
       if $ECHO "$sys_lib_search_path_spec" | [$GREP ';[c-zC-Z]:/' >/dev/null]; then
         # It is most probably a Windows format PATH.
         sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e 's/;/ /g'`
@@ -2428,8 +2609,8 @@ m4_if([$1], [],[
     esac
 
     # DLL is installed to $(libdir)/../bin by postinstall_cmds
-    postinstall_cmds='base_file=`basename \${file}`~
-      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
+    postinstall_cmds='base_file=`basename \$file`~
+      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\$base_file'\''i; echo \$dlname'\''`~
       dldir=$destdir/`dirname \$dlpath`~
       test -d \$dldir || mkdir -p \$dldir~
       $install_prog $dir/$dlname \$dldir/$dlname'
@@ -2442,7 +2623,7 @@ m4_if([$1], [],[
 
   *)
     # Assume MSVC wrapper
-    library_names_spec='${libname}`echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext} $libname.lib'
+    library_names_spec='$libname`echo $release | $SED -e 's/[[.]]/-/g'`$versuffix$shared_ext $libname.lib'
     dynamic_linker='Win32 ld.exe'
     ;;
   esac
@@ -2455,8 +2636,8 @@ darwin* | rhapsody*)
   version_type=darwin
   need_lib_prefix=no
   need_version=no
-  library_names_spec='${libname}${release}${major}$shared_ext ${libname}$shared_ext'
-  soname_spec='${libname}${release}${major}$shared_ext'
+  library_names_spec='$libname$release$major$shared_ext $libname$shared_ext'
+  soname_spec='$libname$release$major$shared_ext'
   shlibpath_overrides_runpath=yes
   shlibpath_var=DYLD_LIBRARY_PATH
   shrext_cmds='`test .$module = .yes && echo .so || echo .dylib`'
@@ -2469,8 +2650,8 @@ dgux*)
   version_type=linux # correct to gnu/linux during the next big refactor
   need_lib_prefix=no
   need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname$shared_ext'
-  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+  soname_spec='$libname$release$shared_ext$major'
   shlibpath_var=LD_LIBRARY_PATH
   ;;
 
@@ -2488,12 +2669,13 @@ freebsd* | dragonfly*)
   version_type=freebsd-$objformat
   case $version_type in
     freebsd-elf*)
-      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
+      library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+      soname_spec='$libname$release$shared_ext$major'
       need_version=no
       need_lib_prefix=no
       ;;
     freebsd-*)
-      library_names_spec='${libname}${release}${shared_ext}$versuffix $libname${shared_ext}$versuffix'
+      library_names_spec='$libname$release$shared_ext$versuffix $libname$shared_ext$versuffix'
       need_version=yes
       ;;
   esac
@@ -2518,26 +2700,15 @@ freebsd* | dragonfly*)
   esac
   ;;
 
-gnu*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}${major} ${libname}${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-  hardcode_into_libs=yes
-  ;;
-
 haiku*)
   version_type=linux # correct to gnu/linux during the next big refactor
   need_lib_prefix=no
   need_version=no
   dynamic_linker="$host_os runtime_loader"
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}${major} ${libname}${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+  soname_spec='$libname$release$shared_ext$major'
   shlibpath_var=LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
+  shlibpath_overrides_runpath=no
   sys_lib_dlsearch_path_spec='/boot/home/config/lib /boot/common/lib /boot/system/lib'
   hardcode_into_libs=yes
   ;;
@@ -2555,14 +2726,15 @@ hpux9* | hpux10* | hpux11*)
     dynamic_linker="$host_os dld.so"
     shlibpath_var=LD_LIBRARY_PATH
     shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
-    if test "X$HPUX_IA64_MODE" = X32; then
+    library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+    soname_spec='$libname$release$shared_ext$major'
+    if test 32 = "$HPUX_IA64_MODE"; then
       sys_lib_search_path_spec="/usr/lib/hpux32 /usr/local/lib/hpux32 /usr/local/lib"
+      sys_lib_dlsearch_path_spec=/usr/lib/hpux32
     else
       sys_lib_search_path_spec="/usr/lib/hpux64 /usr/local/lib/hpux64"
+      sys_lib_dlsearch_path_spec=/usr/lib/hpux64
     fi
-    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
     ;;
   hppa*64*)
     shrext_cmds='.sl'
@@ -2570,8 +2742,8 @@ hpux9* | hpux10* | hpux11*)
     dynamic_linker="$host_os dld.sl"
     shlibpath_var=LD_LIBRARY_PATH # How should we handle SHLIB_PATH
     shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
+    library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+    soname_spec='$libname$release$shared_ext$major'
     sys_lib_search_path_spec="/usr/lib/pa20_64 /usr/ccs/lib/pa20_64"
     sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
     ;;
@@ -2580,8 +2752,8 @@ hpux9* | hpux10* | hpux11*)
     dynamic_linker="$host_os dld.sl"
     shlibpath_var=SHLIB_PATH
     shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
+    library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+    soname_spec='$libname$release$shared_ext$major'
     ;;
   esac
   # HP-UX runs *really* slowly unless shared libraries are mode 555, ...
@@ -2594,8 +2766,8 @@ interix[[3-9]]*)
   version_type=linux # correct to gnu/linux during the next big refactor
   need_lib_prefix=no
   need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+  soname_spec='$libname$release$shared_ext$major'
   dynamic_linker='Interix 3.x ld.so.1 (PE, like ELF)'
   shlibpath_var=LD_LIBRARY_PATH
   shlibpath_overrides_runpath=no
@@ -2606,7 +2778,7 @@ irix5* | irix6* | nonstopux*)
   case $host_os in
     nonstopux*) version_type=nonstopux ;;
     *)
-	if test "$lt_cv_prog_gnu_ld" = yes; then
+	if test yes = "$lt_cv_prog_gnu_ld"; then
 		version_type=linux # correct to gnu/linux during the next big refactor
 	else
 		version_type=irix
@@ -2614,8 +2786,8 @@ irix5* | irix6* | nonstopux*)
   esac
   need_lib_prefix=no
   need_version=no
-  soname_spec='${libname}${release}${shared_ext}$major'
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext} $libname${shared_ext}'
+  soname_spec='$libname$release$shared_ext$major'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$release$shared_ext $libname$shared_ext'
   case $host_os in
   irix5* | nonstopux*)
     libsuff= shlibsuff=
@@ -2634,8 +2806,8 @@ irix5* | irix6* | nonstopux*)
   esac
   shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
   shlibpath_overrides_runpath=no
-  sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
-  sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
+  sys_lib_search_path_spec="/usr/lib$libsuff /lib$libsuff /usr/local/lib$libsuff"
+  sys_lib_dlsearch_path_spec="/usr/lib$libsuff /lib$libsuff"
   hardcode_into_libs=yes
   ;;
 
@@ -2644,13 +2816,33 @@ linux*oldld* | linux*aout* | linux*coff*)
   dynamic_linker=no
   ;;
 
+linux*android*)
+  version_type=none # Android doesn't support versioned libraries.
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='$libname$release$shared_ext'
+  soname_spec='$libname$release$shared_ext'
+  finish_cmds=
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+
+  # This implies no fast_install, which is unacceptable.
+  # Some rework will be needed to allow for fast_install
+  # before this can be enabled.
+  hardcode_into_libs=yes
+
+  dynamic_linker='Android linker'
+  # Don't embed -rpath directories since the linker doesn't support them.
+  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
+  ;;
+
 # This must be glibc/ELF.
-linux* | k*bsd*-gnu | kopensolaris*-gnu)
+linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
   version_type=linux # correct to gnu/linux during the next big refactor
   need_lib_prefix=no
   need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+  soname_spec='$libname$release$shared_ext$major'
   finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
   shlibpath_var=LD_LIBRARY_PATH
   shlibpath_overrides_runpath=no
@@ -2678,11 +2870,15 @@ linux* | k*bsd*-gnu | kopensolaris*-gnu)
   # Add ABI-specific directories to the system library path.
   sys_lib_dlsearch_path_spec="/lib64 /usr/lib64 /lib /usr/lib"
 
-  # Append ld.so.conf contents to the search path
+  # Ideally, we could use ldconfig to report *all* directories which are
+  # searched for libraries, however this is still not possible.  Aside from not
+  # being certain /sbin/ldconfig is available, command
+  # 'ldconfig -N -X -v | grep ^/' on 64bit Fedora does not report /usr/lib64,
+  # even though it is searched at run-time.  Try to do the best guess by
+  # appending ld.so.conf contents (and includes) to the search path.
   if test -f /etc/ld.so.conf; then
     lt_ld_extra=`awk '/^include / { system(sprintf("cd /etc; cat %s 2>/dev/null", \[$]2)); skip = 1; } { if (!skip) print \[$]0; skip = 0; }' < /etc/ld.so.conf | $SED -e 's/#.*//;/^[	 ]*hwcap[	 ]/d;s/[:,	]/ /g;s/=[^=]*$//;s/=[^= ]* / /g;s/"//g;/^$/d' | tr '\n' ' '`
     sys_lib_dlsearch_path_spec="$sys_lib_dlsearch_path_spec $lt_ld_extra"
-
   fi
 
   # We used to test for /lib/ld.so.1 and disable shared libraries on
@@ -2699,12 +2895,12 @@ netbsd*)
   need_lib_prefix=no
   need_version=no
   if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+    library_names_spec='$libname$release$shared_ext$versuffix $libname$shared_ext$versuffix'
     finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
     dynamic_linker='NetBSD (a.out) ld.so'
   else
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
+    library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+    soname_spec='$libname$release$shared_ext$major'
     dynamic_linker='NetBSD ld.elf_so'
   fi
   shlibpath_var=LD_LIBRARY_PATH
@@ -2714,7 +2910,7 @@ netbsd*)
 
 newsos6)
   version_type=linux # correct to gnu/linux during the next big refactor
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
   shlibpath_var=LD_LIBRARY_PATH
   shlibpath_overrides_runpath=yes
   ;;
@@ -2723,58 +2919,68 @@ newsos6)
   version_type=qnx
   need_lib_prefix=no
   need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+  soname_spec='$libname$release$shared_ext$major'
   shlibpath_var=LD_LIBRARY_PATH
   shlibpath_overrides_runpath=no
   hardcode_into_libs=yes
   dynamic_linker='ldqnx.so'
   ;;
 
-openbsd*)
+openbsd* | bitrig*)
   version_type=sunos
-  sys_lib_dlsearch_path_spec="/usr/lib"
+  sys_lib_dlsearch_path_spec=/usr/lib
   need_lib_prefix=no
-  # Some older versions of OpenBSD (3.3 at least) *do* need versioned libs.
-  case $host_os in
-    openbsd3.3 | openbsd3.3.*)	need_version=yes ;;
-    *)				need_version=no  ;;
-  esac
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
-  finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
-    case $host_os in
-      openbsd2.[[89]] | openbsd2.[[89]].*)
-	shlibpath_overrides_runpath=no
-	;;
-      *)
-	shlibpath_overrides_runpath=yes
-	;;
-      esac
+  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`"; then
+    need_version=no
   else
-    shlibpath_overrides_runpath=yes
+    need_version=yes
   fi
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$shared_ext$versuffix'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
   ;;
 
 os2*)
   libname_spec='$name'
-  shrext_cmds=".dll"
+  version_type=windows
+  shrext_cmds=.dll
+  need_version=no
   need_lib_prefix=no
-  library_names_spec='$libname${shared_ext} $libname.a'
+  # OS/2 can only load a DLL with a base name of 8 characters or less.
+  soname_spec='`test -n "$os2dllname" && libname="$os2dllname";
+    v=$($ECHO $release$versuffix | tr -d .-);
+    n=$($ECHO $libname | cut -b -$((8 - ${#v})) | tr . _);
+    $ECHO $n$v`$shared_ext'
+  library_names_spec='${libname}_dll.$libext'
   dynamic_linker='OS/2 ld.exe'
-  shlibpath_var=LIBPATH
+  shlibpath_var=BEGINLIBPATH
+  sys_lib_search_path_spec="/lib /usr/lib /usr/local/lib"
+  sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
+  postinstall_cmds='base_file=`basename \$file`~
+    dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\$base_file'\''i; $ECHO \$dlname'\''`~
+    dldir=$destdir/`dirname \$dlpath`~
+    test -d \$dldir || mkdir -p \$dldir~
+    $install_prog $dir/$dlname \$dldir/$dlname~
+    chmod a+x \$dldir/$dlname~
+    if test -n '\''$stripme'\'' && test -n '\''$striplib'\''; then
+      eval '\''$striplib \$dldir/$dlname'\'' || exit \$?;
+    fi'
+  postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; $ECHO \$dlname'\''`~
+    dlpath=$dir/\$dldll~
+    $RM \$dlpath'
   ;;
 
 osf3* | osf4* | osf5*)
   version_type=osf
   need_lib_prefix=no
   need_version=no
-  soname_spec='${libname}${release}${shared_ext}$major'
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  soname_spec='$libname$release$shared_ext$major'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
   shlibpath_var=LD_LIBRARY_PATH
   sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
-  sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
+  sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
   ;;
 
 rdos*)
@@ -2785,8 +2991,8 @@ solaris*)
   version_type=linux # correct to gnu/linux during the next big refactor
   need_lib_prefix=no
   need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+  soname_spec='$libname$release$shared_ext$major'
   shlibpath_var=LD_LIBRARY_PATH
   shlibpath_overrides_runpath=yes
   hardcode_into_libs=yes
@@ -2796,11 +3002,11 @@ solaris*)
 
 sunos4*)
   version_type=sunos
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$shared_ext$versuffix'
   finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
   shlibpath_var=LD_LIBRARY_PATH
   shlibpath_overrides_runpath=yes
-  if test "$with_gnu_ld" = yes; then
+  if test yes = "$with_gnu_ld"; then
     need_lib_prefix=no
   fi
   need_version=yes
@@ -2808,8 +3014,8 @@ sunos4*)
 
 sysv4 | sysv4.3*)
   version_type=linux # correct to gnu/linux during the next big refactor
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+  soname_spec='$libname$release$shared_ext$major'
   shlibpath_var=LD_LIBRARY_PATH
   case $host_vendor in
     sni)
@@ -2830,24 +3036,24 @@ sysv4 | sysv4.3*)
   ;;
 
 sysv4*MP*)
-  if test -d /usr/nec ;then
+  if test -d /usr/nec; then
     version_type=linux # correct to gnu/linux during the next big refactor
-    library_names_spec='$libname${shared_ext}.$versuffix $libname${shared_ext}.$major $libname${shared_ext}'
-    soname_spec='$libname${shared_ext}.$major'
+    library_names_spec='$libname$shared_ext.$versuffix $libname$shared_ext.$major $libname$shared_ext'
+    soname_spec='$libname$shared_ext.$major'
     shlibpath_var=LD_LIBRARY_PATH
   fi
   ;;
 
 sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*)
-  version_type=freebsd-elf
+  version_type=sco
   need_lib_prefix=no
   need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext $libname$shared_ext'
+  soname_spec='$libname$release$shared_ext$major'
   shlibpath_var=LD_LIBRARY_PATH
   shlibpath_overrides_runpath=yes
   hardcode_into_libs=yes
-  if test "$with_gnu_ld" = yes; then
+  if test yes = "$with_gnu_ld"; then
     sys_lib_search_path_spec='/usr/local/lib /usr/gnu/lib /usr/ccs/lib /usr/lib /lib'
   else
     sys_lib_search_path_spec='/usr/ccs/lib /usr/lib'
@@ -2865,7 +3071,7 @@ tpf*)
   version_type=linux # correct to gnu/linux during the next big refactor
   need_lib_prefix=no
   need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
   shlibpath_var=LD_LIBRARY_PATH
   shlibpath_overrides_runpath=no
   hardcode_into_libs=yes
@@ -2873,8 +3079,8 @@ tpf*)
 
 uts4*)
   version_type=linux # correct to gnu/linux during the next big refactor
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
+  library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext'
+  soname_spec='$libname$release$shared_ext$major'
   shlibpath_var=LD_LIBRARY_PATH
   ;;
 
@@ -2883,20 +3089,30 @@ uts4*)
   ;;
 esac
 AC_MSG_RESULT([$dynamic_linker])
-test "$dynamic_linker" = no && can_build_shared=no
+test no = "$dynamic_linker" && can_build_shared=no
 
 variables_saved_for_relink="PATH $shlibpath_var $runpath_var"
-if test "$GCC" = yes; then
+if test yes = "$GCC"; then
   variables_saved_for_relink="$variables_saved_for_relink GCC_EXEC_PREFIX COMPILER_PATH LIBRARY_PATH"
 fi
 
-if test "${lt_cv_sys_lib_search_path_spec+set}" = set; then
-  sys_lib_search_path_spec="$lt_cv_sys_lib_search_path_spec"
+if test set = "${lt_cv_sys_lib_search_path_spec+set}"; then
+  sys_lib_search_path_spec=$lt_cv_sys_lib_search_path_spec
 fi
-if test "${lt_cv_sys_lib_dlsearch_path_spec+set}" = set; then
-  sys_lib_dlsearch_path_spec="$lt_cv_sys_lib_dlsearch_path_spec"
+
+if test set = "${lt_cv_sys_lib_dlsearch_path_spec+set}"; then
+  sys_lib_dlsearch_path_spec=$lt_cv_sys_lib_dlsearch_path_spec
 fi
 
+# remember unaugmented sys_lib_dlsearch_path content for libtool script decls...
+configure_time_dlsearch_path=$sys_lib_dlsearch_path_spec
+
+# ... but it needs LT_SYS_LIBRARY_PATH munging for other configure-time code
+func_munge_path_list sys_lib_dlsearch_path_spec "$LT_SYS_LIBRARY_PATH"
+
+# to be used as default LT_SYS_LIBRARY_PATH value in generated libtool
+configure_time_lt_sys_library_path=$LT_SYS_LIBRARY_PATH
+
 _LT_DECL([], [variables_saved_for_relink], [1],
     [Variables whose values should be saved in libtool wrapper scripts and
     restored at link time])
@@ -2929,39 +3145,41 @@ _LT_DECL([], [hardcode_into_libs], [0],
     [Whether we should hardcode library paths into libraries])
 _LT_DECL([], [sys_lib_search_path_spec], [2],
     [Compile-time system search path for libraries])
-_LT_DECL([], [sys_lib_dlsearch_path_spec], [2],
-    [Run-time system search path for libraries])
+_LT_DECL([sys_lib_dlsearch_path_spec], [configure_time_dlsearch_path], [2],
+    [Detected run-time system search path for libraries])
+_LT_DECL([], [configure_time_lt_sys_library_path], [2],
+    [Explicit LT_SYS_LIBRARY_PATH set during ./configure time])
 ])# _LT_SYS_DYNAMIC_LINKER
 
 
 # _LT_PATH_TOOL_PREFIX(TOOL)
 # --------------------------
-# find a file program which can recognize shared library
+# find a file program that can recognize shared library
 AC_DEFUN([_LT_PATH_TOOL_PREFIX],
 [m4_require([_LT_DECL_EGREP])dnl
 AC_MSG_CHECKING([for $1])
 AC_CACHE_VAL(lt_cv_path_MAGIC_CMD,
 [case $MAGIC_CMD in
 [[\\/*] |  ?:[\\/]*])
-  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
+  lt_cv_path_MAGIC_CMD=$MAGIC_CMD # Let the user override the test with a path.
   ;;
 *)
-  lt_save_MAGIC_CMD="$MAGIC_CMD"
-  lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+  lt_save_MAGIC_CMD=$MAGIC_CMD
+  lt_save_ifs=$IFS; IFS=$PATH_SEPARATOR
 dnl $ac_dummy forces splitting on constant user-supplied paths.
 dnl POSIX.2 word splitting is done only on the output of word expansions,
 dnl not every word.  This closes a longstanding sh security hole.
   ac_dummy="m4_if([$2], , $PATH, [$2])"
   for ac_dir in $ac_dummy; do
-    IFS="$lt_save_ifs"
+    IFS=$lt_save_ifs
     test -z "$ac_dir" && ac_dir=.
-    if test -f $ac_dir/$1; then
-      lt_cv_path_MAGIC_CMD="$ac_dir/$1"
+    if test -f "$ac_dir/$1"; then
+      lt_cv_path_MAGIC_CMD=$ac_dir/"$1"
       if test -n "$file_magic_test_file"; then
 	case $deplibs_check_method in
 	"file_magic "*)
 	  file_magic_regex=`expr "$deplibs_check_method" : "file_magic \(.*\)"`
-	  MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+	  MAGIC_CMD=$lt_cv_path_MAGIC_CMD
 	  if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
 	    $EGREP "$file_magic_regex" > /dev/null; then
 	    :
@@ -2984,11 +3202,11 @@ _LT_EOF
       break
     fi
   done
-  IFS="$lt_save_ifs"
-  MAGIC_CMD="$lt_save_MAGIC_CMD"
+  IFS=$lt_save_ifs
+  MAGIC_CMD=$lt_save_MAGIC_CMD
   ;;
 esac])
-MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+MAGIC_CMD=$lt_cv_path_MAGIC_CMD
 if test -n "$MAGIC_CMD"; then
   AC_MSG_RESULT($MAGIC_CMD)
 else
@@ -3006,7 +3224,7 @@ dnl AC_DEFUN([AC_PATH_TOOL_PREFIX], [])
 
 # _LT_PATH_MAGIC
 # --------------
-# find a file program which can recognize a shared library
+# find a file program that can recognize a shared library
 m4_defun([_LT_PATH_MAGIC],
 [_LT_PATH_TOOL_PREFIX(${ac_tool_prefix}file, /usr/bin$PATH_SEPARATOR$PATH)
 if test -z "$lt_cv_path_MAGIC_CMD"; then
@@ -3033,16 +3251,16 @@ m4_require([_LT_PROG_ECHO_BACKSLASH])dnl
 AC_ARG_WITH([gnu-ld],
     [AS_HELP_STRING([--with-gnu-ld],
 	[assume the C compiler uses GNU ld @<:@default=no@:>@])],
-    [test "$withval" = no || with_gnu_ld=yes],
+    [test no = "$withval" || with_gnu_ld=yes],
     [with_gnu_ld=no])dnl
 
 ac_prog=ld
-if test "$GCC" = yes; then
+if test yes = "$GCC"; then
   # Check if gcc -print-prog-name=ld gives a path.
   AC_MSG_CHECKING([for ld used by $CC])
   case $host in
   *-*-mingw*)
-    # gcc leaves a trailing carriage return which upsets mingw
+    # gcc leaves a trailing carriage return, which upsets mingw
     ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
   *)
     ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
@@ -3056,7 +3274,7 @@ if test "$GCC" = yes; then
       while $ECHO "$ac_prog" | $GREP "$re_direlt" > /dev/null 2>&1; do
 	ac_prog=`$ECHO $ac_prog| $SED "s%$re_direlt%/%"`
       done
-      test -z "$LD" && LD="$ac_prog"
+      test -z "$LD" && LD=$ac_prog
       ;;
   "")
     # If it fails, then pretend we aren't using GCC.
@@ -3067,37 +3285,37 @@ if test "$GCC" = yes; then
     with_gnu_ld=unknown
     ;;
   esac
-elif test "$with_gnu_ld" = yes; then
+elif test yes = "$with_gnu_ld"; then
   AC_MSG_CHECKING([for GNU ld])
 else
   AC_MSG_CHECKING([for non-GNU ld])
 fi
 AC_CACHE_VAL(lt_cv_path_LD,
 [if test -z "$LD"; then
-  lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+  lt_save_ifs=$IFS; IFS=$PATH_SEPARATOR
   for ac_dir in $PATH; do
-    IFS="$lt_save_ifs"
+    IFS=$lt_save_ifs
     test -z "$ac_dir" && ac_dir=.
     if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
-      lt_cv_path_LD="$ac_dir/$ac_prog"
+      lt_cv_path_LD=$ac_dir/$ac_prog
       # Check to see if the program is GNU ld.  I'd rather use --version,
       # but apparently some variants of GNU ld only accept -v.
       # Break only if it was the GNU/non-GNU ld that we prefer.
       case `"$lt_cv_path_LD" -v 2>&1 </dev/null` in
       *GNU* | *'with BFD'*)
-	test "$with_gnu_ld" != no && break
+	test no != "$with_gnu_ld" && break
 	;;
       *)
-	test "$with_gnu_ld" != yes && break
+	test yes != "$with_gnu_ld" && break
 	;;
       esac
     fi
   done
-  IFS="$lt_save_ifs"
+  IFS=$lt_save_ifs
 else
-  lt_cv_path_LD="$LD" # Let the user override the test with a path.
+  lt_cv_path_LD=$LD # Let the user override the test with a path.
 fi])
-LD="$lt_cv_path_LD"
+LD=$lt_cv_path_LD
 if test -n "$LD"; then
   AC_MSG_RESULT($LD)
 else
@@ -3151,13 +3369,13 @@ esac
 reload_cmds='$LD$reload_flag -o $output$reload_objs'
 case $host_os in
   cygwin* | mingw* | pw32* | cegcc*)
-    if test "$GCC" != yes; then
+    if test yes != "$GCC"; then
       reload_cmds=false
     fi
     ;;
   darwin*)
-    if test "$GCC" = yes; then
-      reload_cmds='$LTCC $LTCFLAGS -nostdlib ${wl}-r -o $output$reload_objs'
+    if test yes = "$GCC"; then
+      reload_cmds='$LTCC $LTCFLAGS -nostdlib $wl-r -o $output$reload_objs'
     else
       reload_cmds='$LD$reload_flag -o $output$reload_objs'
     fi
@@ -3168,6 +3386,43 @@ _LT_TAGDECL([], [reload_cmds], [2])dnl
 ])# _LT_CMD_RELOAD
 
 
+# _LT_PATH_DD
+# -----------
+# find a working dd: one that, given bs=32 count=1 on a 64-byte input, emits exactly the first 32 bytes; the path found is cached in ac_cv_path_lt_DD
+m4_defun([_LT_PATH_DD],
+[AC_CACHE_CHECK([for a working dd], [ac_cv_path_lt_DD],
+[printf 0123456789abcdef0123456789abcdef >conftest.i
+cat conftest.i conftest.i >conftest2.i
+: ${lt_DD:=$DD}
+AC_PATH_PROGS_FEATURE_CHECK([lt_DD], [dd],
+[if "$ac_path_lt_DD" bs=32 count=1 <conftest2.i >conftest.out 2>/dev/null; then
+  cmp -s conftest.i conftest.out \
+  && ac_cv_path_lt_DD="$ac_path_lt_DD" ac_path_lt_DD_found=:
+fi])
+rm -f conftest.i conftest2.i conftest.out])
+])# _LT_PATH_DD
+
+
+# _LT_CMD_TRUNCATE
+# ----------------
+# find command to truncate a binary pipe: the dd located by _LT_PATH_DD with 'bs=4096 count=1' if it passes a 32-byte truncation check, else 'SED -e 4q'
+m4_defun([_LT_CMD_TRUNCATE],
+[m4_require([_LT_PATH_DD])
+AC_CACHE_CHECK([how to truncate binary pipes], [lt_cv_truncate_bin],
+[printf 0123456789abcdef0123456789abcdef >conftest.i
+cat conftest.i conftest.i >conftest2.i
+lt_cv_truncate_bin=
+if "$ac_cv_path_lt_DD" bs=32 count=1 <conftest2.i >conftest.out 2>/dev/null; then
+  cmp -s conftest.i conftest.out \
+  && lt_cv_truncate_bin="$ac_cv_path_lt_DD bs=4096 count=1"
+fi
+rm -f conftest.i conftest2.i conftest.out
+test -z "$lt_cv_truncate_bin" && lt_cv_truncate_bin="$SED -e 4q"])
+_LT_DECL([lt_truncate_bin], [lt_cv_truncate_bin], [1],
+  [Command to truncate a binary pipe])
+])# _LT_CMD_TRUNCATE
+
+
 # _LT_CHECK_MAGIC_METHOD
 # ----------------------
 # how to check for library dependencies
@@ -3183,13 +3438,13 @@ lt_cv_deplibs_check_method='unknown'
 # Need to set the preceding variable on all platforms that support
 # interlibrary dependencies.
 # 'none' -- dependencies not supported.
-# `unknown' -- same as none, but documents that we really don't know.
+# 'unknown' -- same as none, but documents that we really don't know.
 # 'pass_all' -- all dependencies passed with no checks.
 # 'test_compile' -- check by making test program.
 # 'file_magic [[regex]]' -- check by looking for files in library path
-# which responds to the $file_magic_cmd with a given extended regex.
-# If you have `file' or equivalent on your system and you're not sure
-# whether `pass_all' will *always* work, you probably want this one.
+# that responds to the $file_magic_cmd with a given extended regex.
+# If you have 'file' or equivalent on your system and you're not sure
+# whether 'pass_all' will *always* work, you probably want this one.
 
 case $host_os in
 aix[[4-9]]*)
@@ -3216,8 +3471,7 @@ mingw* | pw32*)
   # Base MSYS/MinGW do not provide the 'file' command needed by
   # func_win32_libid shell function, so use a weaker test based on 'objdump',
   # unless we find 'file', for example because we are cross-compiling.
-  # func_win32_libid assumes BSD nm, so disallow it if using MS dumpbin.
-  if ( test "$lt_cv_nm_interface" = "BSD nm" && file / ) >/dev/null 2>&1; then
+  if ( file / ) >/dev/null 2>&1; then
     lt_cv_deplibs_check_method='file_magic ^x86 archive import|^x86 DLL'
     lt_cv_file_magic_cmd='func_win32_libid'
   else
@@ -3253,10 +3507,6 @@ freebsd* | dragonfly*)
   fi
   ;;
 
-gnu*)
-  lt_cv_deplibs_check_method=pass_all
-  ;;
-
 haiku*)
   lt_cv_deplibs_check_method=pass_all
   ;;
@@ -3295,7 +3545,7 @@ irix5* | irix6* | nonstopux*)
   ;;
 
 # This must be glibc/ELF.
-linux* | k*bsd*-gnu | kopensolaris*-gnu)
+linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
   lt_cv_deplibs_check_method=pass_all
   ;;
 
@@ -3317,8 +3567,8 @@ newos6*)
   lt_cv_deplibs_check_method=pass_all
   ;;
 
-openbsd*)
-  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+openbsd* | bitrig*)
+  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`"; then
     lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so\.[[0-9]]+\.[[0-9]]+|\.so|_pic\.a)$'
   else
     lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so\.[[0-9]]+\.[[0-9]]+|_pic\.a)$'
@@ -3371,6 +3621,9 @@ sysv4 | sysv4.3*)
 tpf*)
   lt_cv_deplibs_check_method=pass_all
   ;;
+os2*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
 esac
 ])
 
@@ -3411,33 +3664,38 @@ AC_DEFUN([LT_PATH_NM],
 AC_CACHE_CHECK([for BSD- or MS-compatible name lister (nm)], lt_cv_path_NM,
 [if test -n "$NM"; then
   # Let the user override the test.
-  lt_cv_path_NM="$NM"
+  lt_cv_path_NM=$NM
 else
-  lt_nm_to_check="${ac_tool_prefix}nm"
+  lt_nm_to_check=${ac_tool_prefix}nm
   if test -n "$ac_tool_prefix" && test "$build" = "$host"; then
     lt_nm_to_check="$lt_nm_to_check nm"
   fi
   for lt_tmp_nm in $lt_nm_to_check; do
-    lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+    lt_save_ifs=$IFS; IFS=$PATH_SEPARATOR
     for ac_dir in $PATH /usr/ccs/bin/elf /usr/ccs/bin /usr/ucb /bin; do
-      IFS="$lt_save_ifs"
+      IFS=$lt_save_ifs
       test -z "$ac_dir" && ac_dir=.
-      tmp_nm="$ac_dir/$lt_tmp_nm"
-      if test -f "$tmp_nm" || test -f "$tmp_nm$ac_exeext" ; then
+      tmp_nm=$ac_dir/$lt_tmp_nm
+      if test -f "$tmp_nm" || test -f "$tmp_nm$ac_exeext"; then
 	# Check to see if the nm accepts a BSD-compat flag.
-	# Adding the `sed 1q' prevents false positives on HP-UX, which says:
+	# Adding the 'sed 1q' prevents false positives on HP-UX, which says:
 	#   nm: unknown option "B" ignored
 	# Tru64's nm complains that /dev/null is an invalid object file
-	case `"$tmp_nm" -B /dev/null 2>&1 | sed '1q'` in
-	*/dev/null* | *'Invalid file or object type'*)
+	# MSYS converts /dev/null to NUL, MinGW nm treats NUL as empty
+	case $build_os in
+	mingw*) lt_bad_file=conftest.nm/nofile ;;
+	*) lt_bad_file=/dev/null ;;
+	esac
+	case `"$tmp_nm" -B $lt_bad_file 2>&1 | sed '1q'` in
+	*$lt_bad_file* | *'Invalid file or object type'*)
 	  lt_cv_path_NM="$tmp_nm -B"
-	  break
+	  break 2
 	  ;;
 	*)
 	  case `"$tmp_nm" -p /dev/null 2>&1 | sed '1q'` in
 	  */dev/null*)
 	    lt_cv_path_NM="$tmp_nm -p"
-	    break
+	    break 2
 	    ;;
 	  *)
 	    lt_cv_path_NM=${lt_cv_path_NM="$tmp_nm"} # keep the first match, but
@@ -3448,21 +3706,21 @@ else
 	esac
       fi
     done
-    IFS="$lt_save_ifs"
+    IFS=$lt_save_ifs
   done
   : ${lt_cv_path_NM=no}
 fi])
-if test "$lt_cv_path_NM" != "no"; then
-  NM="$lt_cv_path_NM"
+if test no != "$lt_cv_path_NM"; then
+  NM=$lt_cv_path_NM
 else
   # Didn't find any BSD compatible name lister, look for dumpbin.
   if test -n "$DUMPBIN"; then :
     # Let the user override the test.
   else
     AC_CHECK_TOOLS(DUMPBIN, [dumpbin "link -dump"], :)
-    case `$DUMPBIN -symbols /dev/null 2>&1 | sed '1q'` in
+    case `$DUMPBIN -symbols -headers /dev/null 2>&1 | sed '1q'` in
     *COFF*)
-      DUMPBIN="$DUMPBIN -symbols"
+      DUMPBIN="$DUMPBIN -symbols -headers"
       ;;
     *)
       DUMPBIN=:
@@ -3470,8 +3728,8 @@ else
     esac
   fi
   AC_SUBST([DUMPBIN])
-  if test "$DUMPBIN" != ":"; then
-    NM="$DUMPBIN"
+  if test : != "$DUMPBIN"; then
+    NM=$DUMPBIN
   fi
 fi
 test -z "$NM" && NM=nm
@@ -3517,8 +3775,8 @@ lt_cv_sharedlib_from_linklib_cmd,
 
 case $host_os in
 cygwin* | mingw* | pw32* | cegcc*)
-  # two different shell functions defined in ltmain.sh
-  # decide which to use based on capabilities of $DLLTOOL
+  # two different shell functions defined in ltmain.sh;
+  # decide which one to use based on capabilities of $DLLTOOL
   case `$DLLTOOL --help 2>&1` in
   *--identify-strict*)
     lt_cv_sharedlib_from_linklib_cmd=func_cygming_dll_for_implib
@@ -3530,7 +3788,7 @@ cygwin* | mingw* | pw32* | cegcc*)
   ;;
 *)
   # fallback: assume linklib IS sharedlib
-  lt_cv_sharedlib_from_linklib_cmd="$ECHO"
+  lt_cv_sharedlib_from_linklib_cmd=$ECHO
   ;;
 esac
 ])
@@ -3557,13 +3815,28 @@ AC_CACHE_CHECK([if $MANIFEST_TOOL is a manifest tool], [lt_cv_path_mainfest_tool
     lt_cv_path_mainfest_tool=yes
   fi
   rm -f conftest*])
-if test "x$lt_cv_path_mainfest_tool" != xyes; then
+if test yes != "$lt_cv_path_mainfest_tool"; then
   MANIFEST_TOOL=:
 fi
 _LT_DECL([], [MANIFEST_TOOL], [1], [Manifest tool])dnl
 ])# _LT_PATH_MANIFEST_TOOL
 
 
+# _LT_DLL_DEF_P([FILE])
+# ---------------------
+# True iff FILE is a Windows DLL '.def' file (first real line begins with EXPORTS or LIBRARY).
+# Keep in sync with func_dll_def_p in the libtool script
+AC_DEFUN([_LT_DLL_DEF_P],
+[dnl
+  test DEF = "`$SED -n dnl
+    -e '\''s/^[[	 ]]*//'\'' dnl Strip leading whitespace
+    -e '\''/^\(;.*\)*$/d'\'' dnl      Delete empty lines and comments
+    -e '\''s/^\(EXPORTS\|LIBRARY\)\([[	 ]].*\)*$/DEF/p'\'' dnl
+    -e q dnl                          Only consider the first "real" line
+    $1`" dnl
+])# _LT_DLL_DEF_P
+
+
 # LT_LIB_M
 # --------
 # check for math library
@@ -3575,11 +3848,11 @@ case $host in
   # These system don't have libm, or don't need it
   ;;
 *-ncr-sysv4.3*)
-  AC_CHECK_LIB(mw, _mwvalidcheckl, LIBM="-lmw")
+  AC_CHECK_LIB(mw, _mwvalidcheckl, LIBM=-lmw)
   AC_CHECK_LIB(m, cos, LIBM="$LIBM -lm")
   ;;
 *)
-  AC_CHECK_LIB(m, cos, LIBM="-lm")
+  AC_CHECK_LIB(m, cos, LIBM=-lm)
   ;;
 esac
 AC_SUBST([LIBM])
@@ -3598,7 +3871,7 @@ m4_defun([_LT_COMPILER_NO_RTTI],
 
 _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=
 
-if test "$GCC" = yes; then
+if test yes = "$GCC"; then
   case $cc_basename in
   nvcc*)
     _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=' -Xcompiler -fno-builtin' ;;
@@ -3650,7 +3923,7 @@ cygwin* | mingw* | pw32* | cegcc*)
   symcode='[[ABCDGISTW]]'
   ;;
 hpux*)
-  if test "$host_cpu" = ia64; then
+  if test ia64 = "$host_cpu"; then
     symcode='[[ABCDEGRST]]'
   fi
   ;;
@@ -3683,14 +3956,44 @@ case `$NM -V 2>&1` in
   symcode='[[ABCDGIRSTW]]' ;;
 esac
 
+if test "$lt_cv_nm_interface" = "MS dumpbin"; then
+  # Gets list of data symbols to import.
+  lt_cv_sys_global_symbol_to_import="sed -n -e 's/^I .* \(.*\)$/\1/p'"
+  # Adjust the below global symbol transforms to fixup imported variables.
+  lt_cdecl_hook=" -e 's/^I .* \(.*\)$/extern __declspec(dllimport) char \1;/p'"
+  lt_c_name_hook=" -e 's/^I .* \(.*\)$/  {\"\1\", (void *) 0},/p'"
+  lt_c_name_lib_hook="\
+  -e 's/^I .* \(lib.*\)$/  {\"\1\", (void *) 0},/p'\
+  -e 's/^I .* \(.*\)$/  {\"lib\1\", (void *) 0},/p'"
+else
+  # Disable hooks by default.
+  lt_cv_sys_global_symbol_to_import=
+  lt_cdecl_hook=
+  lt_c_name_hook=
+  lt_c_name_lib_hook=
+fi
+
 # Transform an extracted symbol line into a proper C declaration.
 # Some systems (esp. on ia64) link data and code symbols differently,
 # so use this general approach.
-lt_cv_sys_global_symbol_to_cdecl="sed -n -e 's/^T .* \(.*\)$/extern int \1();/p' -e 's/^$symcode* .* \(.*\)$/extern char \1;/p'"
+lt_cv_sys_global_symbol_to_cdecl="sed -n"\
+$lt_cdecl_hook\
+" -e 's/^T .* \(.*\)$/extern int \1();/p'"\
+" -e 's/^$symcode$symcode* .* \(.*\)$/extern char \1;/p'"
 
 # Transform an extracted symbol line into symbol name and symbol address
-lt_cv_sys_global_symbol_to_c_name_address="sed -n -e 's/^: \([[^ ]]*\)[[ ]]*$/  {\\\"\1\\\", (void *) 0},/p' -e 's/^$symcode* \([[^ ]]*\) \([[^ ]]*\)$/  {\"\2\", (void *) \&\2},/p'"
-lt_cv_sys_global_symbol_to_c_name_address_lib_prefix="sed -n -e 's/^: \([[^ ]]*\)[[ ]]*$/  {\\\"\1\\\", (void *) 0},/p' -e 's/^$symcode* \([[^ ]]*\) \(lib[[^ ]]*\)$/  {\"\2\", (void *) \&\2},/p' -e 's/^$symcode* \([[^ ]]*\) \([[^ ]]*\)$/  {\"lib\2\", (void *) \&\2},/p'"
+lt_cv_sys_global_symbol_to_c_name_address="sed -n"\
+$lt_c_name_hook\
+" -e 's/^: \(.*\) .*$/  {\"\1\", (void *) 0},/p'"\
+" -e 's/^$symcode$symcode* .* \(.*\)$/  {\"\1\", (void *) \&\1},/p'"
+
+# Transform an extracted symbol line into symbol name with lib prefix and
+# symbol address.
+lt_cv_sys_global_symbol_to_c_name_address_lib_prefix="sed -n"\
+$lt_c_name_lib_hook\
+" -e 's/^: \(.*\) .*$/  {\"\1\", (void *) 0},/p'"\
+" -e 's/^$symcode$symcode* .* \(lib.*\)$/  {\"\1\", (void *) \&\1},/p'"\
+" -e 's/^$symcode$symcode* .* \(.*\)$/  {\"lib\1\", (void *) \&\1},/p'"
 
 # Handle CRLF in mingw tool chain
 opt_cr=
@@ -3708,21 +4011,24 @@ for ac_symprfx in "" "_"; do
 
   # Write the raw and C identifiers.
   if test "$lt_cv_nm_interface" = "MS dumpbin"; then
-    # Fake it for dumpbin and say T for any non-static function
-    # and D for any global variable.
+    # Fake it for dumpbin and say T for any non-static function,
+    # D for any global variable and I for any imported variable.
     # Also find C++ and __fastcall symbols from MSVC++,
     # which start with @ or ?.
     lt_cv_sys_global_symbol_pipe="$AWK ['"\
 "     {last_section=section; section=\$ 3};"\
 "     /^COFF SYMBOL TABLE/{for(i in hide) delete hide[i]};"\
 "     /Section length .*#relocs.*(pick any)/{hide[last_section]=1};"\
+"     /^ *Symbol name *: /{split(\$ 0,sn,\":\"); si=substr(sn[2],2)};"\
+"     /^ *Type *: code/{print \"T\",si,substr(si,length(prfx))};"\
+"     /^ *Type *: data/{print \"I\",si,substr(si,length(prfx))};"\
 "     \$ 0!~/External *\|/{next};"\
 "     / 0+ UNDEF /{next}; / UNDEF \([^|]\)*()/{next};"\
 "     {if(hide[section]) next};"\
-"     {f=0}; \$ 0~/\(\).*\|/{f=1}; {printf f ? \"T \" : \"D \"};"\
-"     {split(\$ 0, a, /\||\r/); split(a[2], s)};"\
-"     s[1]~/^[@?]/{print s[1], s[1]; next};"\
-"     s[1]~prfx {split(s[1],t,\"@\"); print t[1], substr(t[1],length(prfx))}"\
+"     {f=\"D\"}; \$ 0~/\(\).*\|/{f=\"T\"};"\
+"     {split(\$ 0,a,/\||\r/); split(a[2],s)};"\
+"     s[1]~/^[@?]/{print f,s[1],s[1]; next};"\
+"     s[1]~prfx {split(s[1],t,\"@\"); print f,t[1],substr(t[1],length(prfx))}"\
 "     ' prfx=^$ac_symprfx]"
   else
     lt_cv_sys_global_symbol_pipe="sed -n -e 's/^.*[[	 ]]\($symcode$symcode*\)[[	 ]][[	 ]]*$ac_symprfx$sympat$opt_cr$/$symxfrm/p'"
@@ -3762,11 +4068,11 @@ _LT_EOF
 	if $GREP ' nm_test_func$' "$nlist" >/dev/null; then
 	  cat <<_LT_EOF > conftest.$ac_ext
 /* Keep this code in sync between libtool.m4, ltmain, lt_system.h, and tests.  */
-#if defined(_WIN32) || defined(__CYGWIN__) || defined(_WIN32_WCE)
-/* DATA imports from DLLs on WIN32 con't be const, because runtime
+#if defined _WIN32 || defined __CYGWIN__ || defined _WIN32_WCE
+/* DATA imports from DLLs on WIN32 can't be const, because runtime
    relocations are performed -- see ld's documentation on pseudo-relocs.  */
 # define LT@&t@_DLSYM_CONST
-#elif defined(__osf__)
+#elif defined __osf__
 /* This system does not cope well with relocations in const data.  */
 # define LT@&t@_DLSYM_CONST
 #else
@@ -3792,7 +4098,7 @@ lt__PROGRAM__LTX_preloaded_symbols[[]] =
 {
   { "@PROGRAM@", (void *) 0 },
 _LT_EOF
-	  $SED "s/^$symcode$symcode* \(.*\) \(.*\)$/  {\"\2\", (void *) \&\2},/" < "$nlist" | $GREP -v main >> conftest.$ac_ext
+	  $SED "s/^$symcode$symcode* .* \(.*\)$/  {\"\1\", (void *) \&\1},/" < "$nlist" | $GREP -v main >> conftest.$ac_ext
 	  cat <<\_LT_EOF >> conftest.$ac_ext
   {0, (void *) 0}
 };
@@ -3812,9 +4118,9 @@ _LT_EOF
 	  mv conftest.$ac_objext conftstm.$ac_objext
 	  lt_globsym_save_LIBS=$LIBS
 	  lt_globsym_save_CFLAGS=$CFLAGS
-	  LIBS="conftstm.$ac_objext"
+	  LIBS=conftstm.$ac_objext
 	  CFLAGS="$CFLAGS$_LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)"
-	  if AC_TRY_EVAL(ac_link) && test -s conftest${ac_exeext}; then
+	  if AC_TRY_EVAL(ac_link) && test -s conftest$ac_exeext; then
 	    pipe_works=yes
 	  fi
 	  LIBS=$lt_globsym_save_LIBS
@@ -3835,7 +4141,7 @@ _LT_EOF
   rm -rf conftest* conftst*
 
   # Do not use the global_symbol_pipe unless it works.
-  if test "$pipe_works" = yes; then
+  if test yes = "$pipe_works"; then
     break
   else
     lt_cv_sys_global_symbol_pipe=
@@ -3862,12 +4168,16 @@ _LT_DECL([global_symbol_pipe], [lt_cv_sys_global_symbol_pipe], [1],
     [Take the output of nm and produce a listing of raw symbols and C names])
 _LT_DECL([global_symbol_to_cdecl], [lt_cv_sys_global_symbol_to_cdecl], [1],
     [Transform the output of nm in a proper C declaration])
+_LT_DECL([global_symbol_to_import], [lt_cv_sys_global_symbol_to_import], [1],
+    [Transform the output of nm into a list of symbols to manually relocate])
 _LT_DECL([global_symbol_to_c_name_address],
     [lt_cv_sys_global_symbol_to_c_name_address], [1],
     [Transform the output of nm in a C name address pair])
 _LT_DECL([global_symbol_to_c_name_address_lib_prefix],
     [lt_cv_sys_global_symbol_to_c_name_address_lib_prefix], [1],
     [Transform the output of nm in a C name address pair when lib prefix is needed])
+_LT_DECL([nm_interface], [lt_cv_nm_interface], [1],
+    [The name lister interface])
 _LT_DECL([], [nm_file_list_spec], [1],
     [Specify filename containing input files for $NM])
 ]) # _LT_CMD_GLOBAL_SYMBOLS
@@ -3883,17 +4193,18 @@ _LT_TAGVAR(lt_prog_compiler_static, $1)=
 
 m4_if([$1], [CXX], [
   # C++ specific cases for pic, static, wl, etc.
-  if test "$GXX" = yes; then
+  if test yes = "$GXX"; then
     _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
     _LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
 
     case $host_os in
     aix*)
       # All AIX code is PIC.
-      if test "$host_cpu" = ia64; then
+      if test ia64 = "$host_cpu"; then
 	# AIX 5 now supports IA64 processor
 	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
       fi
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
       ;;
 
     amigaos*)
@@ -3904,8 +4215,8 @@ m4_if([$1], [CXX], [
         ;;
       m68k)
             # FIXME: we need at least 68020 code to build shared libraries, but
-            # adding the `-m68020' flag to GCC prevents building anything better,
-            # like `-m68040'.
+            # adding the '-m68020' flag to GCC prevents building anything better,
+            # like '-m68040'.
             _LT_TAGVAR(lt_prog_compiler_pic, $1)='-m68020 -resident32 -malways-restore-a4'
         ;;
       esac
@@ -3921,6 +4232,11 @@ m4_if([$1], [CXX], [
       # (--disable-auto-import) libraries
       m4_if([$1], [GCJ], [],
 	[_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT'])
+      case $host_os in
+      os2*)
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='$wl-static'
+	;;
+      esac
       ;;
     darwin* | rhapsody*)
       # PIC is the default on this platform
@@ -3970,7 +4286,7 @@ m4_if([$1], [CXX], [
     case $host_os in
       aix[[4-9]]*)
 	# All AIX code is PIC.
-	if test "$host_cpu" = ia64; then
+	if test ia64 = "$host_cpu"; then
 	  # AIX 5 now supports IA64 processor
 	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
 	else
@@ -4011,14 +4327,14 @@ m4_if([$1], [CXX], [
 	case $cc_basename in
 	  CC*)
 	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='${wl}-a ${wl}archive'
-	    if test "$host_cpu" != ia64; then
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='$wl-a ${wl}archive'
+	    if test ia64 != "$host_cpu"; then
 	      _LT_TAGVAR(lt_prog_compiler_pic, $1)='+Z'
 	    fi
 	    ;;
 	  aCC*)
 	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='${wl}-a ${wl}archive'
+	    _LT_TAGVAR(lt_prog_compiler_static, $1)='$wl-a ${wl}archive'
 	    case $host_cpu in
 	    hppa*64*|ia64*)
 	      # +Z the default
@@ -4047,7 +4363,7 @@ m4_if([$1], [CXX], [
 	    ;;
 	esac
 	;;
-      linux* | k*bsd*-gnu | kopensolaris*-gnu)
+      linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
 	case $cc_basename in
 	  KCC*)
 	    # KAI C++ Compiler
@@ -4055,7 +4371,7 @@ m4_if([$1], [CXX], [
 	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
 	    ;;
 	  ecpc* )
-	    # old Intel C++ for x86_64 which still supported -KPIC.
+	    # old Intel C++ for x86_64, which still supported -KPIC.
 	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
 	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
 	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
@@ -4200,17 +4516,18 @@ m4_if([$1], [CXX], [
   fi
 ],
 [
-  if test "$GCC" = yes; then
+  if test yes = "$GCC"; then
     _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
     _LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
 
     case $host_os in
       aix*)
       # All AIX code is PIC.
-      if test "$host_cpu" = ia64; then
+      if test ia64 = "$host_cpu"; then
 	# AIX 5 now supports IA64 processor
 	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
       fi
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
       ;;
 
     amigaos*)
@@ -4221,8 +4538,8 @@ m4_if([$1], [CXX], [
         ;;
       m68k)
             # FIXME: we need at least 68020 code to build shared libraries, but
-            # adding the `-m68020' flag to GCC prevents building anything better,
-            # like `-m68040'.
+            # adding the '-m68020' flag to GCC prevents building anything better,
+            # like '-m68040'.
             _LT_TAGVAR(lt_prog_compiler_pic, $1)='-m68020 -resident32 -malways-restore-a4'
         ;;
       esac
@@ -4239,6 +4556,11 @@ m4_if([$1], [CXX], [
       # (--disable-auto-import) libraries
       m4_if([$1], [GCJ], [],
 	[_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT'])
+      case $host_os in
+      os2*)
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='$wl-static'
+	;;
+      esac
       ;;
 
     darwin* | rhapsody*)
@@ -4309,7 +4631,7 @@ m4_if([$1], [CXX], [
     case $host_os in
     aix*)
       _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-      if test "$host_cpu" = ia64; then
+      if test ia64 = "$host_cpu"; then
 	# AIX 5 now supports IA64 processor
 	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
       else
@@ -4317,11 +4639,30 @@ m4_if([$1], [CXX], [
       fi
       ;;
 
+    darwin* | rhapsody*)
+      # PIC is the default on this platform
+      # Common symbols not allowed in MH_DYLIB files
+      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fno-common'
+      case $cc_basename in
+      nagfor*)
+        # NAG Fortran compiler
+        _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,-Wl,,'
+        _LT_TAGVAR(lt_prog_compiler_pic, $1)='-PIC'
+        _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
+        ;;
+      esac
+      ;;
+
     mingw* | cygwin* | pw32* | os2* | cegcc*)
       # This hack is so that the source file can tell whether it is being
       # built for inclusion in a dll (and should export symbols for example).
       m4_if([$1], [GCJ], [],
 	[_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT'])
+      case $host_os in
+      os2*)
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='$wl-static'
+	;;
+      esac
       ;;
 
     hpux9* | hpux10* | hpux11*)
@@ -4337,7 +4678,7 @@ m4_if([$1], [CXX], [
 	;;
       esac
       # Is there a better lt_prog_compiler_static that works with the bundled CC?
-      _LT_TAGVAR(lt_prog_compiler_static, $1)='${wl}-a ${wl}archive'
+      _LT_TAGVAR(lt_prog_compiler_static, $1)='$wl-a ${wl}archive'
       ;;
 
     irix5* | irix6* | nonstopux*)
@@ -4346,9 +4687,9 @@ m4_if([$1], [CXX], [
       _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
       ;;
 
-    linux* | k*bsd*-gnu | kopensolaris*-gnu)
+    linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
       case $cc_basename in
-      # old Intel for x86_64 which still supported -KPIC.
+      # old Intel for x86_64, which still supported -KPIC.
       ecc*)
 	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
 	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
@@ -4373,6 +4714,12 @@ m4_if([$1], [CXX], [
 	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-PIC'
 	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
 	;;
+      tcc*)
+	# Fabrice Bellard et al's Tiny C Compiler
+	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
+	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
+	_LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
+	;;
       pgcc* | pgf77* | pgf90* | pgf95* | pgfortran*)
         # Portland Group compilers (*not* the Pentium gcc compiler,
 	# which looks to be a dead project)
@@ -4470,7 +4817,7 @@ m4_if([$1], [CXX], [
       ;;
 
     sysv4*MP*)
-      if test -d /usr/nec ;then
+      if test -d /usr/nec; then
 	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-Kconform_pic'
 	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
       fi
@@ -4499,7 +4846,7 @@ m4_if([$1], [CXX], [
   fi
 ])
 case $host_os in
-  # For platforms which do not support PIC, -DPIC is meaningless:
+  # For platforms that do not support PIC, -DPIC is meaningless:
   *djgpp*)
     _LT_TAGVAR(lt_prog_compiler_pic, $1)=
     ;;
@@ -4565,17 +4912,21 @@ m4_if([$1], [CXX], [
   case $host_os in
   aix[[4-9]]*)
     # If we're using GNU nm, then we don't want the "-C" option.
-    # -C means demangle to AIX nm, but means don't demangle with GNU nm
-    # Also, AIX nm treats weak defined symbols like other global defined
-    # symbols, whereas GNU nm marks them as "W".
+    # -C means demangle to GNU nm, but means don't demangle to AIX nm.
+    # Without the "-l" option, or with the "-B" option, AIX nm treats
+    # weak defined symbols like other global defined symbols, whereas
+    # GNU nm marks them as "W".
+    # While the 'weak' keyword is ignored in the Export File, we need
+    # it in the Import File for the 'aix-soname' feature, so we have
+    # to replace the "-B" option with "-P" for AIX nm.
     if $NM -V 2>&1 | $GREP 'GNU' > /dev/null; then
-      _LT_TAGVAR(export_symbols_cmds, $1)='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && ([substr](\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+      _LT_TAGVAR(export_symbols_cmds, $1)='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && ([substr](\$ 3,1,1) != ".")) { if (\$ 2 == "W") { print \$ 3 " weak" } else { print \$ 3 } } }'\'' | sort -u > $export_symbols'
     else
-      _LT_TAGVAR(export_symbols_cmds, $1)='$NM -BCpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B")) && ([substr](\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+      _LT_TAGVAR(export_symbols_cmds, $1)='`func_echo_all $NM | $SED -e '\''s/B\([[^B]]*\)$/P\1/'\''` -PCpgl $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W") || (\$ 2 == "V") || (\$ 2 == "Z")) && ([substr](\$ 1,1,1) != ".")) { if ((\$ 2 == "W") || (\$ 2 == "V") || (\$ 2 == "Z")) { print \$ 1 " weak" } else { print \$ 1 } } }'\'' | sort -u > $export_symbols'
     fi
     ;;
   pw32*)
-    _LT_TAGVAR(export_symbols_cmds, $1)="$ltdll_cmds"
+    _LT_TAGVAR(export_symbols_cmds, $1)=$ltdll_cmds
     ;;
   cygwin* | mingw* | cegcc*)
     case $cc_basename in
@@ -4621,9 +4972,9 @@ m4_if([$1], [CXX], [
   # included in the symbol list
   _LT_TAGVAR(include_expsyms, $1)=
   # exclude_expsyms can be an extended regexp of symbols to exclude
-  # it will be wrapped by ` (' and `)$', so one must not match beginning or
-  # end of line.  Example: `a|bc|.*d.*' will exclude the symbols `a' and `bc',
-  # as well as any symbol that contains `d'.
+  # it will be wrapped by ' (' and ')$', so one must not match beginning or
+  # end of line.  Example: 'a|bc|.*d.*' will exclude the symbols 'a' and 'bc',
+  # as well as any symbol that contains 'd'.
   _LT_TAGVAR(exclude_expsyms, $1)=['_GLOBAL_OFFSET_TABLE_|_GLOBAL__F[ID]_.*']
   # Although _GLOBAL_OFFSET_TABLE_ is a valid symbol C name, most a.out
   # platforms (ab)use it in PIC code, but their linkers get confused if
@@ -4639,7 +4990,7 @@ dnl Note also adjust exclude_expsyms for C++ above.
     # FIXME: the MSVC++ port hasn't been tested in a loooong time
     # When not using gcc, we currently assume that we are using
     # Microsoft Visual C++.
-    if test "$GCC" != yes; then
+    if test yes != "$GCC"; then
       with_gnu_ld=no
     fi
     ;;
@@ -4647,7 +4998,7 @@ dnl Note also adjust exclude_expsyms for C++ above.
     # we just hope/assume this is gcc and not c89 (= MSVC++)
     with_gnu_ld=yes
     ;;
-  openbsd*)
+  openbsd* | bitrig*)
     with_gnu_ld=no
     ;;
   esac
@@ -4657,7 +5008,7 @@ dnl Note also adjust exclude_expsyms for C++ above.
   # On some targets, GNU ld is compatible enough with the native linker
   # that we're better off using the native interface for both.
   lt_use_gnu_ld_interface=no
-  if test "$with_gnu_ld" = yes; then
+  if test yes = "$with_gnu_ld"; then
     case $host_os in
       aix*)
 	# The AIX port of GNU ld has always aspired to compatibility
@@ -4679,24 +5030,24 @@ dnl Note also adjust exclude_expsyms for C++ above.
     esac
   fi
 
-  if test "$lt_use_gnu_ld_interface" = yes; then
+  if test yes = "$lt_use_gnu_ld_interface"; then
     # If archive_cmds runs LD, not CC, wlarc should be empty
-    wlarc='${wl}'
+    wlarc='$wl'
 
     # Set some defaults for GNU ld with shared library support. These
     # are reset later if shared libraries are not supported. Putting them
     # here allows them to be overridden if necessary.
     runpath_var=LD_RUN_PATH
-    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
+    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir'
+    _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic'
     # ancient GNU ld didn't support --whole-archive et. al.
     if $LD --help 2>&1 | $GREP 'no-whole-archive' > /dev/null; then
-      _LT_TAGVAR(whole_archive_flag_spec, $1)="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+      _LT_TAGVAR(whole_archive_flag_spec, $1)=$wlarc'--whole-archive$convenience '$wlarc'--no-whole-archive'
     else
       _LT_TAGVAR(whole_archive_flag_spec, $1)=
     fi
     supports_anon_versioning=no
-    case `$LD -v 2>&1` in
+    case `$LD -v 2>&1 | $SED -e 's/([[^)]]\+)\s\+//'` in
       *GNU\ gold*) supports_anon_versioning=yes ;;
       *\ [[01]].* | *\ 2.[[0-9]].* | *\ 2.10.*) ;; # catch versions < 2.11
       *\ 2.11.93.0.2\ *) supports_anon_versioning=yes ;; # RH7.3 ...
@@ -4709,7 +5060,7 @@ dnl Note also adjust exclude_expsyms for C++ above.
     case $host_os in
     aix[[3-9]]*)
       # On AIX/PPC, the GNU linker is very broken
-      if test "$host_cpu" != ia64; then
+      if test ia64 != "$host_cpu"; then
 	_LT_TAGVAR(ld_shlibs, $1)=no
 	cat <<_LT_EOF 1>&2
 
@@ -4728,7 +5079,7 @@ _LT_EOF
       case $host_cpu in
       powerpc)
             # see comment about AmigaOS4 .so support
-            _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+            _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib'
             _LT_TAGVAR(archive_expsym_cmds, $1)=''
         ;;
       m68k)
@@ -4744,7 +5095,7 @@ _LT_EOF
 	_LT_TAGVAR(allow_undefined_flag, $1)=unsupported
 	# Joseph Beckenbach <jrb3@best.com> says some releases of gcc
 	# support --undefined.  This deserves some investigation.  FIXME
-	_LT_TAGVAR(archive_cmds, $1)='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -nostart $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib'
       else
 	_LT_TAGVAR(ld_shlibs, $1)=no
       fi
@@ -4754,7 +5105,7 @@ _LT_EOF
       # _LT_TAGVAR(hardcode_libdir_flag_spec, $1) is actually meaningless,
       # as there is no search path for DLLs.
       _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
-      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-all-symbols'
+      _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-all-symbols'
       _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
       _LT_TAGVAR(always_export_symbols, $1)=no
       _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
@@ -4762,61 +5113,89 @@ _LT_EOF
       _LT_TAGVAR(exclude_expsyms, $1)=['[_]+GLOBAL_OFFSET_TABLE_|[_]+GLOBAL__[FID]_.*|[_]+head_[A-Za-z0-9_]+_dll|[A-Za-z0-9_]+_dll_iname']
 
       if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then
-        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
-	# If the export-symbols file already is a .def file (1st line
-	# is EXPORTS), use it as is; otherwise, prepend...
-	_LT_TAGVAR(archive_expsym_cmds, $1)='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
-	  cp $export_symbols $output_objdir/$soname.def;
-	else
-	  echo EXPORTS > $output_objdir/$soname.def;
-	  cat $export_symbols >> $output_objdir/$soname.def;
-	fi~
-	$CC -shared $output_objdir/$soname.def $libobjs $deplibs $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+	# If the export-symbols file already is a .def file, use it as
+	# is; otherwise, prepend EXPORTS...
+	_LT_TAGVAR(archive_expsym_cmds, $1)='if _LT_DLL_DEF_P([$export_symbols]); then
+          cp $export_symbols $output_objdir/$soname.def;
+        else
+          echo EXPORTS > $output_objdir/$soname.def;
+          cat $export_symbols >> $output_objdir/$soname.def;
+        fi~
+        $CC -shared $output_objdir/$soname.def $libobjs $deplibs $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
       else
 	_LT_TAGVAR(ld_shlibs, $1)=no
       fi
       ;;
 
     haiku*)
-      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib'
       _LT_TAGVAR(link_all_deplibs, $1)=yes
       ;;
 
+    os2*)
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
+      _LT_TAGVAR(hardcode_minus_L, $1)=yes
+      _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
+      shrext_cmds=.dll
+      _LT_TAGVAR(archive_cmds, $1)='$ECHO "LIBRARY ${soname%$shared_ext} INITINSTANCE TERMINSTANCE" > $output_objdir/$libname.def~
+	$ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~
+	$ECHO "DATA MULTIPLE NONSHARED" >> $output_objdir/$libname.def~
+	$ECHO EXPORTS >> $output_objdir/$libname.def~
+	emxexp $libobjs | $SED /"_DLL_InitTerm"/d >> $output_objdir/$libname.def~
+	$CC -Zdll -Zcrtdll -o $output_objdir/$soname $libobjs $deplibs $compiler_flags $output_objdir/$libname.def~
+	emximp -o $lib $output_objdir/$libname.def'
+      _LT_TAGVAR(archive_expsym_cmds, $1)='$ECHO "LIBRARY ${soname%$shared_ext} INITINSTANCE TERMINSTANCE" > $output_objdir/$libname.def~
+	$ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~
+	$ECHO "DATA MULTIPLE NONSHARED" >> $output_objdir/$libname.def~
+	$ECHO EXPORTS >> $output_objdir/$libname.def~
+	prefix_cmds="$SED"~
+	if test EXPORTS = "`$SED 1q $export_symbols`"; then
+	  prefix_cmds="$prefix_cmds -e 1d";
+	fi~
+	prefix_cmds="$prefix_cmds -e \"s/^\(.*\)$/_\1/g\""~
+	cat $export_symbols | $prefix_cmds >> $output_objdir/$libname.def~
+	$CC -Zdll -Zcrtdll -o $output_objdir/$soname $libobjs $deplibs $compiler_flags $output_objdir/$libname.def~
+	emximp -o $lib $output_objdir/$libname.def'
+      _LT_TAGVAR(old_archive_From_new_cmds, $1)='emximp -o $output_objdir/${libname}_dll.a $output_objdir/$libname.def'
+      _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
+      ;;
+
     interix[[3-9]]*)
       _LT_TAGVAR(hardcode_direct, $1)=no
       _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir'
+      _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E'
       # Hack: On Interix 3.x, we cannot compile PIC because of a broken gcc.
       # Instead, shared libraries are loaded at an image base (0x10000000 by
       # default) and relocated if they conflict, which is a slow very memory
       # consuming and fragmenting process.  To avoid this, we pick a random,
       # 256 KiB-aligned image base between 0x50000000 and 0x6FFC0000 at link
       # time.  Moving up from 0x10000000 also allows more sbrk(2) space.
-      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
-      _LT_TAGVAR(archive_expsym_cmds, $1)='sed "s,^,_," $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--retain-symbols-file,$output_objdir/$soname.expsym ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-h,$soname $wl--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+      _LT_TAGVAR(archive_expsym_cmds, $1)='sed "s|^|_|" $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-h,$soname $wl--retain-symbols-file,$output_objdir/$soname.expsym $wl--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
       ;;
 
     gnu* | linux* | tpf* | k*bsd*-gnu | kopensolaris*-gnu)
       tmp_diet=no
-      if test "$host_os" = linux-dietlibc; then
+      if test linux-dietlibc = "$host_os"; then
 	case $cc_basename in
 	  diet\ *) tmp_diet=yes;;	# linux-dietlibc with static linking (!diet-dyn)
 	esac
       fi
       if $LD --help 2>&1 | $EGREP ': supported targets:.* elf' > /dev/null \
-	 && test "$tmp_diet" = no
+	 && test no = "$tmp_diet"
       then
 	tmp_addflag=' $pic_flag'
 	tmp_sharedflag='-shared'
 	case $cc_basename,$host_cpu in
         pgcc*)				# Portland Group C compiler
-	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive'
 	  tmp_addflag=' $pic_flag'
 	  ;;
 	pgf77* | pgf90* | pgf95* | pgfortran*)
 					# Portland Group f77 and f90 compilers
-	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive'
 	  tmp_addflag=' $pic_flag -Mnomain' ;;
 	ecc*,ia64* | icc*,ia64*)	# Intel C compiler on ia64
 	  tmp_addflag=' -i_dynamic' ;;
@@ -4827,42 +5206,47 @@ _LT_EOF
 	lf95*)				# Lahey Fortran 8.1
 	  _LT_TAGVAR(whole_archive_flag_spec, $1)=
 	  tmp_sharedflag='--shared' ;;
+        nagfor*)                        # NAGFOR 5.3
+          tmp_sharedflag='-Wl,-shared' ;;
 	xl[[cC]]* | bgxl[[cC]]* | mpixl[[cC]]*) # IBM XL C 8.0 on PPC (deal with xlf below)
 	  tmp_sharedflag='-qmkshrobj'
 	  tmp_addflag= ;;
 	nvcc*)	# Cuda Compiler Driver 2.2
-	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive'
 	  _LT_TAGVAR(compiler_needs_object, $1)=yes
 	  ;;
 	esac
 	case `$CC -V 2>&1 | sed 5q` in
 	*Sun\ C*)			# Sun C 5.9
-	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	  _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive'
 	  _LT_TAGVAR(compiler_needs_object, $1)=yes
 	  tmp_sharedflag='-G' ;;
 	*Sun\ F*)			# Sun Fortran 8.3
 	  tmp_sharedflag='-G' ;;
 	esac
-	_LT_TAGVAR(archive_cmds, $1)='$CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	_LT_TAGVAR(archive_cmds, $1)='$CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib'
 
-        if test "x$supports_anon_versioning" = xyes; then
+        if test yes = "$supports_anon_versioning"; then
           _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $output_objdir/$libname.ver~
-	    cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
-	    echo "local: *; };" >> $output_objdir/$libname.ver~
-	    $CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-version-script ${wl}$output_objdir/$libname.ver -o $lib'
+            cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
+            echo "local: *; };" >> $output_objdir/$libname.ver~
+            $CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-version-script $wl$output_objdir/$libname.ver -o $lib'
         fi
 
 	case $cc_basename in
+	tcc*)
+	  _LT_TAGVAR(export_dynamic_flag_spec, $1)='-rdynamic'
+	  ;;
 	xlf* | bgf* | bgxlf* | mpixlf*)
 	  # IBM XL Fortran 10.1 on PPC cannot create shared libs itself
 	  _LT_TAGVAR(whole_archive_flag_spec, $1)='--whole-archive$convenience --no-whole-archive'
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir'
 	  _LT_TAGVAR(archive_cmds, $1)='$LD -shared $libobjs $deplibs $linker_flags -soname $soname -o $lib'
-	  if test "x$supports_anon_versioning" = xyes; then
+	  if test yes = "$supports_anon_versioning"; then
 	    _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $output_objdir/$libname.ver~
-	      cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
-	      echo "local: *; };" >> $output_objdir/$libname.ver~
-	      $LD -shared $libobjs $deplibs $linker_flags -soname $soname -version-script $output_objdir/$libname.ver -o $lib'
+              cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
+              echo "local: *; };" >> $output_objdir/$libname.ver~
+              $LD -shared $libobjs $deplibs $linker_flags -soname $soname -version-script $output_objdir/$libname.ver -o $lib'
 	  fi
 	  ;;
 	esac
@@ -4876,8 +5260,8 @@ _LT_EOF
 	_LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
 	wlarc=
       else
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib'
       fi
       ;;
 
@@ -4895,8 +5279,8 @@ _LT_EOF
 
 _LT_EOF
       elif $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib'
       else
 	_LT_TAGVAR(ld_shlibs, $1)=no
       fi
@@ -4908,7 +5292,7 @@ _LT_EOF
 	_LT_TAGVAR(ld_shlibs, $1)=no
 	cat <<_LT_EOF 1>&2
 
-*** Warning: Releases of the GNU linker prior to 2.16.91.0.3 can not
+*** Warning: Releases of the GNU linker prior to 2.16.91.0.3 cannot
 *** reliably create shared libraries on SCO systems.  Therefore, libtool
 *** is disabling shared libraries support.  We urge you to upgrade GNU
 *** binutils to release 2.16.91.0.3 or newer.  Another option is to modify
@@ -4923,9 +5307,9 @@ _LT_EOF
 	  # DT_RUNPATH tag from executables and libraries.  But doing so
 	  # requires that you compile everything twice, which is a pain.
 	  if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir'
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib'
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib'
 	  else
 	    _LT_TAGVAR(ld_shlibs, $1)=no
 	  fi
@@ -4942,15 +5326,15 @@ _LT_EOF
 
     *)
       if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib'
       else
 	_LT_TAGVAR(ld_shlibs, $1)=no
       fi
       ;;
     esac
 
-    if test "$_LT_TAGVAR(ld_shlibs, $1)" = no; then
+    if test no = "$_LT_TAGVAR(ld_shlibs, $1)"; then
       runpath_var=
       _LT_TAGVAR(hardcode_libdir_flag_spec, $1)=
       _LT_TAGVAR(export_dynamic_flag_spec, $1)=
@@ -4966,7 +5350,7 @@ _LT_EOF
       # Note: this linker hardcodes the directories in LIBPATH if there
       # are no directories specified by -L.
       _LT_TAGVAR(hardcode_minus_L, $1)=yes
-      if test "$GCC" = yes && test -z "$lt_prog_compiler_static"; then
+      if test yes = "$GCC" && test -z "$lt_prog_compiler_static"; then
 	# Neither direct hardcoding nor static linking is supported with a
 	# broken collect2.
 	_LT_TAGVAR(hardcode_direct, $1)=unsupported
@@ -4974,34 +5358,57 @@ _LT_EOF
       ;;
 
     aix[[4-9]]*)
-      if test "$host_cpu" = ia64; then
+      if test ia64 = "$host_cpu"; then
 	# On IA64, the linker does run time linking by default, so we don't
 	# have to do anything special.
 	aix_use_runtimelinking=no
 	exp_sym_flag='-Bexport'
-	no_entry_flag=""
+	no_entry_flag=
       else
 	# If we're using GNU nm, then we don't want the "-C" option.
-	# -C means demangle to AIX nm, but means don't demangle with GNU nm
-	# Also, AIX nm treats weak defined symbols like other global
-	# defined symbols, whereas GNU nm marks them as "W".
+	# -C means demangle to GNU nm, but means don't demangle to AIX nm.
+	# Without the "-l" option, or with the "-B" option, AIX nm treats
+	# weak defined symbols like other global defined symbols, whereas
+	# GNU nm marks them as "W".
+	# While the 'weak' keyword is ignored in the Export File, we need
+	# it in the Import File for the 'aix-soname' feature, so we have
+	# to replace the "-B" option with "-P" for AIX nm.
 	if $NM -V 2>&1 | $GREP 'GNU' > /dev/null; then
-	  _LT_TAGVAR(export_symbols_cmds, $1)='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && ([substr](\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+	  _LT_TAGVAR(export_symbols_cmds, $1)='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && ([substr](\$ 3,1,1) != ".")) { if (\$ 2 == "W") { print \$ 3 " weak" } else { print \$ 3 } } }'\'' | sort -u > $export_symbols'
 	else
-	  _LT_TAGVAR(export_symbols_cmds, $1)='$NM -BCpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B")) && ([substr](\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
+	  _LT_TAGVAR(export_symbols_cmds, $1)='`func_echo_all $NM | $SED -e '\''s/B\([[^B]]*\)$/P\1/'\''` -PCpgl $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W") || (\$ 2 == "V") || (\$ 2 == "Z")) && ([substr](\$ 1,1,1) != ".")) { if ((\$ 2 == "W") || (\$ 2 == "V") || (\$ 2 == "Z")) { print \$ 1 " weak" } else { print \$ 1 } } }'\'' | sort -u > $export_symbols'
 	fi
 	aix_use_runtimelinking=no
 
 	# Test if we are trying to use run time linking or normal
 	# AIX style linking. If -brtl is somewhere in LDFLAGS, we
-	# need to do runtime linking.
+	# have runtime linking enabled, and use it for executables.
+	# For shared libraries, we enable/disable runtime linking
+	# depending on the kind of the shared library created -
+	# when "with_aix_soname,aix_use_runtimelinking" is:
+	# "aix,no"   lib.a(lib.so.V) shared, rtl:no,  for executables
+	# "aix,yes"  lib.so          shared, rtl:yes, for executables
+	#            lib.a           static archive
+	# "both,no"  lib.so.V(shr.o) shared, rtl:yes
+	#            lib.a(lib.so.V) shared, rtl:no,  for executables
+	# "both,yes" lib.so.V(shr.o) shared, rtl:yes, for executables
+	#            lib.a(lib.so.V) shared, rtl:no
+	# "svr4,*"   lib.so.V(shr.o) shared, rtl:yes, for executables
+	#            lib.a           static archive
 	case $host_os in aix4.[[23]]|aix4.[[23]].*|aix[[5-9]]*)
 	  for ld_flag in $LDFLAGS; do
-	  if (test $ld_flag = "-brtl" || test $ld_flag = "-Wl,-brtl"); then
+	  if (test x-brtl = "x$ld_flag" || test x-Wl,-brtl = "x$ld_flag"); then
 	    aix_use_runtimelinking=yes
 	    break
 	  fi
 	  done
+	  if test svr4,no = "$with_aix_soname,$aix_use_runtimelinking"; then
+	    # With aix-soname=svr4, we create the lib.so.V shared archives only,
+	    # so we don't have lib.a shared libs to link our executables.
+	    # We have to force runtime linking in this case.
+	    aix_use_runtimelinking=yes
+	    LDFLAGS="$LDFLAGS -Wl,-brtl"
+	  fi
 	  ;;
 	esac
 
@@ -5020,13 +5427,21 @@ _LT_EOF
       _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
       _LT_TAGVAR(hardcode_libdir_separator, $1)=':'
       _LT_TAGVAR(link_all_deplibs, $1)=yes
-      _LT_TAGVAR(file_list_spec, $1)='${wl}-f,'
+      _LT_TAGVAR(file_list_spec, $1)='$wl-f,'
+      case $with_aix_soname,$aix_use_runtimelinking in
+      aix,*) ;; # traditional, no import file
+      svr4,* | *,yes) # use import file
+	# The Import File defines what to hardcode.
+	_LT_TAGVAR(hardcode_direct, $1)=no
+	_LT_TAGVAR(hardcode_direct_absolute, $1)=no
+	;;
+      esac
 
-      if test "$GCC" = yes; then
+      if test yes = "$GCC"; then
 	case $host_os in aix4.[[012]]|aix4.[[012]].*)
 	# We only want to do this on AIX 4.2 and lower, the check
 	# below for broken collect2 doesn't work under 4.3+
-	  collect2name=`${CC} -print-prog-name=collect2`
+	  collect2name=`$CC -print-prog-name=collect2`
 	  if test -f "$collect2name" &&
 	   strings "$collect2name" | $GREP resolve_lib_name >/dev/null
 	  then
@@ -5045,61 +5460,80 @@ _LT_EOF
 	  ;;
 	esac
 	shared_flag='-shared'
-	if test "$aix_use_runtimelinking" = yes; then
-	  shared_flag="$shared_flag "'${wl}-G'
+	if test yes = "$aix_use_runtimelinking"; then
+	  shared_flag="$shared_flag "'$wl-G'
 	fi
+	# Need to ensure runtime linking is disabled for the traditional
+	# shared library, or the linker may eventually find shared libraries
+	# /with/ Import File - we do not want to mix them.
+	shared_flag_aix='-shared'
+	shared_flag_svr4='-shared $wl-G'
       else
 	# not using gcc
-	if test "$host_cpu" = ia64; then
+	if test ia64 = "$host_cpu"; then
 	# VisualAge C++, Version 5.5 for AIX 5L for IA-64, Beta 3 Release
 	# chokes on -Wl,-G. The following line is correct:
 	  shared_flag='-G'
 	else
-	  if test "$aix_use_runtimelinking" = yes; then
-	    shared_flag='${wl}-G'
+	  if test yes = "$aix_use_runtimelinking"; then
+	    shared_flag='$wl-G'
 	  else
-	    shared_flag='${wl}-bM:SRE'
+	    shared_flag='$wl-bM:SRE'
 	  fi
+	  shared_flag_aix='$wl-bM:SRE'
+	  shared_flag_svr4='$wl-G'
 	fi
       fi
 
-      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-bexpall'
+      _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-bexpall'
       # It seems that -bexpall does not export symbols beginning with
       # underscore (_), so it is better to generate a list of symbols to export.
       _LT_TAGVAR(always_export_symbols, $1)=yes
-      if test "$aix_use_runtimelinking" = yes; then
+      if test aix,yes = "$with_aix_soname,$aix_use_runtimelinking"; then
 	# Warning - without using the other runtime loading flags (-brtl),
 	# -berok will link without error, but may produce a broken library.
 	_LT_TAGVAR(allow_undefined_flag, $1)='-berok'
         # Determine the default libpath from the value encoded in an
         # empty executable.
         _LT_SYS_MODULE_PATH_AIX([$1])
-        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-blibpath:$libdir:'"$aix_libpath"
-        _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags `if test "x${allow_undefined_flag}" != "x"; then func_echo_all "${wl}${allow_undefined_flag}"; else :; fi` '"\${wl}$exp_sym_flag:\$export_symbols $shared_flag"
+        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-blibpath:$libdir:'"$aix_libpath"
+        _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $deplibs $wl'$no_entry_flag' $compiler_flags `if test -n "$allow_undefined_flag"; then func_echo_all "$wl$allow_undefined_flag"; else :; fi` $wl'$exp_sym_flag:\$export_symbols' '$shared_flag
       else
-	if test "$host_cpu" = ia64; then
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R $libdir:/usr/lib:/lib'
+	if test ia64 = "$host_cpu"; then
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-R $libdir:/usr/lib:/lib'
 	  _LT_TAGVAR(allow_undefined_flag, $1)="-z nodefs"
-	  _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$exp_sym_flag:\$export_symbols"
+	  _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\$wl$no_entry_flag"' $compiler_flags $wl$allow_undefined_flag '"\$wl$exp_sym_flag:\$export_symbols"
 	else
 	 # Determine the default libpath from the value encoded in an
 	 # empty executable.
 	 _LT_SYS_MODULE_PATH_AIX([$1])
-	 _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-blibpath:$libdir:'"$aix_libpath"
+	 _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-blibpath:$libdir:'"$aix_libpath"
 	  # Warning - without using the other run time loading flags,
 	  # -berok will link without error, but may produce a broken library.
-	  _LT_TAGVAR(no_undefined_flag, $1)=' ${wl}-bernotok'
-	  _LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-berok'
-	  if test "$with_gnu_ld" = yes; then
+	  _LT_TAGVAR(no_undefined_flag, $1)=' $wl-bernotok'
+	  _LT_TAGVAR(allow_undefined_flag, $1)=' $wl-berok'
+	  if test yes = "$with_gnu_ld"; then
 	    # We only use this code for GNU lds that support --whole-archive.
-	    _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
+	    _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive$convenience $wl--no-whole-archive'
 	  else
 	    # Exported symbols can be pulled into shared objects from archives
 	    _LT_TAGVAR(whole_archive_flag_spec, $1)='$convenience'
 	  fi
 	  _LT_TAGVAR(archive_cmds_need_lc, $1)=yes
-	  # This is similar to how AIX traditionally builds its shared libraries.
-	  _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs ${wl}-bnoentry $compiler_flags ${wl}-bE:$export_symbols${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
+	  _LT_TAGVAR(archive_expsym_cmds, $1)='$RM -r $output_objdir/$realname.d~$MKDIR $output_objdir/$realname.d'
+	  # -brtl affects multiple linker settings, -berok does not and is overridden later
+	  compiler_flags_filtered='`func_echo_all "$compiler_flags " | $SED -e "s%-brtl\\([[, ]]\\)%-berok\\1%g"`'
+	  if test svr4 != "$with_aix_soname"; then
+	    # This is similar to how AIX traditionally builds its shared libraries.
+	    _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$CC '$shared_flag_aix' -o $output_objdir/$realname.d/$soname $libobjs $deplibs $wl-bnoentry '$compiler_flags_filtered'$wl-bE:$export_symbols$allow_undefined_flag~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$realname.d/$soname'
+	  fi
+	  if test aix != "$with_aix_soname"; then
+	    _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$CC '$shared_flag_svr4' -o $output_objdir/$realname.d/$shared_archive_member_spec.o $libobjs $deplibs $wl-bnoentry '$compiler_flags_filtered'$wl-bE:$export_symbols$allow_undefined_flag~$STRIP -e $output_objdir/$realname.d/$shared_archive_member_spec.o~( func_echo_all "#! $soname($shared_archive_member_spec.o)"; if test shr_64 = "$shared_archive_member_spec"; then func_echo_all "# 64"; else func_echo_all "# 32"; fi; cat $export_symbols ) > $output_objdir/$realname.d/$shared_archive_member_spec.imp~$AR $AR_FLAGS $output_objdir/$soname $output_objdir/$realname.d/$shared_archive_member_spec.o $output_objdir/$realname.d/$shared_archive_member_spec.imp'
+	  else
+	    # used by -dlpreopen to get the symbols
+	    _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$MV  $output_objdir/$realname.d/$soname $output_objdir'
+	  fi
+	  _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$RM -r $output_objdir/$realname.d'
 	fi
       fi
       ;;
@@ -5108,7 +5542,7 @@ _LT_EOF
       case $host_cpu in
       powerpc)
             # see comment about AmigaOS4 .so support
-            _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+            _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib'
             _LT_TAGVAR(archive_expsym_cmds, $1)=''
         ;;
       m68k)
@@ -5138,16 +5572,17 @@ _LT_EOF
 	# Tell ltmain to make .lib files, not .a files.
 	libext=lib
 	# Tell ltmain to make .dll files, not .so files.
-	shrext_cmds=".dll"
+	shrext_cmds=.dll
 	# FIXME: Setting linknames here is a bad hack.
-	_LT_TAGVAR(archive_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-dll~linknames='
-	_LT_TAGVAR(archive_expsym_cmds, $1)='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
-	    sed -n -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' -e '1\\\!p' < $export_symbols > $output_objdir/$soname.exp;
-	  else
-	    sed -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' < $export_symbols > $output_objdir/$soname.exp;
-	  fi~
-	  $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~
-	  linknames='
+	_LT_TAGVAR(archive_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~linknames='
+	_LT_TAGVAR(archive_expsym_cmds, $1)='if _LT_DLL_DEF_P([$export_symbols]); then
+            cp "$export_symbols" "$output_objdir/$soname.def";
+            echo "$tool_output_objdir$soname.def" > "$output_objdir/$soname.exp";
+          else
+            $SED -e '\''s/^/-link -EXPORT:/'\'' < $export_symbols > $output_objdir/$soname.exp;
+          fi~
+          $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~
+          linknames='
 	# The linker will not automatically build a static lib if we build a DLL.
 	# _LT_TAGVAR(old_archive_from_new_cmds, $1)='true'
 	_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
@@ -5156,18 +5591,18 @@ _LT_EOF
 	# Don't use ranlib
 	_LT_TAGVAR(old_postinstall_cmds, $1)='chmod 644 $oldlib'
 	_LT_TAGVAR(postlink_cmds, $1)='lt_outputfile="@OUTPUT@"~
-	  lt_tool_outputfile="@TOOL_OUTPUT@"~
-	  case $lt_outputfile in
-	    *.exe|*.EXE) ;;
-	    *)
-	      lt_outputfile="$lt_outputfile.exe"
-	      lt_tool_outputfile="$lt_tool_outputfile.exe"
-	      ;;
-	  esac~
-	  if test "$MANIFEST_TOOL" != ":" && test -f "$lt_outputfile.manifest"; then
-	    $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1;
-	    $RM "$lt_outputfile.manifest";
-	  fi'
+          lt_tool_outputfile="@TOOL_OUTPUT@"~
+          case $lt_outputfile in
+            *.exe|*.EXE) ;;
+            *)
+              lt_outputfile=$lt_outputfile.exe
+              lt_tool_outputfile=$lt_tool_outputfile.exe
+              ;;
+          esac~
+          if test : != "$MANIFEST_TOOL" && test -f "$lt_outputfile.manifest"; then
+            $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1;
+            $RM "$lt_outputfile.manifest";
+          fi'
 	;;
       *)
 	# Assume MSVC wrapper
@@ -5176,7 +5611,7 @@ _LT_EOF
 	# Tell ltmain to make .lib files, not .a files.
 	libext=lib
 	# Tell ltmain to make .dll files, not .so files.
-	shrext_cmds=".dll"
+	shrext_cmds=.dll
 	# FIXME: Setting linknames here is a bad hack.
 	_LT_TAGVAR(archive_cmds, $1)='$CC -o $lib $libobjs $compiler_flags `func_echo_all "$deplibs" | $SED '\''s/ -lc$//'\''` -link -dll~linknames='
 	# The linker will automatically build a .lib file if we build a DLL.
@@ -5226,33 +5661,33 @@ _LT_EOF
       ;;
 
     hpux9*)
-      if test "$GCC" = yes; then
-	_LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared $pic_flag ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $libobjs $deplibs $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+      if test yes = "$GCC"; then
+	_LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared $pic_flag $wl+b $wl$install_libdir -o $output_objdir/$soname $libobjs $deplibs $compiler_flags~test "x$output_objdir/$soname" = "x$lib" || mv $output_objdir/$soname $lib'
       else
-	_LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+	_LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test "x$output_objdir/$soname" = "x$lib" || mv $output_objdir/$soname $lib'
       fi
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl+b $wl$libdir'
       _LT_TAGVAR(hardcode_libdir_separator, $1)=:
       _LT_TAGVAR(hardcode_direct, $1)=yes
 
       # hardcode_minus_L: Not really in the search PATH,
       # but as the default location of the library.
       _LT_TAGVAR(hardcode_minus_L, $1)=yes
-      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+      _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E'
       ;;
 
     hpux10*)
-      if test "$GCC" = yes && test "$with_gnu_ld" = no; then
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
+      if test yes,no = "$GCC,$with_gnu_ld"; then
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
       else
 	_LT_TAGVAR(archive_cmds, $1)='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags'
       fi
-      if test "$with_gnu_ld" = no; then
-	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
+      if test no = "$with_gnu_ld"; then
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl+b $wl$libdir'
 	_LT_TAGVAR(hardcode_libdir_separator, $1)=:
 	_LT_TAGVAR(hardcode_direct, $1)=yes
 	_LT_TAGVAR(hardcode_direct_absolute, $1)=yes
-	_LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+	_LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E'
 	# hardcode_minus_L: Not really in the search PATH,
 	# but as the default location of the library.
 	_LT_TAGVAR(hardcode_minus_L, $1)=yes
@@ -5260,25 +5695,25 @@ _LT_EOF
       ;;
 
     hpux11*)
-      if test "$GCC" = yes && test "$with_gnu_ld" = no; then
+      if test yes,no = "$GCC,$with_gnu_ld"; then
 	case $host_cpu in
 	hppa*64*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}+h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $wl+h $wl$soname -o $lib $libobjs $deplibs $compiler_flags'
 	  ;;
 	ia64*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $wl+h $wl$soname $wl+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags'
 	  ;;
 	*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
 	  ;;
 	esac
       else
 	case $host_cpu in
 	hppa*64*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname -o $lib $libobjs $deplibs $compiler_flags'
 	  ;;
 	ia64*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname $wl+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags'
 	  ;;
 	*)
 	m4_if($1, [], [
@@ -5286,14 +5721,14 @@ _LT_EOF
 	  # (HP92453-01 A.11.01.20 doesn't, HP92453-01 B.11.X.35175-35176.GP does)
 	  _LT_LINKER_OPTION([if $CC understands -b],
 	    _LT_TAGVAR(lt_cv_prog_compiler__b, $1), [-b],
-	    [_LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'],
+	    [_LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $libobjs $deplibs $compiler_flags'],
 	    [_LT_TAGVAR(archive_cmds, $1)='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags'])],
-	  [_LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'])
+	  [_LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $libobjs $deplibs $compiler_flags'])
 	  ;;
 	esac
       fi
-      if test "$with_gnu_ld" = no; then
-	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
+      if test no = "$with_gnu_ld"; then
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl+b $wl$libdir'
 	_LT_TAGVAR(hardcode_libdir_separator, $1)=:
 
 	case $host_cpu in
@@ -5304,7 +5739,7 @@ _LT_EOF
 	*)
 	  _LT_TAGVAR(hardcode_direct, $1)=yes
 	  _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
-	  _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+	  _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E'
 
 	  # hardcode_minus_L: Not really in the search PATH,
 	  # but as the default location of the library.
@@ -5315,16 +5750,16 @@ _LT_EOF
       ;;
 
     irix5* | irix6* | nonstopux*)
-      if test "$GCC" = yes; then
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+      if test yes = "$GCC"; then
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib'
 	# Try to use the -exported_symbol ld option, if it does not
 	# work, assume that -exports_file does not work either and
 	# implicitly export all symbols.
 	# This should be the same for all languages, so no per-tag cache variable.
 	AC_CACHE_CHECK([whether the $host_os linker accepts -exported_symbol],
 	  [lt_cv_irix_exported_symbol],
-	  [save_LDFLAGS="$LDFLAGS"
-	   LDFLAGS="$LDFLAGS -shared ${wl}-exported_symbol ${wl}foo ${wl}-update_registry ${wl}/dev/null"
+	  [save_LDFLAGS=$LDFLAGS
+	   LDFLAGS="$LDFLAGS -shared $wl-exported_symbol ${wl}foo $wl-update_registry $wl/dev/null"
 	   AC_LINK_IFELSE(
 	     [AC_LANG_SOURCE(
 	        [AC_LANG_CASE([C], [[int foo (void) { return 0; }]],
@@ -5337,21 +5772,31 @@ _LT_EOF
       end]])])],
 	      [lt_cv_irix_exported_symbol=yes],
 	      [lt_cv_irix_exported_symbol=no])
-           LDFLAGS="$save_LDFLAGS"])
-	if test "$lt_cv_irix_exported_symbol" = yes; then
-          _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations ${wl}-exports_file ${wl}$export_symbols -o $lib'
+           LDFLAGS=$save_LDFLAGS])
+	if test yes = "$lt_cv_irix_exported_symbol"; then
+          _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations $wl-exports_file $wl$export_symbols -o $lib'
 	fi
       else
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -exports_file $export_symbols -o $lib'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -exports_file $export_symbols -o $lib'
       fi
       _LT_TAGVAR(archive_cmds_need_lc, $1)='no'
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir'
       _LT_TAGVAR(hardcode_libdir_separator, $1)=:
       _LT_TAGVAR(inherit_rpath, $1)=yes
       _LT_TAGVAR(link_all_deplibs, $1)=yes
       ;;
 
+    linux*)
+      case $cc_basename in
+      tcc*)
+	# Fabrice Bellard et al's Tiny C Compiler
+	_LT_TAGVAR(ld_shlibs, $1)=yes
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+	;;
+      esac
+      ;;
+
     netbsd*)
       if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
 	_LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'  # a.out
@@ -5366,7 +5811,7 @@ _LT_EOF
     newsos6)
       _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
       _LT_TAGVAR(hardcode_direct, $1)=yes
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir'
       _LT_TAGVAR(hardcode_libdir_separator, $1)=:
       _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
       ;;
@@ -5374,27 +5819,19 @@ _LT_EOF
     *nto* | *qnx*)
       ;;
 
-    openbsd*)
+    openbsd* | bitrig*)
       if test -f /usr/libexec/ld.so; then
 	_LT_TAGVAR(hardcode_direct, $1)=yes
 	_LT_TAGVAR(hardcode_shlibpath_var, $1)=no
 	_LT_TAGVAR(hardcode_direct_absolute, $1)=yes
-	if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+	if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`"; then
 	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
-	  _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags ${wl}-retain-symbols-file,$export_symbols'
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-	  _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+	  _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags $wl-retain-symbols-file,$export_symbols'
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir'
+	  _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E'
 	else
-	  case $host_os in
-	   openbsd[[01]].* | openbsd2.[[0-7]] | openbsd2.[[0-7]].*)
-	     _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
-	     _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
-	     ;;
-	   *)
-	     _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
-	     _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-	     ;;
-	  esac
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir'
 	fi
       else
 	_LT_TAGVAR(ld_shlibs, $1)=no
@@ -5405,33 +5842,53 @@ _LT_EOF
       _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
       _LT_TAGVAR(hardcode_minus_L, $1)=yes
       _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
-      _LT_TAGVAR(archive_cmds, $1)='$ECHO "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~echo DATA >> $output_objdir/$libname.def~echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def'
-      _LT_TAGVAR(old_archive_from_new_cmds, $1)='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def'
+      shrext_cmds=.dll
+      _LT_TAGVAR(archive_cmds, $1)='$ECHO "LIBRARY ${soname%$shared_ext} INITINSTANCE TERMINSTANCE" > $output_objdir/$libname.def~
+	$ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~
+	$ECHO "DATA MULTIPLE NONSHARED" >> $output_objdir/$libname.def~
+	$ECHO EXPORTS >> $output_objdir/$libname.def~
+	emxexp $libobjs | $SED /"_DLL_InitTerm"/d >> $output_objdir/$libname.def~
+	$CC -Zdll -Zcrtdll -o $output_objdir/$soname $libobjs $deplibs $compiler_flags $output_objdir/$libname.def~
+	emximp -o $lib $output_objdir/$libname.def'
+      _LT_TAGVAR(archive_expsym_cmds, $1)='$ECHO "LIBRARY ${soname%$shared_ext} INITINSTANCE TERMINSTANCE" > $output_objdir/$libname.def~
+	$ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~
+	$ECHO "DATA MULTIPLE NONSHARED" >> $output_objdir/$libname.def~
+	$ECHO EXPORTS >> $output_objdir/$libname.def~
+	prefix_cmds="$SED"~
+	if test EXPORTS = "`$SED 1q $export_symbols`"; then
+	  prefix_cmds="$prefix_cmds -e 1d";
+	fi~
+	prefix_cmds="$prefix_cmds -e \"s/^\(.*\)$/_\1/g\""~
+	cat $export_symbols | $prefix_cmds >> $output_objdir/$libname.def~
+	$CC -Zdll -Zcrtdll -o $output_objdir/$soname $libobjs $deplibs $compiler_flags $output_objdir/$libname.def~
+	emximp -o $lib $output_objdir/$libname.def'
+      _LT_TAGVAR(old_archive_From_new_cmds, $1)='emximp -o $output_objdir/${libname}_dll.a $output_objdir/$libname.def'
+      _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
       ;;
 
     osf3*)
-      if test "$GCC" = yes; then
-	_LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-expect_unresolved ${wl}\*'
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+      if test yes = "$GCC"; then
+	_LT_TAGVAR(allow_undefined_flag, $1)=' $wl-expect_unresolved $wl\*'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared$allow_undefined_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib'
       else
 	_LT_TAGVAR(allow_undefined_flag, $1)=' -expect_unresolved \*'
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared$allow_undefined_flag $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib'
       fi
       _LT_TAGVAR(archive_cmds_need_lc, $1)='no'
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir'
       _LT_TAGVAR(hardcode_libdir_separator, $1)=:
       ;;
 
     osf4* | osf5*)	# as osf3* with the addition of -msym flag
-      if test "$GCC" = yes; then
-	_LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-expect_unresolved ${wl}\*'
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $pic_flag $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
-	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+      if test yes = "$GCC"; then
+	_LT_TAGVAR(allow_undefined_flag, $1)=' $wl-expect_unresolved $wl\*'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared$allow_undefined_flag $pic_flag $libobjs $deplibs $compiler_flags $wl-msym $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib'
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir'
       else
 	_LT_TAGVAR(allow_undefined_flag, $1)=' -expect_unresolved \*'
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared$allow_undefined_flag $libobjs $deplibs $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib'
 	_LT_TAGVAR(archive_expsym_cmds, $1)='for i in `cat $export_symbols`; do printf "%s %s\\n" -exported_symbol "\$i" >> $lib.exp; done; printf "%s\\n" "-hidden">> $lib.exp~
-	$CC -shared${allow_undefined_flag} ${wl}-input ${wl}$lib.exp $compiler_flags $libobjs $deplibs -soname $soname `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib~$RM $lib.exp'
+          $CC -shared$allow_undefined_flag $wl-input $wl$lib.exp $compiler_flags $libobjs $deplibs -soname $soname `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib~$RM $lib.exp'
 
 	# Both c and cxx compiler support -rpath directly
 	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-rpath $libdir'
@@ -5442,24 +5899,24 @@ _LT_EOF
 
     solaris*)
       _LT_TAGVAR(no_undefined_flag, $1)=' -z defs'
-      if test "$GCC" = yes; then
-	wlarc='${wl}'
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag ${wl}-z ${wl}text ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+      if test yes = "$GCC"; then
+	wlarc='$wl'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $wl-z ${wl}text $wl-h $wl$soname -o $lib $libobjs $deplibs $compiler_flags'
 	_LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-	  $CC -shared $pic_flag ${wl}-z ${wl}text ${wl}-M ${wl}$lib.exp ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp'
+          $CC -shared $pic_flag $wl-z ${wl}text $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp'
       else
 	case `$CC -V 2>&1` in
 	*"Compilers 5.0"*)
 	  wlarc=''
-	  _LT_TAGVAR(archive_cmds, $1)='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+	  _LT_TAGVAR(archive_cmds, $1)='$LD -G$allow_undefined_flag -h $soname -o $lib $libobjs $deplibs $linker_flags'
 	  _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-	  $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$RM $lib.exp'
+            $LD -G$allow_undefined_flag -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$RM $lib.exp'
 	  ;;
 	*)
-	  wlarc='${wl}'
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $compiler_flags'
+	  wlarc='$wl'
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -G$allow_undefined_flag -h $soname -o $lib $libobjs $deplibs $compiler_flags'
 	  _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-	  $CC -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp'
+            $CC -G$allow_undefined_flag -M $lib.exp -h $soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp'
 	  ;;
 	esac
       fi
@@ -5469,11 +5926,11 @@ _LT_EOF
       solaris2.[[0-5]] | solaris2.[[0-5]].*) ;;
       *)
 	# The compiler driver will combine and reorder linker options,
-	# but understands `-z linker_flag'.  GCC discards it without `$wl',
+	# but understands '-z linker_flag'.  GCC discards it without '$wl',
 	# but is careful enough not to reorder.
 	# Supported since Solaris 2.6 (maybe 2.5.1?)
-	if test "$GCC" = yes; then
-	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}-z ${wl}allextract$convenience ${wl}-z ${wl}defaultextract'
+	if test yes = "$GCC"; then
+	  _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl-z ${wl}allextract$convenience $wl-z ${wl}defaultextract'
 	else
 	  _LT_TAGVAR(whole_archive_flag_spec, $1)='-z allextract$convenience -z defaultextract'
 	fi
@@ -5483,10 +5940,10 @@ _LT_EOF
       ;;
 
     sunos4*)
-      if test "x$host_vendor" = xsequent; then
+      if test sequent = "$host_vendor"; then
 	# Use $CC to link under sequent, because it throws in some extra .o
 	# files that make .init and .fini sections work.
-	_LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h $soname -o $lib $libobjs $deplibs $compiler_flags'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -G $wl-h $soname -o $lib $libobjs $deplibs $compiler_flags'
       else
 	_LT_TAGVAR(archive_cmds, $1)='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags'
       fi
@@ -5535,43 +5992,43 @@ _LT_EOF
       ;;
 
     sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[[01]].[[10]]* | unixware7* | sco3.2v5.0.[[024]]*)
-      _LT_TAGVAR(no_undefined_flag, $1)='${wl}-z,text'
+      _LT_TAGVAR(no_undefined_flag, $1)='$wl-z,text'
       _LT_TAGVAR(archive_cmds_need_lc, $1)=no
       _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
       runpath_var='LD_RUN_PATH'
 
-      if test "$GCC" = yes; then
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      if test yes = "$GCC"; then
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
       else
-	_LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -G $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
       fi
       ;;
 
     sysv5* | sco3.2v5* | sco5v6*)
-      # Note: We can NOT use -z defs as we might desire, because we do not
+      # Note: We CANNOT use -z defs as we might desire, because we do not
       # link with -lc, and that would cause any symbols used from libc to
       # always be unresolved, which means just about no library would
       # ever link correctly.  If we're not using GNU ld we use -z text
       # though, which does catch some bad symbols but isn't as heavy-handed
       # as -z defs.
-      _LT_TAGVAR(no_undefined_flag, $1)='${wl}-z,text'
-      _LT_TAGVAR(allow_undefined_flag, $1)='${wl}-z,nodefs'
+      _LT_TAGVAR(no_undefined_flag, $1)='$wl-z,text'
+      _LT_TAGVAR(allow_undefined_flag, $1)='$wl-z,nodefs'
       _LT_TAGVAR(archive_cmds_need_lc, $1)=no
       _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R,$libdir'
+      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-R,$libdir'
       _LT_TAGVAR(hardcode_libdir_separator, $1)=':'
       _LT_TAGVAR(link_all_deplibs, $1)=yes
-      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-Bexport'
+      _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-Bexport'
       runpath_var='LD_RUN_PATH'
 
-      if test "$GCC" = yes; then
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+      if test yes = "$GCC"; then
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
       else
-	_LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -G $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
       fi
       ;;
 
@@ -5586,17 +6043,17 @@ _LT_EOF
       ;;
     esac
 
-    if test x$host_vendor = xsni; then
+    if test sni = "$host_vendor"; then
       case $host in
       sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
-	_LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-Blargedynsym'
+	_LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-Blargedynsym'
 	;;
       esac
     fi
   fi
 ])
 AC_MSG_RESULT([$_LT_TAGVAR(ld_shlibs, $1)])
-test "$_LT_TAGVAR(ld_shlibs, $1)" = no && can_build_shared=no
+test no = "$_LT_TAGVAR(ld_shlibs, $1)" && can_build_shared=no
 
 _LT_TAGVAR(with_gnu_ld, $1)=$with_gnu_ld
 
@@ -5613,7 +6070,7 @@ x|xyes)
   # Assume -lc should be added
   _LT_TAGVAR(archive_cmds_need_lc, $1)=yes
 
-  if test "$enable_shared" = yes && test "$GCC" = yes; then
+  if test yes,yes = "$GCC,$enable_shared"; then
     case $_LT_TAGVAR(archive_cmds, $1) in
     *'~'*)
       # FIXME: we may have to deal with multi-command sequences.
@@ -5693,12 +6150,12 @@ _LT_TAGDECL([], [hardcode_libdir_flag_spec], [1],
 _LT_TAGDECL([], [hardcode_libdir_separator], [1],
     [Whether we need a single "-rpath" flag with a separated argument])
 _LT_TAGDECL([], [hardcode_direct], [0],
-    [Set to "yes" if using DIR/libNAME${shared_ext} during linking hardcodes
+    [Set to "yes" if using DIR/libNAME$shared_ext during linking hardcodes
     DIR into the resulting binary])
 _LT_TAGDECL([], [hardcode_direct_absolute], [0],
-    [Set to "yes" if using DIR/libNAME${shared_ext} during linking hardcodes
+    [Set to "yes" if using DIR/libNAME$shared_ext during linking hardcodes
     DIR into the resulting binary and the resulting library dependency is
-    "absolute", i.e impossible to change by setting ${shlibpath_var} if the
+    "absolute", i.e impossible to change by setting $shlibpath_var if the
     library is relocated])
 _LT_TAGDECL([], [hardcode_minus_L], [0],
     [Set to "yes" if using the -LDIR flag during linking hardcodes DIR
@@ -5739,10 +6196,10 @@ dnl    [Compiler flag to generate thread safe objects])
 # ------------------------
 # Ensure that the configuration variables for a C compiler are suitably
 # defined.  These variables are subsequently used by _LT_CONFIG to write
-# the compiler configuration to `libtool'.
+# the compiler configuration to 'libtool'.
 m4_defun([_LT_LANG_C_CONFIG],
 [m4_require([_LT_DECL_EGREP])dnl
-lt_save_CC="$CC"
+lt_save_CC=$CC
 AC_LANG_PUSH(C)
 
 # Source file extension for C test sources.
@@ -5782,18 +6239,18 @@ if test -n "$compiler"; then
   LT_SYS_DLOPEN_SELF
   _LT_CMD_STRIPLIB
 
-  # Report which library types will actually be built
+  # Report what library types will actually be built
   AC_MSG_CHECKING([if libtool supports shared libraries])
   AC_MSG_RESULT([$can_build_shared])
 
   AC_MSG_CHECKING([whether to build shared libraries])
-  test "$can_build_shared" = "no" && enable_shared=no
+  test no = "$can_build_shared" && enable_shared=no
 
   # On AIX, shared libraries and static libraries use the same namespace, and
   # are all built from PIC.
   case $host_os in
   aix3*)
-    test "$enable_shared" = yes && enable_static=no
+    test yes = "$enable_shared" && enable_static=no
     if test -n "$RANLIB"; then
       archive_cmds="$archive_cmds~\$RANLIB \$lib"
       postinstall_cmds='$RANLIB $lib'
@@ -5801,8 +6258,12 @@ if test -n "$compiler"; then
     ;;
 
   aix[[4-9]]*)
-    if test "$host_cpu" != ia64 && test "$aix_use_runtimelinking" = no ; then
-      test "$enable_shared" = yes && enable_static=no
+    if test ia64 != "$host_cpu"; then
+      case $enable_shared,$with_aix_soname,$aix_use_runtimelinking in
+      yes,aix,yes) ;;			# shared object as lib.so file only
+      yes,svr4,*) ;;			# shared object as lib.so archive member only
+      yes,*) enable_static=no ;;	# shared object in lib.a archive as well
+      esac
     fi
     ;;
   esac
@@ -5810,13 +6271,13 @@ if test -n "$compiler"; then
 
   AC_MSG_CHECKING([whether to build static libraries])
   # Make sure either enable_shared or enable_static is yes.
-  test "$enable_shared" = yes || enable_static=yes
+  test yes = "$enable_shared" || enable_static=yes
   AC_MSG_RESULT([$enable_static])
 
   _LT_CONFIG($1)
 fi
 AC_LANG_POP
-CC="$lt_save_CC"
+CC=$lt_save_CC
 ])# _LT_LANG_C_CONFIG
 
 
@@ -5824,14 +6285,14 @@ CC="$lt_save_CC"
 # --------------------------
 # Ensure that the configuration variables for a C++ compiler are suitably
 # defined.  These variables are subsequently used by _LT_CONFIG to write
-# the compiler configuration to `libtool'.
+# the compiler configuration to 'libtool'.
 m4_defun([_LT_LANG_CXX_CONFIG],
 [m4_require([_LT_FILEUTILS_DEFAULTS])dnl
 m4_require([_LT_DECL_EGREP])dnl
 m4_require([_LT_PATH_MANIFEST_TOOL])dnl
-if test -n "$CXX" && ( test "X$CXX" != "Xno" &&
-    ( (test "X$CXX" = "Xg++" && `g++ -v >/dev/null 2>&1` ) ||
-    (test "X$CXX" != "Xg++"))) ; then
+if test -n "$CXX" && ( test no != "$CXX" &&
+    ( (test g++ = "$CXX" && `g++ -v >/dev/null 2>&1` ) ||
+    (test g++ != "$CXX"))); then
   AC_PROG_CXXCPP
 else
   _lt_caught_CXX_error=yes
@@ -5873,7 +6334,7 @@ _LT_TAGVAR(objext, $1)=$objext
 # the CXX compiler isn't working.  Some variables (like enable_shared)
 # are currently assumed to apply to all compilers on this platform,
 # and will be corrupted by setting them based on a non-working compiler.
-if test "$_lt_caught_CXX_error" != yes; then
+if test yes != "$_lt_caught_CXX_error"; then
   # Code to be used in simple compile tests
   lt_simple_compile_test_code="int some_variable = 0;"
 
@@ -5915,35 +6376,35 @@ if test "$_lt_caught_CXX_error" != yes; then
   if test -n "$compiler"; then
     # We don't want -fno-exception when compiling C++ code, so set the
     # no_builtin_flag separately
-    if test "$GXX" = yes; then
+    if test yes = "$GXX"; then
       _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=' -fno-builtin'
     else
       _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=
     fi
 
-    if test "$GXX" = yes; then
+    if test yes = "$GXX"; then
       # Set up default GNU C++ configuration
 
       LT_PATH_LD
 
       # Check if GNU C++ uses GNU ld as the underlying linker, since the
       # archiving commands below assume that GNU ld is being used.
-      if test "$with_gnu_ld" = yes; then
-        _LT_TAGVAR(archive_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
-        _LT_TAGVAR(archive_expsym_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+      if test yes = "$with_gnu_ld"; then
+        _LT_TAGVAR(archive_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib'
+        _LT_TAGVAR(archive_expsym_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib'
 
-        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-        _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
+        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir'
+        _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic'
 
         # If archive_cmds runs LD, not CC, wlarc should be empty
         # XXX I think wlarc can be eliminated in ltcf-cxx, but I need to
         #     investigate it a little bit more. (MM)
-        wlarc='${wl}'
+        wlarc='$wl'
 
         # ancient GNU ld didn't support --whole-archive et. al.
         if eval "`$CC -print-prog-name=ld` --help 2>&1" |
 	  $GREP 'no-whole-archive' > /dev/null; then
-          _LT_TAGVAR(whole_archive_flag_spec, $1)="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+          _LT_TAGVAR(whole_archive_flag_spec, $1)=$wlarc'--whole-archive$convenience '$wlarc'--no-whole-archive'
         else
           _LT_TAGVAR(whole_archive_flag_spec, $1)=
         fi
@@ -5979,18 +6440,30 @@ if test "$_lt_caught_CXX_error" != yes; then
         _LT_TAGVAR(ld_shlibs, $1)=no
         ;;
       aix[[4-9]]*)
-        if test "$host_cpu" = ia64; then
+        if test ia64 = "$host_cpu"; then
           # On IA64, the linker does run time linking by default, so we don't
           # have to do anything special.
           aix_use_runtimelinking=no
           exp_sym_flag='-Bexport'
-          no_entry_flag=""
+          no_entry_flag=
         else
           aix_use_runtimelinking=no
 
           # Test if we are trying to use run time linking or normal
           # AIX style linking. If -brtl is somewhere in LDFLAGS, we
-          # need to do runtime linking.
+          # have runtime linking enabled, and use it for executables.
+          # For shared libraries, we enable/disable runtime linking
+          # depending on the kind of the shared library created -
+          # when "with_aix_soname,aix_use_runtimelinking" is:
+          # "aix,no"   lib.a(lib.so.V) shared, rtl:no,  for executables
+          # "aix,yes"  lib.so          shared, rtl:yes, for executables
+          #            lib.a           static archive
+          # "both,no"  lib.so.V(shr.o) shared, rtl:yes
+          #            lib.a(lib.so.V) shared, rtl:no,  for executables
+          # "both,yes" lib.so.V(shr.o) shared, rtl:yes, for executables
+          #            lib.a(lib.so.V) shared, rtl:no
+          # "svr4,*"   lib.so.V(shr.o) shared, rtl:yes, for executables
+          #            lib.a           static archive
           case $host_os in aix4.[[23]]|aix4.[[23]].*|aix[[5-9]]*)
 	    for ld_flag in $LDFLAGS; do
 	      case $ld_flag in
@@ -6000,6 +6473,13 @@ if test "$_lt_caught_CXX_error" != yes; then
 	        ;;
 	      esac
 	    done
+	    if test svr4,no = "$with_aix_soname,$aix_use_runtimelinking"; then
+	      # With aix-soname=svr4, we create the lib.so.V shared archives only,
+	      # so we don't have lib.a shared libs to link our executables.
+	      # We have to force runtime linking in this case.
+	      aix_use_runtimelinking=yes
+	      LDFLAGS="$LDFLAGS -Wl,-brtl"
+	    fi
 	    ;;
           esac
 
@@ -6018,13 +6498,21 @@ if test "$_lt_caught_CXX_error" != yes; then
         _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
         _LT_TAGVAR(hardcode_libdir_separator, $1)=':'
         _LT_TAGVAR(link_all_deplibs, $1)=yes
-        _LT_TAGVAR(file_list_spec, $1)='${wl}-f,'
+        _LT_TAGVAR(file_list_spec, $1)='$wl-f,'
+        case $with_aix_soname,$aix_use_runtimelinking in
+        aix,*) ;;	# no import file
+        svr4,* | *,yes) # use import file
+          # The Import File defines what to hardcode.
+          _LT_TAGVAR(hardcode_direct, $1)=no
+          _LT_TAGVAR(hardcode_direct_absolute, $1)=no
+          ;;
+        esac
 
-        if test "$GXX" = yes; then
+        if test yes = "$GXX"; then
           case $host_os in aix4.[[012]]|aix4.[[012]].*)
           # We only want to do this on AIX 4.2 and lower, the check
           # below for broken collect2 doesn't work under 4.3+
-	  collect2name=`${CC} -print-prog-name=collect2`
+	  collect2name=`$CC -print-prog-name=collect2`
 	  if test -f "$collect2name" &&
 	     strings "$collect2name" | $GREP resolve_lib_name >/dev/null
 	  then
@@ -6042,64 +6530,84 @@ if test "$_lt_caught_CXX_error" != yes; then
 	  fi
           esac
           shared_flag='-shared'
-	  if test "$aix_use_runtimelinking" = yes; then
-	    shared_flag="$shared_flag "'${wl}-G'
+	  if test yes = "$aix_use_runtimelinking"; then
+	    shared_flag=$shared_flag' $wl-G'
 	  fi
+	  # Need to ensure runtime linking is disabled for the traditional
+	  # shared library, or the linker may eventually find shared libraries
+	  # /with/ Import File - we do not want to mix them.
+	  shared_flag_aix='-shared'
+	  shared_flag_svr4='-shared $wl-G'
         else
           # not using gcc
-          if test "$host_cpu" = ia64; then
+          if test ia64 = "$host_cpu"; then
 	  # VisualAge C++, Version 5.5 for AIX 5L for IA-64, Beta 3 Release
 	  # chokes on -Wl,-G. The following line is correct:
 	  shared_flag='-G'
           else
-	    if test "$aix_use_runtimelinking" = yes; then
-	      shared_flag='${wl}-G'
+	    if test yes = "$aix_use_runtimelinking"; then
+	      shared_flag='$wl-G'
 	    else
-	      shared_flag='${wl}-bM:SRE'
+	      shared_flag='$wl-bM:SRE'
 	    fi
+	    shared_flag_aix='$wl-bM:SRE'
+	    shared_flag_svr4='$wl-G'
           fi
         fi
 
-        _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-bexpall'
+        _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-bexpall'
         # It seems that -bexpall does not export symbols beginning with
         # underscore (_), so it is better to generate a list of symbols to
 	# export.
         _LT_TAGVAR(always_export_symbols, $1)=yes
-        if test "$aix_use_runtimelinking" = yes; then
+	if test aix,yes = "$with_aix_soname,$aix_use_runtimelinking"; then
           # Warning - without using the other runtime loading flags (-brtl),
           # -berok will link without error, but may produce a broken library.
-          _LT_TAGVAR(allow_undefined_flag, $1)='-berok'
+          # The "-G" linker flag allows undefined symbols.
+          _LT_TAGVAR(no_undefined_flag, $1)='-bernotok'
           # Determine the default libpath from the value encoded in an empty
           # executable.
           _LT_SYS_MODULE_PATH_AIX([$1])
-          _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-blibpath:$libdir:'"$aix_libpath"
+          _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-blibpath:$libdir:'"$aix_libpath"
 
-          _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags `if test "x${allow_undefined_flag}" != "x"; then func_echo_all "${wl}${allow_undefined_flag}"; else :; fi` '"\${wl}$exp_sym_flag:\$export_symbols $shared_flag"
+          _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $deplibs $wl'$no_entry_flag' $compiler_flags `if test -n "$allow_undefined_flag"; then func_echo_all "$wl$allow_undefined_flag"; else :; fi` $wl'$exp_sym_flag:\$export_symbols' '$shared_flag
         else
-          if test "$host_cpu" = ia64; then
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R $libdir:/usr/lib:/lib'
+          if test ia64 = "$host_cpu"; then
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-R $libdir:/usr/lib:/lib'
 	    _LT_TAGVAR(allow_undefined_flag, $1)="-z nodefs"
-	    _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$exp_sym_flag:\$export_symbols"
+	    _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\$wl$no_entry_flag"' $compiler_flags $wl$allow_undefined_flag '"\$wl$exp_sym_flag:\$export_symbols"
           else
 	    # Determine the default libpath from the value encoded in an
 	    # empty executable.
 	    _LT_SYS_MODULE_PATH_AIX([$1])
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-blibpath:$libdir:'"$aix_libpath"
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-blibpath:$libdir:'"$aix_libpath"
 	    # Warning - without using the other run time loading flags,
 	    # -berok will link without error, but may produce a broken library.
-	    _LT_TAGVAR(no_undefined_flag, $1)=' ${wl}-bernotok'
-	    _LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-berok'
-	    if test "$with_gnu_ld" = yes; then
+	    _LT_TAGVAR(no_undefined_flag, $1)=' $wl-bernotok'
+	    _LT_TAGVAR(allow_undefined_flag, $1)=' $wl-berok'
+	    if test yes = "$with_gnu_ld"; then
 	      # We only use this code for GNU lds that support --whole-archive.
-	      _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
+	      _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive$convenience $wl--no-whole-archive'
 	    else
 	      # Exported symbols can be pulled into shared objects from archives
 	      _LT_TAGVAR(whole_archive_flag_spec, $1)='$convenience'
 	    fi
 	    _LT_TAGVAR(archive_cmds_need_lc, $1)=yes
-	    # This is similar to how AIX traditionally builds its shared
-	    # libraries.
-	    _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs ${wl}-bnoentry $compiler_flags ${wl}-bE:$export_symbols${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='$RM -r $output_objdir/$realname.d~$MKDIR $output_objdir/$realname.d'
+	    # -brtl affects multiple linker settings, -berok does not and is overridden later
+	    compiler_flags_filtered='`func_echo_all "$compiler_flags " | $SED -e "s%-brtl\\([[, ]]\\)%-berok\\1%g"`'
+	    if test svr4 != "$with_aix_soname"; then
+	      # This is similar to how AIX traditionally builds its shared
+	      # libraries. Need -bnortl late, we may have -brtl in LDFLAGS.
+	      _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$CC '$shared_flag_aix' -o $output_objdir/$realname.d/$soname $libobjs $deplibs $wl-bnoentry '$compiler_flags_filtered'$wl-bE:$export_symbols$allow_undefined_flag~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$realname.d/$soname'
+	    fi
+	    if test aix != "$with_aix_soname"; then
+	      _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$CC '$shared_flag_svr4' -o $output_objdir/$realname.d/$shared_archive_member_spec.o $libobjs $deplibs $wl-bnoentry '$compiler_flags_filtered'$wl-bE:$export_symbols$allow_undefined_flag~$STRIP -e $output_objdir/$realname.d/$shared_archive_member_spec.o~( func_echo_all "#! $soname($shared_archive_member_spec.o)"; if test shr_64 = "$shared_archive_member_spec"; then func_echo_all "# 64"; else func_echo_all "# 32"; fi; cat $export_symbols ) > $output_objdir/$realname.d/$shared_archive_member_spec.imp~$AR $AR_FLAGS $output_objdir/$soname $output_objdir/$realname.d/$shared_archive_member_spec.o $output_objdir/$realname.d/$shared_archive_member_spec.imp'
+	    else
+	      # used by -dlpreopen to get the symbols
+	      _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$MV  $output_objdir/$realname.d/$soname $output_objdir'
+	    fi
+	    _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$RM -r $output_objdir/$realname.d'
           fi
         fi
         ;;
@@ -6109,7 +6617,7 @@ if test "$_lt_caught_CXX_error" != yes; then
 	  _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
 	  # Joseph Beckenbach <jrb3@best.com> says some releases of gcc
 	  # support --undefined.  This deserves some investigation.  FIXME
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -nostart $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib'
 	else
 	  _LT_TAGVAR(ld_shlibs, $1)=no
 	fi
@@ -6137,57 +6645,58 @@ if test "$_lt_caught_CXX_error" != yes; then
 	  # Tell ltmain to make .lib files, not .a files.
 	  libext=lib
 	  # Tell ltmain to make .dll files, not .so files.
-	  shrext_cmds=".dll"
+	  shrext_cmds=.dll
 	  # FIXME: Setting linknames here is a bad hack.
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-dll~linknames='
-	  _LT_TAGVAR(archive_expsym_cmds, $1)='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
-	      $SED -n -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' -e '1\\\!p' < $export_symbols > $output_objdir/$soname.exp;
-	    else
-	      $SED -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' < $export_symbols > $output_objdir/$soname.exp;
-	    fi~
-	    $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~
-	    linknames='
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~linknames='
+	  _LT_TAGVAR(archive_expsym_cmds, $1)='if _LT_DLL_DEF_P([$export_symbols]); then
+              cp "$export_symbols" "$output_objdir/$soname.def";
+              echo "$tool_output_objdir$soname.def" > "$output_objdir/$soname.exp";
+            else
+              $SED -e '\''s/^/-link -EXPORT:/'\'' < $export_symbols > $output_objdir/$soname.exp;
+            fi~
+            $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~
+            linknames='
 	  # The linker will not automatically build a static lib if we build a DLL.
 	  # _LT_TAGVAR(old_archive_from_new_cmds, $1)='true'
 	  _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
 	  # Don't use ranlib
 	  _LT_TAGVAR(old_postinstall_cmds, $1)='chmod 644 $oldlib'
 	  _LT_TAGVAR(postlink_cmds, $1)='lt_outputfile="@OUTPUT@"~
-	    lt_tool_outputfile="@TOOL_OUTPUT@"~
-	    case $lt_outputfile in
-	      *.exe|*.EXE) ;;
-	      *)
-		lt_outputfile="$lt_outputfile.exe"
-		lt_tool_outputfile="$lt_tool_outputfile.exe"
-		;;
-	    esac~
-	    func_to_tool_file "$lt_outputfile"~
-	    if test "$MANIFEST_TOOL" != ":" && test -f "$lt_outputfile.manifest"; then
-	      $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1;
-	      $RM "$lt_outputfile.manifest";
-	    fi'
+            lt_tool_outputfile="@TOOL_OUTPUT@"~
+            case $lt_outputfile in
+              *.exe|*.EXE) ;;
+              *)
+                lt_outputfile=$lt_outputfile.exe
+                lt_tool_outputfile=$lt_tool_outputfile.exe
+                ;;
+            esac~
+            func_to_tool_file "$lt_outputfile"~
+            if test : != "$MANIFEST_TOOL" && test -f "$lt_outputfile.manifest"; then
+              $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1;
+              $RM "$lt_outputfile.manifest";
+            fi'
 	  ;;
 	*)
 	  # g++
 	  # _LT_TAGVAR(hardcode_libdir_flag_spec, $1) is actually meaningless,
 	  # as there is no search path for DLLs.
 	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
-	  _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-all-symbols'
+	  _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-all-symbols'
 	  _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
 	  _LT_TAGVAR(always_export_symbols, $1)=no
 	  _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
 
 	  if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
-	    # If the export-symbols file already is a .def file (1st line
-	    # is EXPORTS), use it as is; otherwise, prepend...
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
-	      cp $export_symbols $output_objdir/$soname.def;
-	    else
-	      echo EXPORTS > $output_objdir/$soname.def;
-	      cat $export_symbols >> $output_objdir/$soname.def;
-	    fi~
-	    $CC -shared -nostdlib $output_objdir/$soname.def $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
+	    # If the export-symbols file already is a .def file, use it as
+	    # is; otherwise, prepend EXPORTS...
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='if _LT_DLL_DEF_P([$export_symbols]); then
+              cp $export_symbols $output_objdir/$soname.def;
+            else
+              echo EXPORTS > $output_objdir/$soname.def;
+              cat $export_symbols >> $output_objdir/$soname.def;
+            fi~
+            $CC -shared -nostdlib $output_objdir/$soname.def $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
 	  else
 	    _LT_TAGVAR(ld_shlibs, $1)=no
 	  fi
@@ -6198,6 +6707,34 @@ if test "$_lt_caught_CXX_error" != yes; then
         _LT_DARWIN_LINKER_FEATURES($1)
 	;;
 
+      os2*)
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
+	_LT_TAGVAR(hardcode_minus_L, $1)=yes
+	_LT_TAGVAR(allow_undefined_flag, $1)=unsupported
+	shrext_cmds=.dll
+	_LT_TAGVAR(archive_cmds, $1)='$ECHO "LIBRARY ${soname%$shared_ext} INITINSTANCE TERMINSTANCE" > $output_objdir/$libname.def~
+	  $ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~
+	  $ECHO "DATA MULTIPLE NONSHARED" >> $output_objdir/$libname.def~
+	  $ECHO EXPORTS >> $output_objdir/$libname.def~
+	  emxexp $libobjs | $SED /"_DLL_InitTerm"/d >> $output_objdir/$libname.def~
+	  $CC -Zdll -Zcrtdll -o $output_objdir/$soname $libobjs $deplibs $compiler_flags $output_objdir/$libname.def~
+	  emximp -o $lib $output_objdir/$libname.def'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='$ECHO "LIBRARY ${soname%$shared_ext} INITINSTANCE TERMINSTANCE" > $output_objdir/$libname.def~
+	  $ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~
+	  $ECHO "DATA MULTIPLE NONSHARED" >> $output_objdir/$libname.def~
+	  $ECHO EXPORTS >> $output_objdir/$libname.def~
+	  prefix_cmds="$SED"~
+	  if test EXPORTS = "`$SED 1q $export_symbols`"; then
+	    prefix_cmds="$prefix_cmds -e 1d";
+	  fi~
+	  prefix_cmds="$prefix_cmds -e \"s/^\(.*\)$/_\1/g\""~
+	  cat $export_symbols | $prefix_cmds >> $output_objdir/$libname.def~
+	  $CC -Zdll -Zcrtdll -o $output_objdir/$soname $libobjs $deplibs $compiler_flags $output_objdir/$libname.def~
+	  emximp -o $lib $output_objdir/$libname.def'
+	_LT_TAGVAR(old_archive_from_new_cmds, $1)='emximp -o $output_objdir/${libname}_dll.a $output_objdir/$libname.def'
+	_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
+	;;
+
       dgux*)
         case $cc_basename in
           ec++*)
@@ -6232,18 +6769,15 @@ if test "$_lt_caught_CXX_error" != yes; then
         _LT_TAGVAR(ld_shlibs, $1)=yes
         ;;
 
-      gnu*)
-        ;;
-
       haiku*)
-        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib'
         _LT_TAGVAR(link_all_deplibs, $1)=yes
         ;;
 
       hpux9*)
-        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
+        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl+b $wl$libdir'
         _LT_TAGVAR(hardcode_libdir_separator, $1)=:
-        _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+        _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E'
         _LT_TAGVAR(hardcode_direct, $1)=yes
         _LT_TAGVAR(hardcode_minus_L, $1)=yes # Not in the search PATH,
 				             # but as the default
@@ -6255,7 +6789,7 @@ if test "$_lt_caught_CXX_error" != yes; then
             _LT_TAGVAR(ld_shlibs, $1)=no
             ;;
           aCC*)
-            _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -b ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+            _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -b $wl+b $wl$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test "x$output_objdir/$soname" = "x$lib" || mv $output_objdir/$soname $lib'
             # Commands to make compiler produce verbose output that lists
             # what "hidden" libraries, object files and flags are used when
             # linking a shared library.
@@ -6264,11 +6798,11 @@ if test "$_lt_caught_CXX_error" != yes; then
             # explicitly linking system object files so we need to strip them
             # from the output so that they don't get included in the library
             # dependencies.
-            output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $EGREP "\-L"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
+            output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $EGREP "\-L"`; list= ; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
             ;;
           *)
-            if test "$GXX" = yes; then
-              _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared -nostdlib $pic_flag ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
+            if test yes = "$GXX"; then
+              _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared -nostdlib $pic_flag $wl+b $wl$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test "x$output_objdir/$soname" = "x$lib" || mv $output_objdir/$soname $lib'
             else
               # FIXME: insert proper C++ library support
               _LT_TAGVAR(ld_shlibs, $1)=no
@@ -6278,15 +6812,15 @@ if test "$_lt_caught_CXX_error" != yes; then
         ;;
 
       hpux10*|hpux11*)
-        if test $with_gnu_ld = no; then
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
+        if test no = "$with_gnu_ld"; then
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl+b $wl$libdir'
 	  _LT_TAGVAR(hardcode_libdir_separator, $1)=:
 
           case $host_cpu in
             hppa*64*|ia64*)
               ;;
             *)
-	      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+	      _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E'
               ;;
           esac
         fi
@@ -6312,13 +6846,13 @@ if test "$_lt_caught_CXX_error" != yes; then
           aCC*)
 	    case $host_cpu in
 	      hppa*64*)
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
 	        ;;
 	      ia64*)
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname $wl+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
 	        ;;
 	      *)
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
 	        ;;
 	    esac
 	    # Commands to make compiler produce verbose output that lists
@@ -6329,20 +6863,20 @@ if test "$_lt_caught_CXX_error" != yes; then
 	    # explicitly linking system object files so we need to strip them
 	    # from the output so that they don't get included in the library
 	    # dependencies.
-	    output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $GREP "\-L"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
+	    output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $GREP "\-L"`; list= ; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
 	    ;;
           *)
-	    if test "$GXX" = yes; then
-	      if test $with_gnu_ld = no; then
+	    if test yes = "$GXX"; then
+	      if test no = "$with_gnu_ld"; then
 	        case $host_cpu in
 	          hppa*64*)
-	            _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib -fPIC ${wl}+h ${wl}$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	            _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib -fPIC $wl+h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
 	            ;;
 	          ia64*)
-	            _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	            _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag $wl+h $wl$soname $wl+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
 	            ;;
 	          *)
-	            _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	            _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
 	            ;;
 	        esac
 	      fi
@@ -6357,22 +6891,22 @@ if test "$_lt_caught_CXX_error" != yes; then
       interix[[3-9]]*)
 	_LT_TAGVAR(hardcode_direct, $1)=no
 	_LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-	_LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir'
+	_LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E'
 	# Hack: On Interix 3.x, we cannot compile PIC because of a broken gcc.
 	# Instead, shared libraries are loaded at an image base (0x10000000 by
 	# default) and relocated if they conflict, which is a slow very memory
 	# consuming and fragmenting process.  To avoid this, we pick a random,
 	# 256 KiB-aligned image base between 0x50000000 and 0x6FFC0000 at link
 	# time.  Moving up from 0x10000000 also allows more sbrk(2) space.
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='sed "s,^,_," $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--retain-symbols-file,$output_objdir/$soname.expsym ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-h,$soname $wl--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
+	_LT_TAGVAR(archive_expsym_cmds, $1)='sed "s|^|_|" $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-h,$soname $wl--retain-symbols-file,$output_objdir/$soname.expsym $wl--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
 	;;
       irix5* | irix6*)
         case $cc_basename in
           CC*)
 	    # SGI C++
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared -all -multigot $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared -all -multigot $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib'
 
 	    # Archives containing C++ object files must be created using
 	    # "CC -ar", where "CC" is the IRIX C++ compiler.  This is
@@ -6381,22 +6915,22 @@ if test "$_lt_caught_CXX_error" != yes; then
 	    _LT_TAGVAR(old_archive_cmds, $1)='$CC -ar -WR,-u -o $oldlib $oldobjs'
 	    ;;
           *)
-	    if test "$GXX" = yes; then
-	      if test "$with_gnu_ld" = no; then
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+	    if test yes = "$GXX"; then
+	      if test no = "$with_gnu_ld"; then
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib'
 	      else
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` -o $lib'
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` -o $lib'
 	      fi
 	    fi
 	    _LT_TAGVAR(link_all_deplibs, $1)=yes
 	    ;;
         esac
-        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir'
         _LT_TAGVAR(hardcode_libdir_separator, $1)=:
         _LT_TAGVAR(inherit_rpath, $1)=yes
         ;;
 
-      linux* | k*bsd*-gnu | kopensolaris*-gnu)
+      linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
         case $cc_basename in
           KCC*)
 	    # Kuck and Associates, Inc. (KAI) C++ Compiler
@@ -6404,8 +6938,8 @@ if test "$_lt_caught_CXX_error" != yes; then
 	    # KCC will only create a shared library if the output file
 	    # ends with ".so" (or ".sl" for HP-UX), so rename the library
 	    # to its proper name (with version) after linking.
-	    _LT_TAGVAR(archive_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib ${wl}-retain-symbols-file,$export_symbols; mv \$templib $lib'
+	    _LT_TAGVAR(archive_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\$tempext\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\$tempext\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib $wl-retain-symbols-file,$export_symbols; mv \$templib $lib'
 	    # Commands to make compiler produce verbose output that lists
 	    # what "hidden" libraries, object files and flags are used when
 	    # linking a shared library.
@@ -6414,10 +6948,10 @@ if test "$_lt_caught_CXX_error" != yes; then
 	    # explicitly linking system object files so we need to strip them
 	    # from the output so that they don't get included in the library
 	    # dependencies.
-	    output_verbose_link_cmd='templist=`$CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1 | $GREP "ld"`; rm -f libconftest$shared_ext; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
+	    output_verbose_link_cmd='templist=`$CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1 | $GREP "ld"`; rm -f libconftest$shared_ext; list= ; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
 
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir'
+	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic'
 
 	    # Archives containing C++ object files must be created using
 	    # "CC -Bstatic", where "CC" is the KAI C++ compiler.
@@ -6431,59 +6965,59 @@ if test "$_lt_caught_CXX_error" != yes; then
 	    # earlier do not add the objects themselves.
 	    case `$CC -V 2>&1` in
 	      *"Version 7."*)
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
-		_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib'
+		_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib'
 		;;
 	      *)  # Version 8.0 or newer
 	        tmp_idyn=
 	        case $host_cpu in
 		  ia64*) tmp_idyn=' -i_dynamic';;
 		esac
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-		_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib'
+		_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib'
 		;;
 	    esac
 	    _LT_TAGVAR(archive_cmds_need_lc, $1)=no
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
-	    _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir'
+	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic'
+	    _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive$convenience $wl--no-whole-archive'
 	    ;;
           pgCC* | pgcpp*)
             # Portland Group C++ compiler
 	    case `$CC -V` in
 	    *pgCC\ [[1-5]].* | *pgcpp\ [[1-5]].*)
 	      _LT_TAGVAR(prelink_cmds, $1)='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $objs $libobjs $compile_deplibs~
-		compile_command="$compile_command `find $tpldir -name \*.o | sort | $NL2SP`"'
+               rm -rf $tpldir~
+               $CC --prelink_objects --instantiation_dir $tpldir $objs $libobjs $compile_deplibs~
+               compile_command="$compile_command `find $tpldir -name \*.o | sort | $NL2SP`"'
 	      _LT_TAGVAR(old_archive_cmds, $1)='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $oldobjs$old_deplibs~
-		$AR $AR_FLAGS $oldlib$oldobjs$old_deplibs `find $tpldir -name \*.o | sort | $NL2SP`~
-		$RANLIB $oldlib'
+                rm -rf $tpldir~
+                $CC --prelink_objects --instantiation_dir $tpldir $oldobjs$old_deplibs~
+                $AR $AR_FLAGS $oldlib$oldobjs$old_deplibs `find $tpldir -name \*.o | sort | $NL2SP`~
+                $RANLIB $oldlib'
 	      _LT_TAGVAR(archive_cmds, $1)='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
-		$CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname -o $lib'
+                rm -rf $tpldir~
+                $CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
+                $CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib'
 	      _LT_TAGVAR(archive_expsym_cmds, $1)='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
-		$CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname ${wl}-retain-symbols-file ${wl}$export_symbols -o $lib'
+                rm -rf $tpldir~
+                $CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
+                $CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib'
 	      ;;
 	    *) # Version 6 and above use weak symbols
-	      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname -o $lib'
-	      _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname ${wl}-retain-symbols-file ${wl}$export_symbols -o $lib'
+	      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib'
+	      _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib'
 	      ;;
 	    esac
 
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}--rpath ${wl}$libdir'
-	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
-	    _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl--rpath $wl$libdir'
+	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic'
+	    _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive'
             ;;
 	  cxx*)
 	    # Compaq C++
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname  -o $lib ${wl}-retain-symbols-file $wl$export_symbols'
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib'
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname  -o $lib $wl-retain-symbols-file $wl$export_symbols'
 
 	    runpath_var=LD_RUN_PATH
 	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-rpath $libdir'
@@ -6497,18 +7031,18 @@ if test "$_lt_caught_CXX_error" != yes; then
 	    # explicitly linking system object files so we need to strip them
 	    # from the output so that they don't get included in the library
 	    # dependencies.
-	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld .*$\)/\1/"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "X$list" | $Xsed'
+	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld .*$\)/\1/"`; list= ; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "X$list" | $Xsed'
 	    ;;
 	  xl* | mpixl* | bgxl*)
 	    # IBM XL 8.0 on PPC, with GNU ld
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -qmkshrobj $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	    if test "x$supports_anon_versioning" = xyes; then
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir'
+	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic'
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -qmkshrobj $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib'
+	    if test yes = "$supports_anon_versioning"; then
 	      _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $output_objdir/$libname.ver~
-		cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
-		echo "local: *; };" >> $output_objdir/$libname.ver~
-		$CC -qmkshrobj $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-version-script ${wl}$output_objdir/$libname.ver -o $lib'
+                cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
+                echo "local: *; };" >> $output_objdir/$libname.ver~
+                $CC -qmkshrobj $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-version-script $wl$output_objdir/$libname.ver -o $lib'
 	    fi
 	    ;;
 	  *)
@@ -6516,10 +7050,10 @@ if test "$_lt_caught_CXX_error" != yes; then
 	    *Sun\ C*)
 	      # Sun C++ 5.9
 	      _LT_TAGVAR(no_undefined_flag, $1)=' -zdefs'
-	      _LT_TAGVAR(archive_cmds, $1)='$CC -G${allow_undefined_flag} -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	      _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G${allow_undefined_flag} -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-retain-symbols-file ${wl}$export_symbols'
+	      _LT_TAGVAR(archive_cmds, $1)='$CC -G$allow_undefined_flag -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	      _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G$allow_undefined_flag -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-retain-symbols-file $wl$export_symbols'
 	      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
-	      _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
+	      _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive'
 	      _LT_TAGVAR(compiler_needs_object, $1)=yes
 
 	      # Not sure whether something based on
@@ -6577,22 +7111,17 @@ if test "$_lt_caught_CXX_error" != yes; then
         _LT_TAGVAR(ld_shlibs, $1)=yes
 	;;
 
-      openbsd2*)
-        # C++ shared libraries are fairly broken
-	_LT_TAGVAR(ld_shlibs, $1)=no
-	;;
-
-      openbsd*)
+      openbsd* | bitrig*)
 	if test -f /usr/libexec/ld.so; then
 	  _LT_TAGVAR(hardcode_direct, $1)=yes
 	  _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
 	  _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
 	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib'
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-	  if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-retain-symbols-file,$export_symbols -o $lib'
-	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
-	    _LT_TAGVAR(whole_archive_flag_spec, $1)="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir'
+	  if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`"; then
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-retain-symbols-file,$export_symbols -o $lib'
+	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E'
+	    _LT_TAGVAR(whole_archive_flag_spec, $1)=$wlarc'--whole-archive$convenience '$wlarc'--no-whole-archive'
 	  fi
 	  output_verbose_link_cmd=func_echo_all
 	else
@@ -6608,9 +7137,9 @@ if test "$_lt_caught_CXX_error" != yes; then
 	    # KCC will only create a shared library if the output file
 	    # ends with ".so" (or ".sl" for HP-UX), so rename the library
 	    # to its proper name (with version) after linking.
-	    _LT_TAGVAR(archive_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo "$lib" | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
+	    _LT_TAGVAR(archive_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo "$lib" | $SED -e "s/\$tempext\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
 
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
+	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir'
 	    _LT_TAGVAR(hardcode_libdir_separator, $1)=:
 
 	    # Archives containing C++ object files must be created using
@@ -6628,17 +7157,17 @@ if test "$_lt_caught_CXX_error" != yes; then
           cxx*)
 	    case $host in
 	      osf3*)
-	        _LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-expect_unresolved ${wl}\*'
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $soname `test -n "$verstring" && func_echo_all "${wl}-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-	        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+	        _LT_TAGVAR(allow_undefined_flag, $1)=' $wl-expect_unresolved $wl\*'
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared$allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $soname `test -n "$verstring" && func_echo_all "$wl-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib'
+	        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir'
 		;;
 	      *)
 	        _LT_TAGVAR(allow_undefined_flag, $1)=' -expect_unresolved \*'
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared$allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib'
 	        _LT_TAGVAR(archive_expsym_cmds, $1)='for i in `cat $export_symbols`; do printf "%s %s\\n" -exported_symbol "\$i" >> $lib.exp; done~
-	          echo "-hidden">> $lib.exp~
-	          $CC -shared$allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname ${wl}-input ${wl}$lib.exp  `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib~
-	          $RM $lib.exp'
+                  echo "-hidden">> $lib.exp~
+                  $CC -shared$allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname $wl-input $wl$lib.exp  `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib~
+                  $RM $lib.exp'
 	        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-rpath $libdir'
 		;;
 	    esac
@@ -6653,21 +7182,21 @@ if test "$_lt_caught_CXX_error" != yes; then
 	    # explicitly linking system object files so we need to strip them
 	    # from the output so that they don't get included in the library
 	    # dependencies.
-	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld" | $GREP -v "ld:"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld.*$\)/\1/"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
+	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld" | $GREP -v "ld:"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld.*$\)/\1/"`; list= ; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
 	    ;;
 	  *)
-	    if test "$GXX" = yes && test "$with_gnu_ld" = no; then
-	      _LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-expect_unresolved ${wl}\*'
+	    if test yes,no = "$GXX,$with_gnu_ld"; then
+	      _LT_TAGVAR(allow_undefined_flag, $1)=' $wl-expect_unresolved $wl\*'
 	      case $host in
 	        osf3*)
-	          _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib ${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+	          _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib'
 		  ;;
 	        *)
-	          _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib ${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+	          _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-msym $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib'
 		  ;;
 	      esac
 
-	      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
+	      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir'
 	      _LT_TAGVAR(hardcode_libdir_separator, $1)=:
 
 	      # Commands to make compiler produce verbose output that lists
@@ -6713,9 +7242,9 @@ if test "$_lt_caught_CXX_error" != yes; then
 	    # Sun C++ 4.2, 5.x and Centerline C++
             _LT_TAGVAR(archive_cmds_need_lc,$1)=yes
 	    _LT_TAGVAR(no_undefined_flag, $1)=' -zdefs'
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -G${allow_undefined_flag}  -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -G$allow_undefined_flag -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
 	    _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-	      $CC -G${allow_undefined_flag} ${wl}-M ${wl}$lib.exp -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
+              $CC -G$allow_undefined_flag $wl-M $wl$lib.exp -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
 
 	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
 	    _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
@@ -6723,7 +7252,7 @@ if test "$_lt_caught_CXX_error" != yes; then
 	      solaris2.[[0-5]] | solaris2.[[0-5]].*) ;;
 	      *)
 		# The compiler driver will combine and reorder linker options,
-		# but understands `-z linker_flag'.
+		# but understands '-z linker_flag'.
 	        # Supported since Solaris 2.6 (maybe 2.5.1?)
 		_LT_TAGVAR(whole_archive_flag_spec, $1)='-z allextract$convenience -z defaultextract'
 	        ;;
@@ -6740,30 +7269,30 @@ if test "$_lt_caught_CXX_error" != yes; then
 	    ;;
           gcx*)
 	    # Green Hills C++ Compiler
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib'
 
 	    # The C++ compiler must be used to create the archive.
 	    _LT_TAGVAR(old_archive_cmds, $1)='$CC $LDFLAGS -archive -o $oldlib $oldobjs'
 	    ;;
           *)
 	    # GNU C++ compiler with Solaris linker
-	    if test "$GXX" = yes && test "$with_gnu_ld" = no; then
-	      _LT_TAGVAR(no_undefined_flag, $1)=' ${wl}-z ${wl}defs'
+	    if test yes,no = "$GXX,$with_gnu_ld"; then
+	      _LT_TAGVAR(no_undefined_flag, $1)=' $wl-z ${wl}defs'
 	      if $CC --version | $GREP -v '^2\.7' > /dev/null; then
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $LDFLAGS $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib'
 	        _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-		  $CC -shared $pic_flag -nostdlib ${wl}-M $wl$lib.exp -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
+                  $CC -shared $pic_flag -nostdlib $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
 
 	        # Commands to make compiler produce verbose output that lists
 	        # what "hidden" libraries, object files and flags are used when
 	        # linking a shared library.
 	        output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
 	      else
-	        # g++ 2.7 appears to require `-G' NOT `-shared' on this
+	        # g++ 2.7 appears to require '-G' NOT '-shared' on this
 	        # platform.
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -G -nostdlib $LDFLAGS $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
+	        _LT_TAGVAR(archive_cmds, $1)='$CC -G -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib'
 	        _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-		  $CC -G -nostdlib ${wl}-M $wl$lib.exp -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
+                  $CC -G -nostdlib $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
 
 	        # Commands to make compiler produce verbose output that lists
 	        # what "hidden" libraries, object files and flags are used when
@@ -6771,11 +7300,11 @@ if test "$_lt_caught_CXX_error" != yes; then
 	        output_verbose_link_cmd='$CC -G $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
 	      fi
 
-	      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R $wl$libdir'
+	      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-R $wl$libdir'
 	      case $host_os in
 		solaris2.[[0-5]] | solaris2.[[0-5]].*) ;;
 		*)
-		  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}-z ${wl}allextract$convenience ${wl}-z ${wl}defaultextract'
+		  _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl-z ${wl}allextract$convenience $wl-z ${wl}defaultextract'
 		  ;;
 	      esac
 	    fi
@@ -6784,52 +7313,52 @@ if test "$_lt_caught_CXX_error" != yes; then
         ;;
 
     sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[[01]].[[10]]* | unixware7* | sco3.2v5.0.[[024]]*)
-      _LT_TAGVAR(no_undefined_flag, $1)='${wl}-z,text'
+      _LT_TAGVAR(no_undefined_flag, $1)='$wl-z,text'
       _LT_TAGVAR(archive_cmds_need_lc, $1)=no
       _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
       runpath_var='LD_RUN_PATH'
 
       case $cc_basename in
         CC*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -G $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
 	  ;;
 	*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	  _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
 	  ;;
       esac
       ;;
 
       sysv5* | sco3.2v5* | sco5v6*)
-	# Note: We can NOT use -z defs as we might desire, because we do not
+	# Note: We CANNOT use -z defs as we might desire, because we do not
 	# link with -lc, and that would cause any symbols used from libc to
 	# always be unresolved, which means just about no library would
 	# ever link correctly.  If we're not using GNU ld we use -z text
 	# though, which does catch some bad symbols but isn't as heavy-handed
 	# as -z defs.
-	_LT_TAGVAR(no_undefined_flag, $1)='${wl}-z,text'
-	_LT_TAGVAR(allow_undefined_flag, $1)='${wl}-z,nodefs'
+	_LT_TAGVAR(no_undefined_flag, $1)='$wl-z,text'
+	_LT_TAGVAR(allow_undefined_flag, $1)='$wl-z,nodefs'
 	_LT_TAGVAR(archive_cmds_need_lc, $1)=no
 	_LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R,$libdir'
+	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-R,$libdir'
 	_LT_TAGVAR(hardcode_libdir_separator, $1)=':'
 	_LT_TAGVAR(link_all_deplibs, $1)=yes
-	_LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-Bexport'
+	_LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-Bexport'
 	runpath_var='LD_RUN_PATH'
 
 	case $cc_basename in
           CC*)
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -G $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
 	    _LT_TAGVAR(old_archive_cmds, $1)='$CC -Tprelink_objects $oldobjs~
-	      '"$_LT_TAGVAR(old_archive_cmds, $1)"
+              '"$_LT_TAGVAR(old_archive_cmds, $1)"
 	    _LT_TAGVAR(reload_cmds, $1)='$CC -Tprelink_objects $reload_objs~
-	      '"$_LT_TAGVAR(reload_cmds, $1)"
+              '"$_LT_TAGVAR(reload_cmds, $1)"
 	    ;;
 	  *)
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
+	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
 	    ;;
 	esac
       ;;
@@ -6860,10 +7389,10 @@ if test "$_lt_caught_CXX_error" != yes; then
     esac
 
     AC_MSG_RESULT([$_LT_TAGVAR(ld_shlibs, $1)])
-    test "$_LT_TAGVAR(ld_shlibs, $1)" = no && can_build_shared=no
+    test no = "$_LT_TAGVAR(ld_shlibs, $1)" && can_build_shared=no
 
-    _LT_TAGVAR(GCC, $1)="$GXX"
-    _LT_TAGVAR(LD, $1)="$LD"
+    _LT_TAGVAR(GCC, $1)=$GXX
+    _LT_TAGVAR(LD, $1)=$LD
 
     ## CAVEAT EMPTOR:
     ## There is no encapsulation within the following macros, do not change
@@ -6890,7 +7419,7 @@ if test "$_lt_caught_CXX_error" != yes; then
   lt_cv_path_LD=$lt_save_path_LD
   lt_cv_prog_gnu_ldcxx=$lt_cv_prog_gnu_ld
   lt_cv_prog_gnu_ld=$lt_save_with_gnu_ld
-fi # test "$_lt_caught_CXX_error" != yes
+fi # test yes != "$_lt_caught_CXX_error"
 
 AC_LANG_POP
 ])# _LT_LANG_CXX_CONFIG
@@ -6912,13 +7441,14 @@ AC_REQUIRE([_LT_DECL_SED])
 AC_REQUIRE([_LT_PROG_ECHO_BACKSLASH])
 func_stripname_cnf ()
 {
-  case ${2} in
-  .*) func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%\\\\${2}\$%%"`;;
-  *)  func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%${2}\$%%"`;;
+  case @S|@2 in
+  .*) func_stripname_result=`$ECHO "@S|@3" | $SED "s%^@S|@1%%; s%\\\\@S|@2\$%%"`;;
+  *)  func_stripname_result=`$ECHO "@S|@3" | $SED "s%^@S|@1%%; s%@S|@2\$%%"`;;
   esac
 } # func_stripname_cnf
 ])# _LT_FUNC_STRIPNAME_CNF
 
+
 # _LT_SYS_HIDDEN_LIBDEPS([TAGNAME])
 # ---------------------------------
 # Figure out "hidden" library dependencies from verbose
@@ -7002,13 +7532,13 @@ if AC_TRY_EVAL(ac_compile); then
   pre_test_object_deps_done=no
 
   for p in `eval "$output_verbose_link_cmd"`; do
-    case ${prev}${p} in
+    case $prev$p in
 
     -L* | -R* | -l*)
        # Some compilers place space between "-{L,R}" and the path.
        # Remove the space.
-       if test $p = "-L" ||
-          test $p = "-R"; then
+       if test x-L = "x$p" ||
+          test x-R = "x$p"; then
 	 prev=$p
 	 continue
        fi
@@ -7024,16 +7554,16 @@ if AC_TRY_EVAL(ac_compile); then
        case $p in
        =*) func_stripname_cnf '=' '' "$p"; p=$lt_sysroot$func_stripname_result ;;
        esac
-       if test "$pre_test_object_deps_done" = no; then
-	 case ${prev} in
+       if test no = "$pre_test_object_deps_done"; then
+	 case $prev in
 	 -L | -R)
 	   # Internal compiler library paths should come after those
 	   # provided the user.  The postdeps already come after the
 	   # user supplied libs so there is no need to process them.
 	   if test -z "$_LT_TAGVAR(compiler_lib_search_path, $1)"; then
-	     _LT_TAGVAR(compiler_lib_search_path, $1)="${prev}${p}"
+	     _LT_TAGVAR(compiler_lib_search_path, $1)=$prev$p
 	   else
-	     _LT_TAGVAR(compiler_lib_search_path, $1)="${_LT_TAGVAR(compiler_lib_search_path, $1)} ${prev}${p}"
+	     _LT_TAGVAR(compiler_lib_search_path, $1)="${_LT_TAGVAR(compiler_lib_search_path, $1)} $prev$p"
 	   fi
 	   ;;
 	 # The "-l" case would never come before the object being
@@ -7041,9 +7571,9 @@ if AC_TRY_EVAL(ac_compile); then
 	 esac
        else
 	 if test -z "$_LT_TAGVAR(postdeps, $1)"; then
-	   _LT_TAGVAR(postdeps, $1)="${prev}${p}"
+	   _LT_TAGVAR(postdeps, $1)=$prev$p
 	 else
-	   _LT_TAGVAR(postdeps, $1)="${_LT_TAGVAR(postdeps, $1)} ${prev}${p}"
+	   _LT_TAGVAR(postdeps, $1)="${_LT_TAGVAR(postdeps, $1)} $prev$p"
 	 fi
        fi
        prev=
@@ -7058,15 +7588,15 @@ if AC_TRY_EVAL(ac_compile); then
 	 continue
        fi
 
-       if test "$pre_test_object_deps_done" = no; then
+       if test no = "$pre_test_object_deps_done"; then
 	 if test -z "$_LT_TAGVAR(predep_objects, $1)"; then
-	   _LT_TAGVAR(predep_objects, $1)="$p"
+	   _LT_TAGVAR(predep_objects, $1)=$p
 	 else
 	   _LT_TAGVAR(predep_objects, $1)="$_LT_TAGVAR(predep_objects, $1) $p"
 	 fi
        else
 	 if test -z "$_LT_TAGVAR(postdep_objects, $1)"; then
-	   _LT_TAGVAR(postdep_objects, $1)="$p"
+	   _LT_TAGVAR(postdep_objects, $1)=$p
 	 else
 	   _LT_TAGVAR(postdep_objects, $1)="$_LT_TAGVAR(postdep_objects, $1) $p"
 	 fi
@@ -7097,51 +7627,6 @@ interix[[3-9]]*)
   _LT_TAGVAR(postdep_objects,$1)=
   _LT_TAGVAR(postdeps,$1)=
   ;;
-
-linux*)
-  case `$CC -V 2>&1 | sed 5q` in
-  *Sun\ C*)
-    # Sun C++ 5.9
-
-    # The more standards-conforming stlport4 library is
-    # incompatible with the Cstd library. Avoid specifying
-    # it if it's in CXXFLAGS. Ignore libCrun as
-    # -library=stlport4 depends on it.
-    case " $CXX $CXXFLAGS " in
-    *" -library=stlport4 "*)
-      solaris_use_stlport4=yes
-      ;;
-    esac
-
-    if test "$solaris_use_stlport4" != yes; then
-      _LT_TAGVAR(postdeps,$1)='-library=Cstd -library=Crun'
-    fi
-    ;;
-  esac
-  ;;
-
-solaris*)
-  case $cc_basename in
-  CC* | sunCC*)
-    # The more standards-conforming stlport4 library is
-    # incompatible with the Cstd library. Avoid specifying
-    # it if it's in CXXFLAGS. Ignore libCrun as
-    # -library=stlport4 depends on it.
-    case " $CXX $CXXFLAGS " in
-    *" -library=stlport4 "*)
-      solaris_use_stlport4=yes
-      ;;
-    esac
-
-    # Adding this requires a known-good setup of shared libraries for
-    # Sun compiler versions before 5.6, else PIC objects from an old
-    # archive will be linked into the output, leading to subtle bugs.
-    if test "$solaris_use_stlport4" != yes; then
-      _LT_TAGVAR(postdeps,$1)='-library=Cstd -library=Crun'
-    fi
-    ;;
-  esac
-  ;;
 esac
 ])
 
@@ -7150,7 +7635,7 @@ case " $_LT_TAGVAR(postdeps, $1) " in
 esac
  _LT_TAGVAR(compiler_lib_search_dirs, $1)=
 if test -n "${_LT_TAGVAR(compiler_lib_search_path, $1)}"; then
- _LT_TAGVAR(compiler_lib_search_dirs, $1)=`echo " ${_LT_TAGVAR(compiler_lib_search_path, $1)}" | ${SED} -e 's! -L! !g' -e 's!^ !!'`
+ _LT_TAGVAR(compiler_lib_search_dirs, $1)=`echo " ${_LT_TAGVAR(compiler_lib_search_path, $1)}" | $SED -e 's! -L! !g' -e 's!^ !!'`
 fi
 _LT_TAGDECL([], [compiler_lib_search_dirs], [1],
     [The directories searched by this compiler when creating a shared library])
@@ -7170,10 +7655,10 @@ _LT_TAGDECL([], [compiler_lib_search_path], [1],
 # --------------------------
 # Ensure that the configuration variables for a Fortran 77 compiler are
 # suitably defined.  These variables are subsequently used by _LT_CONFIG
-# to write the compiler configuration to `libtool'.
+# to write the compiler configuration to 'libtool'.
 m4_defun([_LT_LANG_F77_CONFIG],
 [AC_LANG_PUSH(Fortran 77)
-if test -z "$F77" || test "X$F77" = "Xno"; then
+if test -z "$F77" || test no = "$F77"; then
   _lt_disable_F77=yes
 fi
 
@@ -7210,7 +7695,7 @@ _LT_TAGVAR(objext, $1)=$objext
 # the F77 compiler isn't working.  Some variables (like enable_shared)
 # are currently assumed to apply to all compilers on this platform,
 # and will be corrupted by setting them based on a non-working compiler.
-if test "$_lt_disable_F77" != yes; then
+if test yes != "$_lt_disable_F77"; then
   # Code to be used in simple compile tests
   lt_simple_compile_test_code="\
       subroutine t
@@ -7232,7 +7717,7 @@ if test "$_lt_disable_F77" != yes; then
   _LT_LINKER_BOILERPLATE
 
   # Allow CC to be a program name with arguments.
-  lt_save_CC="$CC"
+  lt_save_CC=$CC
   lt_save_GCC=$GCC
   lt_save_CFLAGS=$CFLAGS
   CC=${F77-"f77"}
@@ -7246,21 +7731,25 @@ if test "$_lt_disable_F77" != yes; then
     AC_MSG_RESULT([$can_build_shared])
 
     AC_MSG_CHECKING([whether to build shared libraries])
-    test "$can_build_shared" = "no" && enable_shared=no
+    test no = "$can_build_shared" && enable_shared=no
 
     # On AIX, shared libraries and static libraries use the same namespace, and
     # are all built from PIC.
     case $host_os in
       aix3*)
-        test "$enable_shared" = yes && enable_static=no
+        test yes = "$enable_shared" && enable_static=no
         if test -n "$RANLIB"; then
           archive_cmds="$archive_cmds~\$RANLIB \$lib"
           postinstall_cmds='$RANLIB $lib'
         fi
         ;;
       aix[[4-9]]*)
-	if test "$host_cpu" != ia64 && test "$aix_use_runtimelinking" = no ; then
-	  test "$enable_shared" = yes && enable_static=no
+	if test ia64 != "$host_cpu"; then
+	  case $enable_shared,$with_aix_soname,$aix_use_runtimelinking in
+	  yes,aix,yes) ;;		# shared object as lib.so file only
+	  yes,svr4,*) ;;		# shared object as lib.so archive member only
+	  yes,*) enable_static=no ;;	# shared object in lib.a archive as well
+	  esac
 	fi
         ;;
     esac
@@ -7268,11 +7757,11 @@ if test "$_lt_disable_F77" != yes; then
 
     AC_MSG_CHECKING([whether to build static libraries])
     # Make sure either enable_shared or enable_static is yes.
-    test "$enable_shared" = yes || enable_static=yes
+    test yes = "$enable_shared" || enable_static=yes
     AC_MSG_RESULT([$enable_static])
 
-    _LT_TAGVAR(GCC, $1)="$G77"
-    _LT_TAGVAR(LD, $1)="$LD"
+    _LT_TAGVAR(GCC, $1)=$G77
+    _LT_TAGVAR(LD, $1)=$LD
 
     ## CAVEAT EMPTOR:
     ## There is no encapsulation within the following macros, do not change
@@ -7289,9 +7778,9 @@ if test "$_lt_disable_F77" != yes; then
   fi # test -n "$compiler"
 
   GCC=$lt_save_GCC
-  CC="$lt_save_CC"
-  CFLAGS="$lt_save_CFLAGS"
-fi # test "$_lt_disable_F77" != yes
+  CC=$lt_save_CC
+  CFLAGS=$lt_save_CFLAGS
+fi # test yes != "$_lt_disable_F77"
 
 AC_LANG_POP
 ])# _LT_LANG_F77_CONFIG
@@ -7301,11 +7790,11 @@ AC_LANG_POP
 # -------------------------
 # Ensure that the configuration variables for a Fortran compiler are
 # suitably defined.  These variables are subsequently used by _LT_CONFIG
-# to write the compiler configuration to `libtool'.
+# to write the compiler configuration to 'libtool'.
 m4_defun([_LT_LANG_FC_CONFIG],
 [AC_LANG_PUSH(Fortran)
 
-if test -z "$FC" || test "X$FC" = "Xno"; then
+if test -z "$FC" || test no = "$FC"; then
   _lt_disable_FC=yes
 fi
 
@@ -7342,7 +7831,7 @@ _LT_TAGVAR(objext, $1)=$objext
 # the FC compiler isn't working.  Some variables (like enable_shared)
 # are currently assumed to apply to all compilers on this platform,
 # and will be corrupted by setting them based on a non-working compiler.
-if test "$_lt_disable_FC" != yes; then
+if test yes != "$_lt_disable_FC"; then
   # Code to be used in simple compile tests
   lt_simple_compile_test_code="\
       subroutine t
@@ -7364,7 +7853,7 @@ if test "$_lt_disable_FC" != yes; then
   _LT_LINKER_BOILERPLATE
 
   # Allow CC to be a program name with arguments.
-  lt_save_CC="$CC"
+  lt_save_CC=$CC
   lt_save_GCC=$GCC
   lt_save_CFLAGS=$CFLAGS
   CC=${FC-"f95"}
@@ -7380,21 +7869,25 @@ if test "$_lt_disable_FC" != yes; then
     AC_MSG_RESULT([$can_build_shared])
 
     AC_MSG_CHECKING([whether to build shared libraries])
-    test "$can_build_shared" = "no" && enable_shared=no
+    test no = "$can_build_shared" && enable_shared=no
 
     # On AIX, shared libraries and static libraries use the same namespace, and
     # are all built from PIC.
     case $host_os in
       aix3*)
-        test "$enable_shared" = yes && enable_static=no
+        test yes = "$enable_shared" && enable_static=no
         if test -n "$RANLIB"; then
           archive_cmds="$archive_cmds~\$RANLIB \$lib"
           postinstall_cmds='$RANLIB $lib'
         fi
         ;;
       aix[[4-9]]*)
-	if test "$host_cpu" != ia64 && test "$aix_use_runtimelinking" = no ; then
-	  test "$enable_shared" = yes && enable_static=no
+	if test ia64 != "$host_cpu"; then
+	  case $enable_shared,$with_aix_soname,$aix_use_runtimelinking in
+	  yes,aix,yes) ;;		# shared object as lib.so file only
+	  yes,svr4,*) ;;		# shared object as lib.so archive member only
+	  yes,*) enable_static=no ;;	# shared object in lib.a archive as well
+	  esac
 	fi
         ;;
     esac
@@ -7402,11 +7895,11 @@ if test "$_lt_disable_FC" != yes; then
 
     AC_MSG_CHECKING([whether to build static libraries])
     # Make sure either enable_shared or enable_static is yes.
-    test "$enable_shared" = yes || enable_static=yes
+    test yes = "$enable_shared" || enable_static=yes
     AC_MSG_RESULT([$enable_static])
 
-    _LT_TAGVAR(GCC, $1)="$ac_cv_fc_compiler_gnu"
-    _LT_TAGVAR(LD, $1)="$LD"
+    _LT_TAGVAR(GCC, $1)=$ac_cv_fc_compiler_gnu
+    _LT_TAGVAR(LD, $1)=$LD
 
     ## CAVEAT EMPTOR:
     ## There is no encapsulation within the following macros, do not change
@@ -7426,7 +7919,7 @@ if test "$_lt_disable_FC" != yes; then
   GCC=$lt_save_GCC
   CC=$lt_save_CC
   CFLAGS=$lt_save_CFLAGS
-fi # test "$_lt_disable_FC" != yes
+fi # test yes != "$_lt_disable_FC"
 
 AC_LANG_POP
 ])# _LT_LANG_FC_CONFIG
@@ -7436,7 +7929,7 @@ AC_LANG_POP
 # --------------------------
 # Ensure that the configuration variables for the GNU Java Compiler compiler
 # are suitably defined.  These variables are subsequently used by _LT_CONFIG
-# to write the compiler configuration to `libtool'.
+# to write the compiler configuration to 'libtool'.
 m4_defun([_LT_LANG_GCJ_CONFIG],
 [AC_REQUIRE([LT_PROG_GCJ])dnl
 AC_LANG_SAVE
@@ -7470,7 +7963,7 @@ CC=${GCJ-"gcj"}
 CFLAGS=$GCJFLAGS
 compiler=$CC
 _LT_TAGVAR(compiler, $1)=$CC
-_LT_TAGVAR(LD, $1)="$LD"
+_LT_TAGVAR(LD, $1)=$LD
 _LT_CC_BASENAME([$compiler])
 
 # GCJ did not exist at the time GCC didn't implicitly link libc in.
@@ -7507,7 +8000,7 @@ CFLAGS=$lt_save_CFLAGS
 # --------------------------
 # Ensure that the configuration variables for the GNU Go compiler
 # are suitably defined.  These variables are subsequently used by _LT_CONFIG
-# to write the compiler configuration to `libtool'.
+# to write the compiler configuration to 'libtool'.
 m4_defun([_LT_LANG_GO_CONFIG],
 [AC_REQUIRE([LT_PROG_GO])dnl
 AC_LANG_SAVE
@@ -7541,7 +8034,7 @@ CC=${GOC-"gccgo"}
 CFLAGS=$GOFLAGS
 compiler=$CC
 _LT_TAGVAR(compiler, $1)=$CC
-_LT_TAGVAR(LD, $1)="$LD"
+_LT_TAGVAR(LD, $1)=$LD
 _LT_CC_BASENAME([$compiler])
 
 # Go did not exist at the time GCC didn't implicitly link libc in.
@@ -7578,7 +8071,7 @@ CFLAGS=$lt_save_CFLAGS
 # -------------------------
 # Ensure that the configuration variables for the Windows resource compiler
 # are suitably defined.  These variables are subsequently used by _LT_CONFIG
-# to write the compiler configuration to `libtool'.
+# to write the compiler configuration to 'libtool'.
 m4_defun([_LT_LANG_RC_CONFIG],
 [AC_REQUIRE([LT_PROG_RC])dnl
 AC_LANG_SAVE
@@ -7594,7 +8087,7 @@ _LT_TAGVAR(objext, $1)=$objext
 lt_simple_compile_test_code='sample MENU { MENUITEM "&Soup", 100, CHECKED }'
 
 # Code to be used in simple link tests
-lt_simple_link_test_code="$lt_simple_compile_test_code"
+lt_simple_link_test_code=$lt_simple_compile_test_code
 
 # ltmain only uses $CC for tagged configurations so make sure $CC is set.
 _LT_TAG_COMPILER
@@ -7604,7 +8097,7 @@ _LT_COMPILER_BOILERPLATE
 _LT_LINKER_BOILERPLATE
 
 # Allow CC to be a program name with arguments.
-lt_save_CC="$CC"
+lt_save_CC=$CC
 lt_save_CFLAGS=$CFLAGS
 lt_save_GCC=$GCC
 GCC=
@@ -7633,7 +8126,7 @@ AC_DEFUN([LT_PROG_GCJ],
 [m4_ifdef([AC_PROG_GCJ], [AC_PROG_GCJ],
   [m4_ifdef([A][M_PROG_GCJ], [A][M_PROG_GCJ],
     [AC_CHECK_TOOL(GCJ, gcj,)
-      test "x${GCJFLAGS+set}" = xset || GCJFLAGS="-g -O2"
+      test set = "${GCJFLAGS+set}" || GCJFLAGS="-g -O2"
       AC_SUBST(GCJFLAGS)])])[]dnl
 ])
 
@@ -7744,7 +8237,7 @@ lt_ac_count=0
 # Add /usr/xpg4/bin/sed as it is typically found on Solaris
 # along with /bin/sed that truncates output.
 for lt_ac_sed in $lt_ac_sed_list /usr/xpg4/bin/sed; do
-  test ! -f $lt_ac_sed && continue
+  test ! -f "$lt_ac_sed" && continue
   cat /dev/null > conftest.in
   lt_ac_count=0
   echo $ECHO_N "0123456789$ECHO_C" >conftest.in
@@ -7761,9 +8254,9 @@ for lt_ac_sed in $lt_ac_sed_list /usr/xpg4/bin/sed; do
     $lt_ac_sed -e 's/a$//' < conftest.nl >conftest.out || break
     cmp -s conftest.out conftest.nl || break
     # 10000 chars as input seems more than enough
-    test $lt_ac_count -gt 10 && break
+    test 10 -lt "$lt_ac_count" && break
     lt_ac_count=`expr $lt_ac_count + 1`
-    if test $lt_ac_count -gt $lt_ac_max; then
+    if test "$lt_ac_count" -gt "$lt_ac_max"; then
       lt_ac_max=$lt_ac_count
       lt_cv_path_SED=$lt_ac_sed
     fi
@@ -7787,27 +8280,7 @@ dnl AC_DEFUN([LT_AC_PROG_SED], [])
 # Find out whether the shell is Bourne or XSI compatible,
 # or has some other useful features.
 m4_defun([_LT_CHECK_SHELL_FEATURES],
-[AC_MSG_CHECKING([whether the shell understands some XSI constructs])
-# Try some XSI features
-xsi_shell=no
-( _lt_dummy="a/b/c"
-  test "${_lt_dummy##*/},${_lt_dummy%/*},${_lt_dummy#??}"${_lt_dummy%"$_lt_dummy"}, \
-      = c,a/b,b/c, \
-    && eval 'test $(( 1 + 1 )) -eq 2 \
-    && test "${#_lt_dummy}" -eq 5' ) >/dev/null 2>&1 \
-  && xsi_shell=yes
-AC_MSG_RESULT([$xsi_shell])
-_LT_CONFIG_LIBTOOL_INIT([xsi_shell='$xsi_shell'])
-
-AC_MSG_CHECKING([whether the shell understands "+="])
-lt_shell_append=no
-( foo=bar; set foo baz; eval "$[1]+=\$[2]" && test "$foo" = barbaz ) \
-    >/dev/null 2>&1 \
-  && lt_shell_append=yes
-AC_MSG_RESULT([$lt_shell_append])
-_LT_CONFIG_LIBTOOL_INIT([lt_shell_append='$lt_shell_append'])
-
-if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
+[if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
   lt_unset=unset
 else
   lt_unset=false
@@ -7831,102 +8304,9 @@ _LT_DECL([NL2SP], [lt_NL2SP], [1], [turn newlines into spaces])dnl
 ])# _LT_CHECK_SHELL_FEATURES
 
 
-# _LT_PROG_FUNCTION_REPLACE (FUNCNAME, REPLACEMENT-BODY)
-# ------------------------------------------------------
-# In `$cfgfile', look for function FUNCNAME delimited by `^FUNCNAME ()$' and
-# '^} FUNCNAME ', and replace its body with REPLACEMENT-BODY.
-m4_defun([_LT_PROG_FUNCTION_REPLACE],
-[dnl {
-sed -e '/^$1 ()$/,/^} # $1 /c\
-$1 ()\
-{\
-m4_bpatsubsts([$2], [$], [\\], [^\([	 ]\)], [\\\1])
-} # Extended-shell $1 implementation' "$cfgfile" > $cfgfile.tmp \
-  && mv -f "$cfgfile.tmp" "$cfgfile" \
-    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
-test 0 -eq $? || _lt_function_replace_fail=:
-])
-
-
-# _LT_PROG_REPLACE_SHELLFNS
-# -------------------------
-# Replace existing portable implementations of several shell functions with
-# equivalent extended shell implementations where those features are available..
-m4_defun([_LT_PROG_REPLACE_SHELLFNS],
-[if test x"$xsi_shell" = xyes; then
-  _LT_PROG_FUNCTION_REPLACE([func_dirname], [dnl
-    case ${1} in
-      */*) func_dirname_result="${1%/*}${2}" ;;
-      *  ) func_dirname_result="${3}" ;;
-    esac])
-
-  _LT_PROG_FUNCTION_REPLACE([func_basename], [dnl
-    func_basename_result="${1##*/}"])
-
-  _LT_PROG_FUNCTION_REPLACE([func_dirname_and_basename], [dnl
-    case ${1} in
-      */*) func_dirname_result="${1%/*}${2}" ;;
-      *  ) func_dirname_result="${3}" ;;
-    esac
-    func_basename_result="${1##*/}"])
-
-  _LT_PROG_FUNCTION_REPLACE([func_stripname], [dnl
-    # pdksh 5.2.14 does not do ${X%$Y} correctly if both X and Y are
-    # positional parameters, so assign one to ordinary parameter first.
-    func_stripname_result=${3}
-    func_stripname_result=${func_stripname_result#"${1}"}
-    func_stripname_result=${func_stripname_result%"${2}"}])
-
-  _LT_PROG_FUNCTION_REPLACE([func_split_long_opt], [dnl
-    func_split_long_opt_name=${1%%=*}
-    func_split_long_opt_arg=${1#*=}])
-
-  _LT_PROG_FUNCTION_REPLACE([func_split_short_opt], [dnl
-    func_split_short_opt_arg=${1#??}
-    func_split_short_opt_name=${1%"$func_split_short_opt_arg"}])
-
-  _LT_PROG_FUNCTION_REPLACE([func_lo2o], [dnl
-    case ${1} in
-      *.lo) func_lo2o_result=${1%.lo}.${objext} ;;
-      *)    func_lo2o_result=${1} ;;
-    esac])
-
-  _LT_PROG_FUNCTION_REPLACE([func_xform], [    func_xform_result=${1%.*}.lo])
-
-  _LT_PROG_FUNCTION_REPLACE([func_arith], [    func_arith_result=$(( $[*] ))])
-
-  _LT_PROG_FUNCTION_REPLACE([func_len], [    func_len_result=${#1}])
-fi
-
-if test x"$lt_shell_append" = xyes; then
-  _LT_PROG_FUNCTION_REPLACE([func_append], [    eval "${1}+=\\${2}"])
-
-  _LT_PROG_FUNCTION_REPLACE([func_append_quoted], [dnl
-    func_quote_for_eval "${2}"
-dnl m4 expansion turns \\\\ into \\, and then the shell eval turns that into \
-    eval "${1}+=\\\\ \\$func_quote_for_eval_result"])
-
-  # Save a `func_append' function call where possible by direct use of '+='
-  sed -e 's%func_append \([[a-zA-Z_]]\{1,\}\) "%\1+="%g' $cfgfile > $cfgfile.tmp \
-    && mv -f "$cfgfile.tmp" "$cfgfile" \
-      || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
-  test 0 -eq $? || _lt_function_replace_fail=:
-else
-  # Save a `func_append' function call even when '+=' is not available
-  sed -e 's%func_append \([[a-zA-Z_]]\{1,\}\) "%\1="$\1%g' $cfgfile > $cfgfile.tmp \
-    && mv -f "$cfgfile.tmp" "$cfgfile" \
-      || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
-  test 0 -eq $? || _lt_function_replace_fail=:
-fi
-
-if test x"$_lt_function_replace_fail" = x":"; then
-  AC_MSG_WARN([Unable to substitute extended shell functions in $ofile])
-fi
-])
-
 # _LT_PATH_CONVERSION_FUNCTIONS
 # -----------------------------
-# Determine which file name conversion functions should be used by
+# Determine what file name conversion functions should be used by
 # func_to_host_file (and, implicitly, by func_to_host_path).  These are needed
 # for certain cross-compile configurations and native mingw.
 m4_defun([_LT_PATH_CONVERSION_FUNCTIONS],
diff --git a/m4/ltoptions.m4 b/m4/ltoptions.m4
index 5d9acd8e23bcfd20d353804aff13666ecbed54f4..94b082976667c0c9edf2e9ff7a668c1fd7d997bf 100644
--- a/m4/ltoptions.m4
+++ b/m4/ltoptions.m4
@@ -1,14 +1,14 @@
 # Helper functions for option handling.                    -*- Autoconf -*-
 #
-#   Copyright (C) 2004, 2005, 2007, 2008, 2009 Free Software Foundation,
-#   Inc.
+#   Copyright (C) 2004-2005, 2007-2009, 2011-2015 Free Software
+#   Foundation, Inc.
 #   Written by Gary V. Vaughan, 2004
 #
 # This file is free software; the Free Software Foundation gives
 # unlimited permission to copy and/or distribute it, with or without
 # modifications, as long as this notice is preserved.
 
-# serial 7 ltoptions.m4
+# serial 8 ltoptions.m4
 
 # This is to help aclocal find these macros, as it can't see m4_define.
 AC_DEFUN([LTOPTIONS_VERSION], [m4_if([1])])
@@ -29,7 +29,7 @@ m4_define([_LT_SET_OPTION],
 [m4_define(_LT_MANGLE_OPTION([$1], [$2]))dnl
 m4_ifdef(_LT_MANGLE_DEFUN([$1], [$2]),
         _LT_MANGLE_DEFUN([$1], [$2]),
-    [m4_warning([Unknown $1 option `$2'])])[]dnl
+    [m4_warning([Unknown $1 option '$2'])])[]dnl
 ])
 
 
@@ -75,13 +75,15 @@ m4_if([$1],[LT_INIT],[
   dnl
   dnl If no reference was made to various pairs of opposing options, then
   dnl we run the default mode handler for the pair.  For example, if neither
-  dnl `shared' nor `disable-shared' was passed, we enable building of shared
+  dnl 'shared' nor 'disable-shared' was passed, we enable building of shared
   dnl archives by default:
   _LT_UNLESS_OPTIONS([LT_INIT], [shared disable-shared], [_LT_ENABLE_SHARED])
   _LT_UNLESS_OPTIONS([LT_INIT], [static disable-static], [_LT_ENABLE_STATIC])
   _LT_UNLESS_OPTIONS([LT_INIT], [pic-only no-pic], [_LT_WITH_PIC])
   _LT_UNLESS_OPTIONS([LT_INIT], [fast-install disable-fast-install],
-  		   [_LT_ENABLE_FAST_INSTALL])
+		   [_LT_ENABLE_FAST_INSTALL])
+  _LT_UNLESS_OPTIONS([LT_INIT], [aix-soname=aix aix-soname=both aix-soname=svr4],
+		   [_LT_WITH_AIX_SONAME([aix])])
   ])
 ])# _LT_SET_OPTIONS
 
@@ -112,7 +114,7 @@ AU_DEFUN([AC_LIBTOOL_DLOPEN],
 [_LT_SET_OPTION([LT_INIT], [dlopen])
 AC_DIAGNOSE([obsolete],
 [$0: Remove this warning and the call to _LT_SET_OPTION when you
-put the `dlopen' option into LT_INIT's first parameter.])
+put the 'dlopen' option into LT_INIT's first parameter.])
 ])
 
 dnl aclocal-1.4 backwards compatibility:
@@ -148,7 +150,7 @@ AU_DEFUN([AC_LIBTOOL_WIN32_DLL],
 _LT_SET_OPTION([LT_INIT], [win32-dll])
 AC_DIAGNOSE([obsolete],
 [$0: Remove this warning and the call to _LT_SET_OPTION when you
-put the `win32-dll' option into LT_INIT's first parameter.])
+put the 'win32-dll' option into LT_INIT's first parameter.])
 ])
 
 dnl aclocal-1.4 backwards compatibility:
@@ -157,9 +159,9 @@ dnl AC_DEFUN([AC_LIBTOOL_WIN32_DLL], [])
 
 # _LT_ENABLE_SHARED([DEFAULT])
 # ----------------------------
-# implement the --enable-shared flag, and supports the `shared' and
-# `disable-shared' LT_INIT options.
-# DEFAULT is either `yes' or `no'.  If omitted, it defaults to `yes'.
+# implement the --enable-shared flag, and supports the 'shared' and
+# 'disable-shared' LT_INIT options.
+# DEFAULT is either 'yes' or 'no'.  If omitted, it defaults to 'yes'.
 m4_define([_LT_ENABLE_SHARED],
 [m4_define([_LT_ENABLE_SHARED_DEFAULT], [m4_if($1, no, no, yes)])dnl
 AC_ARG_ENABLE([shared],
@@ -172,14 +174,14 @@ AC_ARG_ENABLE([shared],
     *)
       enable_shared=no
       # Look at the argument we got.  We use all the common list separators.
-      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
+      lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR,
       for pkg in $enableval; do
-	IFS="$lt_save_ifs"
+	IFS=$lt_save_ifs
 	if test "X$pkg" = "X$p"; then
 	  enable_shared=yes
 	fi
       done
-      IFS="$lt_save_ifs"
+      IFS=$lt_save_ifs
       ;;
     esac],
     [enable_shared=]_LT_ENABLE_SHARED_DEFAULT)
@@ -211,9 +213,9 @@ dnl AC_DEFUN([AM_DISABLE_SHARED], [])
 
 # _LT_ENABLE_STATIC([DEFAULT])
 # ----------------------------
-# implement the --enable-static flag, and support the `static' and
-# `disable-static' LT_INIT options.
-# DEFAULT is either `yes' or `no'.  If omitted, it defaults to `yes'.
+# implement the --enable-static flag, and support the 'static' and
+# 'disable-static' LT_INIT options.
+# DEFAULT is either 'yes' or 'no'.  If omitted, it defaults to 'yes'.
 m4_define([_LT_ENABLE_STATIC],
 [m4_define([_LT_ENABLE_STATIC_DEFAULT], [m4_if($1, no, no, yes)])dnl
 AC_ARG_ENABLE([static],
@@ -226,14 +228,14 @@ AC_ARG_ENABLE([static],
     *)
      enable_static=no
       # Look at the argument we got.  We use all the common list separators.
-      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
+      lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR,
       for pkg in $enableval; do
-	IFS="$lt_save_ifs"
+	IFS=$lt_save_ifs
 	if test "X$pkg" = "X$p"; then
 	  enable_static=yes
 	fi
       done
-      IFS="$lt_save_ifs"
+      IFS=$lt_save_ifs
       ;;
     esac],
     [enable_static=]_LT_ENABLE_STATIC_DEFAULT)
@@ -265,9 +267,9 @@ dnl AC_DEFUN([AM_DISABLE_STATIC], [])
 
 # _LT_ENABLE_FAST_INSTALL([DEFAULT])
 # ----------------------------------
-# implement the --enable-fast-install flag, and support the `fast-install'
-# and `disable-fast-install' LT_INIT options.
-# DEFAULT is either `yes' or `no'.  If omitted, it defaults to `yes'.
+# implement the --enable-fast-install flag, and support the 'fast-install'
+# and 'disable-fast-install' LT_INIT options.
+# DEFAULT is either 'yes' or 'no'.  If omitted, it defaults to 'yes'.
 m4_define([_LT_ENABLE_FAST_INSTALL],
 [m4_define([_LT_ENABLE_FAST_INSTALL_DEFAULT], [m4_if($1, no, no, yes)])dnl
 AC_ARG_ENABLE([fast-install],
@@ -280,14 +282,14 @@ AC_ARG_ENABLE([fast-install],
     *)
       enable_fast_install=no
       # Look at the argument we got.  We use all the common list separators.
-      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
+      lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR,
       for pkg in $enableval; do
-	IFS="$lt_save_ifs"
+	IFS=$lt_save_ifs
 	if test "X$pkg" = "X$p"; then
 	  enable_fast_install=yes
 	fi
       done
-      IFS="$lt_save_ifs"
+      IFS=$lt_save_ifs
       ;;
     esac],
     [enable_fast_install=]_LT_ENABLE_FAST_INSTALL_DEFAULT)
@@ -304,14 +306,14 @@ AU_DEFUN([AC_ENABLE_FAST_INSTALL],
 [_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[fast-install])
 AC_DIAGNOSE([obsolete],
 [$0: Remove this warning and the call to _LT_SET_OPTION when you put
-the `fast-install' option into LT_INIT's first parameter.])
+the 'fast-install' option into LT_INIT's first parameter.])
 ])
 
 AU_DEFUN([AC_DISABLE_FAST_INSTALL],
 [_LT_SET_OPTION([LT_INIT], [disable-fast-install])
 AC_DIAGNOSE([obsolete],
 [$0: Remove this warning and the call to _LT_SET_OPTION when you put
-the `disable-fast-install' option into LT_INIT's first parameter.])
+the 'disable-fast-install' option into LT_INIT's first parameter.])
 ])
 
 dnl aclocal-1.4 backwards compatibility:
@@ -319,11 +321,64 @@ dnl AC_DEFUN([AC_ENABLE_FAST_INSTALL], [])
 dnl AC_DEFUN([AM_DISABLE_FAST_INSTALL], [])
 
 
+# _LT_WITH_AIX_SONAME([DEFAULT])
+# ----------------------------------
+# implement the --with-aix-soname flag, and support the 'aix-soname=aix'
+# and 'aix-soname=both' and 'aix-soname=svr4' LT_INIT options. DEFAULT
+# is either 'aix', 'both' or 'svr4'.  If omitted, it defaults to 'aix'.
+m4_define([_LT_WITH_AIX_SONAME],
+[m4_define([_LT_WITH_AIX_SONAME_DEFAULT], [m4_if($1, svr4, svr4, m4_if($1, both, both, aix))])dnl
+shared_archive_member_spec=
+case $host,$enable_shared in
+power*-*-aix[[5-9]]*,yes)
+  AC_MSG_CHECKING([which variant of shared library versioning to provide])
+  AC_ARG_WITH([aix-soname],
+    [AS_HELP_STRING([--with-aix-soname=aix|svr4|both],
+      [shared library versioning (aka "SONAME") variant to provide on AIX, @<:@default=]_LT_WITH_AIX_SONAME_DEFAULT[@:>@.])],
+    [case $withval in
+    aix|svr4|both)
+      ;;
+    *)
+      AC_MSG_ERROR([Unknown argument to --with-aix-soname])
+      ;;
+    esac
+    lt_cv_with_aix_soname=$with_aix_soname],
+    [AC_CACHE_VAL([lt_cv_with_aix_soname],
+      [lt_cv_with_aix_soname=]_LT_WITH_AIX_SONAME_DEFAULT)
+    with_aix_soname=$lt_cv_with_aix_soname])
+  AC_MSG_RESULT([$with_aix_soname])
+  if test aix != "$with_aix_soname"; then
+    # For the AIX way of multilib, we name the shared archive member
+    # based on the bitwidth used, traditionally 'shr.o' or 'shr_64.o',
+    # and 'shr.imp' or 'shr_64.imp', respectively, for the Import File.
+    # Even when GNU compilers ignore OBJECT_MODE but need '-maix64' flag,
+    # the AIX toolchain works better with OBJECT_MODE set (default 32).
+    if test 64 = "${OBJECT_MODE-32}"; then
+      shared_archive_member_spec=shr_64
+    else
+      shared_archive_member_spec=shr
+    fi
+  fi
+  ;;
+*)
+  with_aix_soname=aix
+  ;;
+esac
+
+_LT_DECL([], [shared_archive_member_spec], [0],
+    [Shared archive member basename, for filename based shared library versioning on AIX])dnl
+])# _LT_WITH_AIX_SONAME
+
+LT_OPTION_DEFINE([LT_INIT], [aix-soname=aix], [_LT_WITH_AIX_SONAME([aix])])
+LT_OPTION_DEFINE([LT_INIT], [aix-soname=both], [_LT_WITH_AIX_SONAME([both])])
+LT_OPTION_DEFINE([LT_INIT], [aix-soname=svr4], [_LT_WITH_AIX_SONAME([svr4])])
+
+
 # _LT_WITH_PIC([MODE])
 # --------------------
-# implement the --with-pic flag, and support the `pic-only' and `no-pic'
+# implement the --with-pic flag, and support the 'pic-only' and 'no-pic'
 # LT_INIT options.
-# MODE is either `yes' or `no'.  If omitted, it defaults to `both'.
+# MODE is either 'yes' or 'no'.  If omitted, it defaults to 'both'.
 m4_define([_LT_WITH_PIC],
 [AC_ARG_WITH([pic],
     [AS_HELP_STRING([--with-pic@<:@=PKGS@:>@],
@@ -334,19 +389,17 @@ m4_define([_LT_WITH_PIC],
     *)
       pic_mode=default
       # Look at the argument we got.  We use all the common list separators.
-      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
+      lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR,
       for lt_pkg in $withval; do
-	IFS="$lt_save_ifs"
+	IFS=$lt_save_ifs
 	if test "X$lt_pkg" = "X$lt_p"; then
 	  pic_mode=yes
 	fi
       done
-      IFS="$lt_save_ifs"
+      IFS=$lt_save_ifs
       ;;
     esac],
-    [pic_mode=default])
-
-test -z "$pic_mode" && pic_mode=m4_default([$1], [default])
+    [pic_mode=m4_default([$1], [default])])
 
 _LT_DECL([], [pic_mode], [0], [What type of objects to build])dnl
 ])# _LT_WITH_PIC
@@ -359,7 +412,7 @@ AU_DEFUN([AC_LIBTOOL_PICMODE],
 [_LT_SET_OPTION([LT_INIT], [pic-only])
 AC_DIAGNOSE([obsolete],
 [$0: Remove this warning and the call to _LT_SET_OPTION when you
-put the `pic-only' option into LT_INIT's first parameter.])
+put the 'pic-only' option into LT_INIT's first parameter.])
 ])
 
 dnl aclocal-1.4 backwards compatibility:
diff --git a/m4/ltsugar.m4 b/m4/ltsugar.m4
index 9000a057d31ddf75cb85ccda8757de4493bcdbe7..48bc9344a4d661e116be1483d5520753875b2bd1 100644
--- a/m4/ltsugar.m4
+++ b/m4/ltsugar.m4
@@ -1,6 +1,7 @@
 # ltsugar.m4 -- libtool m4 base layer.                         -*-Autoconf-*-
 #
-# Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
+# Copyright (C) 2004-2005, 2007-2008, 2011-2015 Free Software
+# Foundation, Inc.
 # Written by Gary V. Vaughan, 2004
 #
 # This file is free software; the Free Software Foundation gives
@@ -33,7 +34,7 @@ m4_define([_lt_join],
 # ------------
 # Manipulate m4 lists.
 # These macros are necessary as long as will still need to support
-# Autoconf-2.59 which quotes differently.
+# Autoconf-2.59, which quotes differently.
 m4_define([lt_car], [[$1]])
 m4_define([lt_cdr],
 [m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])],
@@ -44,7 +45,7 @@ m4_define([lt_unquote], $1)
 
 # lt_append(MACRO-NAME, STRING, [SEPARATOR])
 # ------------------------------------------
-# Redefine MACRO-NAME to hold its former content plus `SEPARATOR'`STRING'.
+# Redefine MACRO-NAME to hold its former content plus 'SEPARATOR''STRING'.
 # Note that neither SEPARATOR nor STRING are expanded; they are appended
 # to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked).
 # No SEPARATOR is output if MACRO-NAME was previously undefined (different
diff --git a/m4/ltversion.m4 b/m4/ltversion.m4
index 07a8602d48d615a65800b14446d8c8c8694f2818..fa04b52a3bf868bd57e7ba796d64385a51e7ec5a 100644
--- a/m4/ltversion.m4
+++ b/m4/ltversion.m4
@@ -1,6 +1,6 @@
 # ltversion.m4 -- version numbers			-*- Autoconf -*-
 #
-#   Copyright (C) 2004 Free Software Foundation, Inc.
+#   Copyright (C) 2004, 2011-2015 Free Software Foundation, Inc.
 #   Written by Scott James Remnant, 2004
 #
 # This file is free software; the Free Software Foundation gives
@@ -9,15 +9,15 @@
 
 # @configure_input@
 
-# serial 3337 ltversion.m4
+# serial 4179 ltversion.m4
 # This file is part of GNU Libtool
 
-m4_define([LT_PACKAGE_VERSION], [2.4.2])
-m4_define([LT_PACKAGE_REVISION], [1.3337])
+m4_define([LT_PACKAGE_VERSION], [2.4.6])
+m4_define([LT_PACKAGE_REVISION], [2.4.6])
 
 AC_DEFUN([LTVERSION_VERSION],
-[macro_version='2.4.2'
-macro_revision='1.3337'
+[macro_version='2.4.6'
+macro_revision='2.4.6'
 _LT_DECL(, macro_version, 0, [Which release of libtool.m4 was used?])
 _LT_DECL(, macro_revision, 0)
 ])
diff --git a/m4/lt~obsolete.m4 b/m4/lt~obsolete.m4
index c573da90c5ccebffba4dce9a6462036bfa986d5f..c6b26f88f6c3c1a052afa6314ba2adf832d785ee 100644
--- a/m4/lt~obsolete.m4
+++ b/m4/lt~obsolete.m4
@@ -1,6 +1,7 @@
 # lt~obsolete.m4 -- aclocal satisfying obsolete definitions.    -*-Autoconf-*-
 #
-#   Copyright (C) 2004, 2005, 2007, 2009 Free Software Foundation, Inc.
+#   Copyright (C) 2004-2005, 2007, 2009, 2011-2015 Free Software
+#   Foundation, Inc.
 #   Written by Scott James Remnant, 2004.
 #
 # This file is free software; the Free Software Foundation gives
@@ -11,7 +12,7 @@
 
 # These exist entirely to fool aclocal when bootstrapping libtool.
 #
-# In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN)
+# In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN),
 # which have later been changed to m4_define as they aren't part of the
 # exported API, or moved to Autoconf or Automake where they belong.
 #
@@ -25,7 +26,7 @@
 # included after everything else.  This provides aclocal with the
 # AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything
 # because those macros already exist, or will be overwritten later.
-# We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6. 
+# We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6.
 #
 # Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here.
 # Yes, that means every name once taken will need to remain here until
diff --git a/sPuReMD/Makefile.am b/sPuReMD/Makefile.am
index 7c986471c7759f44ea3cd9aea126c246929f1647..a31309c6433e65d994c92b0543dad51e0172e501 100644
--- a/sPuReMD/Makefile.am
+++ b/sPuReMD/Makefile.am
@@ -1,10 +1,11 @@
 ACLOCAL_AMFLAGS = -I ../m4
 
 bin_PROGRAMS = bin/spuremd
+
 bin_spuremd_SOURCES = src/ffield.c src/grid.c src/list.c src/lookup.c src/print_utils.c \
 		  src/reset_utils.c src/restart.c src/random.c src/tool_box.c src/traj.c \
 		  src/vector.c src/allocate.c src/analyze.c src/box.c src/system_props.c src/control.c \
-		  src/geo_tools.c src/neighbors.c src/lin_alg.c src/QEq.c src/bond_orders.c \
+		  src/geo_tools.c src/neighbors.c src/lin_alg.c src/charges.c src/bond_orders.c \
 		  src/single_body_interactions.c src/two_body_interactions.c \
 		  src/three_body_interactions.c src/four_body_interactions.c src/forces.c \
 		  src/integrate.c src/init_md.c src/testmd.c 
@@ -12,7 +13,21 @@ bin_spuremd_SOURCES = src/ffield.c src/grid.c src/list.c src/lookup.c src/print_
 include_HEADERS = src/mytypes.h src/ffield.h src/grid.h src/list.h src/lookup.h src/print_utils.h \
 		  src/reset_utils.h src/restart.h src/random.h src/tool_box.h src/traj.h \
 		  src/vector.h src/allocate.h src/analyze.h src/box.h src/system_props.h src/control.h \
-		  src/geo_tools.h src/neighbors.h src/lin_alg.h src/QEq.h src/bond_orders.h \
+		  src/geo_tools.h src/neighbors.h src/lin_alg.h src/charges.h src/bond_orders.h \
 		  src/single_body_interactions.h src/two_body_interactions.h \
 		  src/three_body_interactions.h src/four_body_interactions.h src/forces.h \
 		  src/integrate.h src/init_md.h
+
+check_PROGRAMS =
+TESTS =
+
+if BUILD_TEST
+check_PROGRAMS += tests/test_vector
+TESTS += $(check_PROGRAMS)
+# gtest-based check program; libraries are listed in LDADD so they follow the objects at link time
+tests_test_vector_SOURCES = tests/test_vector.cpp
+tests_test_vector_CPPFLAGS = -I$(srcdir)/src $(GTEST_CPPFLAGS)
+tests_test_vector_CXXFLAGS = $(GTEST_CXXFLAGS)
+tests_test_vector_LDFLAGS = $(GTEST_LDFLAGS)
+tests_test_vector_LDADD = -lgtest $(GTEST_LIBS)
+endif
diff --git a/sPuReMD/aclocal.m4 b/sPuReMD/aclocal.m4
index f23d4faf7e046b186869ab73a7a8e044765e1d83..48e9bea0b26ee47b32d9eb55e5270e1f45964620 100644
--- a/sPuReMD/aclocal.m4
+++ b/sPuReMD/aclocal.m4
@@ -1150,4 +1150,5 @@ AC_SUBST([am__tar])
 AC_SUBST([am__untar])
 ]) # _AM_PROG_TAR
 
+m4_include([../m4/acx_pthread.m4])
 m4_include([../m4/ax_compiler_vendor.m4])
diff --git a/sPuReMD/configure.ac b/sPuReMD/configure.ac
index 578c00d4de6ee81ee70318e79c0dc266129d2b19..312fa804e8cf375a499f48218f0a1532474e4ec2 100644
--- a/sPuReMD/configure.ac
+++ b/sPuReMD/configure.ac
@@ -60,7 +60,7 @@ if test "x$ax_cv_c_compiler_vendor" = "xintel"; then
 fi
 
 # Check for OpenMP support.
-if test "x$BUILD_OPENMP" = "xyes"; then
+if test "x${BUILD_OPENMP}" = "xyes"; then
 	AC_OPENMP
 	if test "x${OPENMP_CFLAGS}" = "x"; then
 		AC_MSG_WARN([
@@ -70,7 +70,7 @@ if test "x$BUILD_OPENMP" = "xyes"; then
 	  -----------------------------------------------])
 	else
 		# bug due to recent Intel compiler change (?)
-		if test "x$ax_cv_c_compiler_vendor" = "xintel"; then
+		if test "x${ax_cv_c_compiler_vendor}" = "xintel"; then
 			OPENMP_CFLAGS="-qopenmp"
 		fi
 		AC_SUBST(AM_CFLAGS, "$OPENMP_CFLAGS")
@@ -129,6 +129,46 @@ then
 	CFLAGS="${CFLAGS} ${GPROF_FLAGS}"
 fi
 
+# Tests using Google C++ testing framework (gtest)
+AC_LANG_PUSH([C++])
+AC_PROG_CXX([icpc g++ clang++ CC])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+  [[#ifndef __cplusplus
+    #error "broken C++"
+    #endif]])],,
+  [CXX=;])
+if test "x${CXX}" != "x"
+then
+	AC_CHECK_HEADER([gtest/gtest.h], [HAVE_GTEST_H="yes"])
+	if test "x${HAVE_GTEST_H}" != "xyes"
+	then
+		AC_MSG_WARN([gtest.h not found. Unable to build tests. Continuing...])
+	else
+		ACX_PTHREAD([have_pthread=yes])
+
+		# define gtest variables
+		if test "x${have_pthread}" = "xyes"; then
+			GTEST_CPPFLAGS="-DGTEST_HAS_PTHREAD=1"
+			GTEST_CXXFLAGS="$PTHREAD_CFLAGS"
+			GTEST_LDFLAGS=
+			GTEST_LIBS="$PTHREAD_LIBS"
+		else
+			GTEST_CPPFLAGS="-DGTEST_HAS_PTHREAD=0"
+			GTEST_CXXFLAGS=
+			GTEST_LDFLAGS=
+			GTEST_LIBS=
+		fi
+		AC_SUBST([GTEST_CPPFLAGS])
+		AC_SUBST([GTEST_CXXFLAGS])
+		AC_SUBST([GTEST_LDFLAGS])
+		AC_SUBST([GTEST_LIBS])
+	fi
+else
+	AC_MSG_WARN([C++ compiler not found. Unable to build tests. Continuing...])
+fi
+AM_CONDITIONAL([BUILD_TEST], [test "x${HAVE_GTEST_H}" = "xyes"])
+AC_LANG_POP([C++])
+
 AC_CONFIG_FILES([Makefile])
 
 AC_OUTPUT
diff --git a/sPuReMD/src/allocate.c b/sPuReMD/src/allocate.c
index c9c5321c0e7a18cec76584e97ee02509021a3671..6da598bd527a670cc14d8dcf21a3f7a842c6eea0 100644
--- a/sPuReMD/src/allocate.c
+++ b/sPuReMD/src/allocate.c
@@ -56,7 +56,7 @@ int PreAllocate_Space( reax_system *system, control_params *control,
 
 void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
 {
-    Delete_List( far_nbrs );
+    Delete_List( TYP_FAR_NEIGHBOR, far_nbrs );
 
     if (!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs ))
     {
@@ -73,6 +73,7 @@ void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
 }
 
 
+/* dynamic allocation of memory for matrix in CSR format */
 int Allocate_Matrix( sparse_matrix **pH, int n, int m )
 {
     sparse_matrix *H;
@@ -86,8 +87,8 @@ int Allocate_Matrix( sparse_matrix **pH, int n, int m )
     H->n = n;
     H->m = m;
 
-    if ( (H->start = (unsigned int*) malloc(sizeof(int) * (n + 1))) == NULL
-            || (H->j = (unsigned int*) malloc(sizeof(int) * m)) == NULL
+    if ( (H->start = (unsigned int*) malloc(sizeof(unsigned int) * (n + 1))) == NULL
+            || (H->j = (unsigned int*) malloc(sizeof(unsigned int) * m)) == NULL
             || (H->val = (real*) malloc(sizeof(real) * m)) == NULL )
     {
         return FAILURE;
@@ -97,6 +98,7 @@ int Allocate_Matrix( sparse_matrix **pH, int n, int m )
 }
 
 
+/* deallocate memory for matrix in CSR format */
 void Deallocate_Matrix( sparse_matrix *H )
 {
     free(H->start);
@@ -186,7 +188,7 @@ int Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
         }
     }
 
-    Delete_List( hbonds );
+    Delete_List( TYP_HBOND, hbonds );
 
     Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds );
 
@@ -247,7 +249,7 @@ int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body )
         bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS );
     }
 
-    Delete_List( bonds );
+    Delete_List( TYP_BOND, bonds );
 
     Allocate_Bond_List( n, bond_top, bonds );
     *num_bonds = bond_top[n - 1];
@@ -278,7 +280,7 @@ void Reallocate( reax_system *system, static_storage *workspace, list **lists,
 
     if ( realloc->Htop > 0 )
     {
-        Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop * SAFE_ZONE, "H");
+        Reallocate_Matrix(&(workspace->H), system->N_cm, realloc->Htop * SAFE_ZONE, "H");
         realloc->Htop = -1;
 
         Deallocate_Matrix( workspace->L );
@@ -304,7 +306,7 @@ void Reallocate( reax_system *system, static_storage *workspace, list **lists,
 
     if ( realloc->num_3body > 0 )
     {
-        Delete_List( (*lists) + THREE_BODIES );
+        Delete_List( TYP_THREE_BODY, (*lists) + THREE_BODIES );
 
         if ( num_bonds == -1 )
             num_bonds = ((*lists) + BONDS)->num_intrs;
diff --git a/sPuReMD/src/analyze.c b/sPuReMD/src/analyze.c
index b9f5c33a91ead4e21fd4a7c1d7365b6063c45ca8..e78f533d5743d0dd398d5131b442ae9ad4afe879 100644
--- a/sPuReMD/src/analyze.c
+++ b/sPuReMD/src/analyze.c
@@ -20,12 +20,42 @@
   ----------------------------------------------------------------------*/
 
 #include "analyze.h"
+
 #include "box.h"
 #include "list.h"
 #include "vector.h"
 
+
 #define MAX_FRAGMENT_TYPES 100
 
+
+enum atoms
+{
+    C_ATOM = 0,
+    H_ATOM = 1,
+    O_ATOM = 2,
+    N_ATOM = 3,
+    S_ATOM = 4,
+    SI_ATOM = 5,
+    GE_ATOM = 6,
+    X_ATOM = 7,
+};
+
+enum molecule_type
+{
+    UNKNOWN = 0,
+    WATER = 1,
+};
+
+
+typedef struct
+{
+    int atom_count;
+    int atom_list[MAX_MOLECULE_SIZE];
+    int mtypes[MAX_ATOM_TYPES];
+} molecule;
+
+
 // copy bond list into old bond list
 void Copy_Bond_List( reax_system *system, control_params *control,
                      list **lists )
@@ -772,9 +802,9 @@ void Calculate_Drift( reax_system *system, control_params *control,
             Distance_on_T3_Gen( workspace->x_old[i], system->atoms[i].x,
                                 &(system->box), driftvec );
 
-            if ( fabs( driftvec[0] ) >= system->box.box_norms[0] / 2.0 - 2.0 ||
-                    fabs( driftvec[1] ) >= system->box.box_norms[0] / 2.0 - 2.0 ||
-                    fabs( driftvec[2] ) >= system->box.box_norms[0] / 2.0 - 2.0 )
+            if ( FABS( driftvec[0] ) >= system->box.box_norms[0] / 2.0 - 2.0 ||
+                    FABS( driftvec[1] ) >= system->box.box_norms[1] / 2.0 - 2.0 ||
+                    FABS( driftvec[2] ) >= system->box.box_norms[2] / 2.0 - 2.0 )
             {
                 /* the atom has moved almost half the box size.
                    exclude it from further drift computations as it might have an
diff --git a/sPuReMD/src/bond_orders.c b/sPuReMD/src/bond_orders.c
index 348184680ac82d13a2531753c1c4d1c26049f393..686cf3b0318f53fcc735d73884f82bfb753dab19 100644
--- a/sPuReMD/src/bond_orders.c
+++ b/sPuReMD/src/bond_orders.c
@@ -29,9 +29,10 @@
 inline real Cf45( real p1, real p2 )
 {
     return  -EXP(-p2 / 2) /
-            ( SQR( EXP(-p1 / 2) + EXP(p1 / 2) ) * (EXP(-p2 / 2) + EXP(p2 / 2)) );
+        ( SQR( EXP(-p1 / 2) + EXP(p1 / 2) ) * (EXP(-p2 / 2) + EXP(p2 / 2)) );
 }
 
+
 #ifdef TEST_FORCES
 void Get_dBO( reax_system *system, list **lists,
               int i, int pj, real C, rvec *v )
@@ -152,35 +153,41 @@ void Add_dBOpinpi2_to_Forces( reax_system *system, list **lists,
 
 void Add_dDelta( reax_system *system, list **lists, int i, real C, rvec *v )
 {
-    list *dDeltas = &((*lists)[DDELTA]);
-    int start = Start_Index(i, dDeltas);
-    int end = End_Index(i, dDeltas);
-    int k;
+    list *dDeltas;
+    int start, end, k;
+
+    dDeltas = &((*lists)[DDELTA]);
+    start = Start_Index(i, dDeltas);
+    end = End_Index(i, dDeltas);
 
     for ( k = start; k < end; ++k )
+    {
         rvec_ScaledAdd( v[dDeltas->select.dDelta_list[k].wrt],
-                        C, dDeltas->select.dDelta_list[k].dVal );
+                C, dDeltas->select.dDelta_list[k].dVal );
+    }
 }
 
 
 void Add_dDelta_to_Forces( reax_system *system, list **lists, int i, real C )
 {
-    list *dDeltas = &((*lists)[DDELTA]);
-    int start = Start_Index(i, dDeltas);
-    int end = End_Index(i, dDeltas);
-    int k;
+    list *dDeltas;
+    int start, end, k;
+
+    dDeltas = &((*lists)[DDELTA]);
+    start = Start_Index(i, dDeltas);
+    end = End_Index(i, dDeltas);
 
     for ( k = start; k < end; ++k )
+    {
         rvec_ScaledAdd( system->atoms[dDeltas->select.dDelta_list[k].wrt].f,
-                        C, dDeltas->select.dDelta_list[k].dVal );
+                C, dDeltas->select.dDelta_list[k].dVal );
+    }
 }
 
 
-
 void Calculate_dBO( int i, int pj, static_storage *workspace, list **lists,
-                    int *top )
+        int *top )
 {
-    /* Initializations */
     int j, k, l, start_i, end_i, end_j;
     rvec dDeltap_self, dBOp;
     list *bonds, *dBOs;
@@ -188,34 +195,20 @@ void Calculate_dBO( int i, int pj, static_storage *workspace, list **lists,
     bond_order_data *bo_ij;
     dbond_data *top_dbo;
 
+    /* Initializations */
     bonds = (*lists) + BONDS;
     dBOs = (*lists) + DBO;
-
     j = bonds->select.bond_list[pj].nbr;
     bo_ij = &(bonds->select.bond_list[pj].bo_data);
-
-    /*rvec due_j[1000], due_i[1000];
-      rvec due_j_pi[1000], due_i_pi[1000];
-
-      memset(due_j, 0, sizeof(rvec)*1000 );
-      memset(due_i, 0, sizeof(rvec)*1000 );
-      memset(due_j_pi, 0, sizeof(rvec)*1000 );
-      memset(due_i_pi, 0, sizeof(rvec)*1000 );*/
-
-    //fprintf( stderr,"dbo %d-%d\n",workspace->orig_id[i],workspace->orig_id[j] );
-
-    start_i = Start_Index(i, bonds);
-    end_i = End_Index(i, bonds);
-
-    l = Start_Index(j, bonds);
-    end_j = End_Index(j, bonds);
-
+    start_i = Start_Index( i, bonds );
+    end_i = End_Index( i, bonds );
+    l = Start_Index( j, bonds );
+    end_j = End_Index( j, bonds );
     top_dbo = &(dBOs->select.dbo_list[ (*top) ]);
 
     for ( k = start_i; k < end_i; ++k )
     {
         nbr_k = &(bonds->select.bond_list[k]);
-        //fprintf( stderr, "\tnbr_k = %d\n", workspace->orig_id[nbr_k->nbr] );
 
         for ( ; l < end_j && bonds->select.bond_list[l].nbr < nbr_k->nbr; ++l )
         {
@@ -224,12 +217,10 @@ void Calculate_dBO( int i, int pj, static_storage *workspace, list **lists,
             nbr_l = &(bonds->select.bond_list[l]);
             top_dbo->wrt = nbr_l->nbr;
             rvec_Copy( dBOp, nbr_l->bo_data.dBOp );
-            //fprintf( stderr,"\t\tnbr_l = %d\n",workspace->orig_id[nbr_l->nbr] );
 
             rvec_Scale( top_dbo->dBO, -bo_ij->C3dbo, dBOp );  // dBO, 3rd
             rvec_Scale( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp );  // dBOpi, 4th
             rvec_Scale( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );// dBOpipi, 4th
-            //rvec_ScaledAdd(due_j[top_dbo->wrt],-bo_ij->BO*bo_ij->A2_ji, dBOp);
 
             if ( nbr_l->nbr == i )
             {
@@ -240,24 +231,19 @@ void Calculate_dBO( int i, int pj, static_storage *workspace, list **lists,
                 rvec_ScaledAdd( top_dbo->dBO, bo_ij->C2dbo, dDeltap_self ); //2nd
 
                 /* dBOpi */
-                rvec_ScaledAdd(top_dbo->dBOpi, bo_ij->C1dbopi, bo_ij->dln_BOp_pi); //1
-                rvec_ScaledAdd(top_dbo->dBOpi, bo_ij->C2dbopi, bo_ij->dBOp); //2nd
-                rvec_ScaledAdd(top_dbo->dBOpi, bo_ij->C3dbopi, dDeltap_self); //3rd
+                rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C1dbopi, bo_ij->dln_BOp_pi ); //1
+                rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C2dbopi, bo_ij->dBOp ); //2nd
+                rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C3dbopi, dDeltap_self ); //3rd
 
                 /* dBOpp, 1st */
-                rvec_ScaledAdd(top_dbo->dBOpi2, bo_ij->C1dbopi2, bo_ij->dln_BOp_pi2);
-                rvec_ScaledAdd(top_dbo->dBOpi2, bo_ij->C2dbopi2, bo_ij->dBOp); //2nd
-                rvec_ScaledAdd(top_dbo->dBOpi2, bo_ij->C3dbopi2, dDeltap_self); //3rd
-
-                /* do the adjustments on i */
-                //rvec_ScaledAdd( due_i[i],
-                //bo_ij->A0_ij + bo_ij->BO * bo_ij->A1_ij, bo_ij->dBOp );//1st,dBO
-                //rvec_ScaledAdd( due_i[i], bo_ij->BO * bo_ij->A2_ij,
-                //dDeltap_self ); //2nd, dBO
+                rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C1dbopi2, bo_ij->dln_BOp_pi2 );
+                rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C2dbopi2, bo_ij->dBOp ); //2nd
+                rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C3dbopi2, dDeltap_self ); //3rd
+
             }
 
-            //rvec_Add( workspace->dDelta[nbr_l->nbr], top_dbo->dBO );
-            ++(*top), ++top_dbo;
+            ++(*top);
+            ++top_dbo;
         }
 
         /* Now we are processing neighbor k of i. */
@@ -267,11 +253,6 @@ void Calculate_dBO( int i, int pj, static_storage *workspace, list **lists,
         rvec_Scale( top_dbo->dBO, -bo_ij->C2dbo, dBOp );      //dBO-2
         rvec_Scale( top_dbo->dBOpi, -bo_ij->C3dbopi, dBOp );  //dBOpi-3
         rvec_Scale( top_dbo->dBOpi2, -bo_ij->C3dbopi2, dBOp );//dBOpp-3
-        //rvec_ScaledAdd(due_i[top_dbo->wrt],-bo_ij->BO*bo_ij->A2_ij,dBOp);//dBO-2
-
-        // fprintf( stderr, "\tnbr_k = %d, nbr_l = %d, l = %d, end_j = %d\n",
-        //      workspace->orig_id[nbr_k->nbr],
-        //       workspace->orig_id[bonds->select.bond_list[l].nbr], l, end_j );
 
         if ( l < end_j && bonds->select.bond_list[l].nbr == nbr_k->nbr )
         {
@@ -283,9 +264,6 @@ void Calculate_dBO( int i, int pj, static_storage *workspace, list **lists,
             rvec_ScaledAdd( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp );  //dBOpi,4th
             rvec_ScaledAdd( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );//dBOpp.4th
             ++l;
-
-            //rvec_ScaledAdd( due_j[top_dbo->wrt], -bo_ij->BO * bo_ij->A2_ji,
-            //nbr_l->bo_data.dBOp ); //3rd, dBO
         }
         else if ( k == pj )
         {
@@ -296,22 +274,16 @@ void Calculate_dBO( int i, int pj, static_storage *workspace, list **lists,
             rvec_ScaledAdd( top_dbo->dBO, bo_ij->C3dbo, dDeltap_self );// 3rd, dBO
 
             /* dBOpi, 1st */
-            rvec_ScaledAdd(top_dbo->dBOpi, -bo_ij->C1dbopi, bo_ij->dln_BOp_pi);
-            rvec_ScaledAdd(top_dbo->dBOpi, -bo_ij->C2dbopi, bo_ij->dBOp);    //2nd
+            rvec_ScaledAdd( top_dbo->dBOpi, -bo_ij->C1dbopi, bo_ij->dln_BOp_pi );
+            rvec_ScaledAdd( top_dbo->dBOpi, -bo_ij->C2dbopi, bo_ij->dBOp );    //2nd
             rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C4dbopi, dDeltap_self );  //4th
 
             /* dBOpi2, 1st */
-            rvec_ScaledAdd(top_dbo->dBOpi2, -bo_ij->C1dbopi2, bo_ij->dln_BOp_pi2 );
-            rvec_ScaledAdd(top_dbo->dBOpi2, -bo_ij->C2dbopi2, bo_ij->dBOp ); //2nd
-            rvec_ScaledAdd(top_dbo->dBOpi2, bo_ij->C4dbopi2, dDeltap_self ); //4th
-
-            //rvec_ScaledAdd( due_j[j], -(bo_ij->A0_ij + bo_ij->BO*bo_ij->A1_ij),
-            //bo_ij->dBOp ); //1st, dBO
-            //rvec_ScaledAdd( due_j[j], bo_ij->BO * bo_ij->A2_ji,
-            //workspace->dDeltap_self[j] ); //3rd, dBO
+            rvec_ScaledAdd( top_dbo->dBOpi2, -bo_ij->C1dbopi2, bo_ij->dln_BOp_pi2 );
+            rvec_ScaledAdd( top_dbo->dBOpi2, -bo_ij->C2dbopi2, bo_ij->dBOp ); //2nd
+            rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C4dbopi2, dDeltap_self ); //4th
         }
 
-        // rvec_Add( workspace->dDelta[nbr_k->nbr], top_dbo->dBO );
         ++(*top), ++top_dbo;
     }
 
@@ -322,15 +294,11 @@ void Calculate_dBO( int i, int pj, static_storage *workspace, list **lists,
         nbr_l = &(bonds->select.bond_list[l]);
         top_dbo->wrt = nbr_l->nbr;
         rvec_Copy( dBOp, nbr_l->bo_data.dBOp );
-        //fprintf( stderr,"\tl=%d, nbr_l=%d\n",l,workspace->orig_id[nbr_l->nbr] );
 
         rvec_Scale( top_dbo->dBO, -bo_ij->C3dbo, dBOp );      //3rd, dBO
         rvec_Scale( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp );  //4th, dBOpi
         rvec_Scale( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );//4th, dBOpp
 
-        // rvec_ScaledAdd( due_j[top_dbo->wrt], -bo_ij->BO * bo_ij->A2_ji,
-        // nbr_l->bo_data.dBOp );
-
         if ( nbr_l->nbr == i )
         {
             /* do the adjustments on i */
@@ -349,31 +317,16 @@ void Calculate_dBO( int i, int pj, static_storage *workspace, list **lists,
             rvec_ScaledAdd(top_dbo->dBOpi2, bo_ij->C1dbopi2, bo_ij->dln_BOp_pi2);
             rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C2dbopi2, bo_ij->dBOp ); //2nd
             rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C3dbopi2, dDeltap_self );//3rd
-
-            //rvec_ScaledAdd( due_i[i], bo_ij->A0_ij + bo_ij->BO * bo_ij->A1_ij,
-            //bo_ij->dBOp );  /*1st, dBO*/
-            //rvec_ScaledAdd( due_i[i], bo_ij->BO * bo_ij->A2_ij,
-            //dDeltap_self ); /*2nd, dBO*/
         }
 
-        // rvec_Add( workspace->dDelta[nbr_l->nbr], top_dbo->dBO );
         ++(*top), ++top_dbo;
     }
-
-    /*for( k = 0; k < 21; ++k ){
-      fprintf( stderr, "%d %d %d, due_i:[%g %g %g]\n",
-      i+1, j+1, k+1, due_i[k][0], due_i[k][1], due_i[k][2] );
-      fprintf( stderr, "%d %d %d, due_j:[%g %g %g]\n",
-      i+1, j+1, k+1, due_j[k][0], due_j[k][1], due_j[k][2] );
-      }*/
 }
 #endif
 
 
-
 void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
-                              simulation_data *data, static_storage *workspace,
-                              list **lists )
+        simulation_data *data, static_storage *workspace, list **lists )
 {
     list *bonds = (*lists) + BONDS;
     bond_data *nbr_j, *nbr_k;
@@ -382,12 +335,23 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
     rvec temp, ext_press;
     ivec rel_box;
     int pk, k, j;
+    rvec *f_i, *f_j, *f_k;
+#ifdef _OPENMP
+    int tid = omp_get_thread_num( );
+#endif
 
     /* Initializations */
     nbr_j = &(bonds->select.bond_list[pj]);
     j = nbr_j->nbr;
     bo_ij = &(nbr_j->bo_data);
     bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+#ifdef _OPENMP
+    f_i = &(workspace->f_local[tid * system->N + i]);
+    f_j = &(workspace->f_local[tid * system->N + j]);
+#else
+    f_i = &(system->atoms[i].f);
+    f_j = &(system->atoms[j].f);
+#endif
 
     coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
     coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
@@ -407,7 +371,6 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
     coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i] + workspace->CdDelta[j]);
     coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i] + workspace->CdDelta[j]);
 
-
     /************************************
     * forces related to atom i          *
     * first neighbors of atom i         *
@@ -416,6 +379,11 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
     {
         nbr_k = &(bonds->select.bond_list[pk]);
         k = nbr_k->nbr;
+#ifdef _OPENMP
+        f_k = &(workspace->f_local[tid * system->N + k]);
+#else
+        f_k = &(system->atoms[k].f);
+#endif
 
         rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp );       /*2nd,dBO*/
         rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp );/*dDelta*/
@@ -423,10 +391,15 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
         rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd,dBOpi2*/
 
         /* force */
-        rvec_Add( system->atoms[k].f, temp );
+        rvec_Add( *f_k, temp );
         /* pressure */
         rvec_iMultiply( ext_press, nbr_k->rel_box, temp );
-        rvec_Add( data->ext_press, ext_press );
+#ifdef _OPENMP
+        #pragma omp critical (Add_dBond_to_Forces_NPT_ext_press)
+#endif
+        {
+            rvec_Add( data->ext_press, ext_press );
+        }
 
         /* if( !ivec_isZero( nbr_k->rel_box ) )
            fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]
@@ -451,15 +424,14 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
     rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp );               /*2nd,dBOpi*/
     rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i] );/*3rd,dBOpi*/
 
-    rvec_ScaledAdd(temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2) ;      /*1st,dBO_pi2*/
-    rvec_ScaledAdd(temp, coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBO_pi2*/
-    rvec_ScaledAdd(temp, coef.C3dbopi2, workspace->dDeltap_self[i]);/*3rd,dBO_pi2*/
+    rvec_ScaledAdd( temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2 ) ;      /*1st,dBO_pi2*/
+    rvec_ScaledAdd( temp, coef.C2dbopi2, bo_ij->dBOp );              /*2nd,dBO_pi2*/
+    rvec_ScaledAdd( temp, coef.C3dbopi2, workspace->dDeltap_self[i] );/*3rd,dBO_pi2*/
 
     /* force */
-    rvec_Add( system->atoms[i].f, temp );
+    rvec_Add( *f_i, temp );
     /* ext pressure due to i dropped, counting force on j only will be enough */
 
-
     /****************************************************************************
      * forces and pressure related to atom j                                    *
      * first neighbors of atom j                                                *
@@ -468,6 +440,11 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
     {
         nbr_k = &(bonds->select.bond_list[pk]);
         k = nbr_k->nbr;
+#ifdef _OPENMP
+        f_k = &(workspace->f_local[tid * system->N + k]);
+#else
+        f_k = &(system->atoms[k].f);
+#endif
 
         rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp );       /*3rd,dBO*/
         rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp );/*dDelta*/
@@ -475,13 +452,18 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
         rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );/*4th,dBOpi2*/
 
         /* force */
-        rvec_Add( system->atoms[k].f, temp );
+        rvec_Add( *f_k, temp );
         /* pressure */
         if ( k != i )
         {
-            ivec_Sum(rel_box, nbr_k->rel_box, nbr_j->rel_box);//k's rel_box  wrt i
+            ivec_Sum( rel_box, nbr_k->rel_box, nbr_j->rel_box );//k's rel_box  wrt i
             rvec_iMultiply( ext_press, rel_box, temp );
-            rvec_Add( data->ext_press, ext_press );
+#ifdef _OPENMP
+            #pragma omp critical (Add_dBond_to_Forces_NPT_ext_press)
+#endif
+            {
+                rvec_Add( data->ext_press, ext_press );
+            }
 
             /* if( !ivec_isZero( rel_box ) )
             fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]
@@ -507,15 +489,20 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
     rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp );              /*2nd,dBOpi*/
     rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j] );/*3rd,dBOpi*/
 
-    rvec_ScaledAdd(temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2);       /*1st,dBOpi2*/
-    rvec_ScaledAdd(temp, -coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBOpi2*/
-    rvec_ScaledAdd(temp, coef.C4dbopi2, workspace->dDeltap_self[j]);/*3rd,dBOpi2*/
+    rvec_ScaledAdd( temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );       /*1st,dBOpi2*/
+    rvec_ScaledAdd( temp, -coef.C2dbopi2, bo_ij->dBOp );              /*2nd,dBOpi2*/
+    rvec_ScaledAdd( temp, coef.C4dbopi2, workspace->dDeltap_self[j] );/*3rd,dBOpi2*/
 
     /* force */
-    rvec_Add( system->atoms[j].f, temp );
+    rvec_Add( *f_j, temp );
     /* pressure */
     rvec_iMultiply( ext_press, nbr_j->rel_box, temp );
-    rvec_Add( data->ext_press, ext_press );
+#ifdef _OPENMP
+    #pragma omp critical (Add_dBond_to_Forces_NPT_ext_press)
+#endif
+    {
+        rvec_Add( data->ext_press, ext_press );
+    }
 
     /* if( !ivec_isZero( nbr_j->rel_box ) )
        fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]
@@ -528,22 +515,31 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
 }
 
 
-
 void Add_dBond_to_Forces( int i, int pj, reax_system *system,
-                          simulation_data *data, static_storage *workspace,
-                          list **lists )
+        simulation_data *data, static_storage *workspace, list **lists )
 {
     list *bonds = (*lists) + BONDS;
     bond_data *nbr_j, *nbr_k;
     bond_order_data *bo_ij, *bo_ji;
     dbond_coefficients coef;
     int pk, k, j;
+    rvec *f_i, *f_j, *f_k;
+#ifdef _OPENMP
+    int tid = omp_get_thread_num( );
+#endif
 
     /* Initializations */
     nbr_j = &(bonds->select.bond_list[pj]);
     j = nbr_j->nbr;
     bo_ij = &(nbr_j->bo_data);
     bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+#ifdef _OPENMP
+    f_i = &(workspace->f_local[tid * system->N + i]);
+    f_j = &(workspace->f_local[tid * system->N + j]);
+#else
+    f_i = &(system->atoms[i].f);
+    f_j = &(system->atoms[j].f);
+#endif
 
     coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
     coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
@@ -567,79 +563,88 @@ void Add_dBond_to_Forces( int i, int pj, reax_system *system,
     {
         nbr_k = &(bonds->select.bond_list[pk]);
         k = nbr_k->nbr;
+#ifdef _OPENMP
+        f_k = &(workspace->f_local[tid * system->N + k]);
+#else
+        f_k = &(system->atoms[k].f);
+#endif
 
-        rvec_ScaledAdd( system->atoms[k].f, -coef.C2dbo, nbr_k->bo_data.dBOp );
+        rvec_ScaledAdd( *f_k, -coef.C2dbo, nbr_k->bo_data.dBOp );
         /*2nd, dBO*/
-        rvec_ScaledAdd( system->atoms[k].f, -coef.C2dDelta, nbr_k->bo_data.dBOp );
+        rvec_ScaledAdd( *f_k, -coef.C2dDelta, nbr_k->bo_data.dBOp );
         /*dDelta*/
-        rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbopi, nbr_k->bo_data.dBOp );
+        rvec_ScaledAdd( *f_k, -coef.C3dbopi, nbr_k->bo_data.dBOp );
         /*3rd, dBOpi*/
-        rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
+        rvec_ScaledAdd( *f_k, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
         /*3rd, dBOpi2*/
     }
 
-    rvec_ScaledAdd( system->atoms[i].f, coef.C1dbo, bo_ij->dBOp );
+    rvec_ScaledAdd( *f_i, coef.C1dbo, bo_ij->dBOp );
     /*1st, dBO*/
-    rvec_ScaledAdd( system->atoms[i].f, coef.C2dbo, workspace->dDeltap_self[i] );
+    rvec_ScaledAdd( *f_i, coef.C2dbo, workspace->dDeltap_self[i] );
     /*2nd, dBO*/
 
-    rvec_ScaledAdd(system->atoms[i].f, coef.C1dDelta, bo_ij->dBOp);
+    rvec_ScaledAdd( *f_i, coef.C1dDelta, bo_ij->dBOp );
     /*1st, dBO*/
-    rvec_ScaledAdd(system->atoms[i].f, coef.C2dDelta, workspace->dDeltap_self[i]);
+    rvec_ScaledAdd( *f_i, coef.C2dDelta, workspace->dDeltap_self[i] );
     /*2nd, dBO*/
 
-    rvec_ScaledAdd( system->atoms[i].f, coef.C1dbopi, bo_ij->dln_BOp_pi );
+    rvec_ScaledAdd( *f_i, coef.C1dbopi, bo_ij->dln_BOp_pi );
     /*1st, dBOpi*/
-    rvec_ScaledAdd( system->atoms[i].f, coef.C2dbopi, bo_ij->dBOp );
+    rvec_ScaledAdd( *f_i, coef.C2dbopi, bo_ij->dBOp );
     /*2nd, dBOpi*/
-    rvec_ScaledAdd(system->atoms[i].f, coef.C3dbopi, workspace->dDeltap_self[i]);
+    rvec_ScaledAdd( *f_i, coef.C3dbopi, workspace->dDeltap_self[i] );
     /*3rd, dBOpi*/
 
-    rvec_ScaledAdd( system->atoms[i].f, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+    rvec_ScaledAdd( *f_i, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
     /*1st, dBO_pi2*/
-    rvec_ScaledAdd( system->atoms[i].f, coef.C2dbopi2, bo_ij->dBOp );
+    rvec_ScaledAdd( *f_i, coef.C2dbopi2, bo_ij->dBOp );
     /*2nd, dBO_pi2*/
-    rvec_ScaledAdd(system->atoms[i].f, coef.C3dbopi2, workspace->dDeltap_self[i]);
+    rvec_ScaledAdd( *f_i, coef.C3dbopi2, workspace->dDeltap_self[i] );
     /*3rd, dBO_pi2*/
 
-
     for ( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk )
     {
         nbr_k = &(bonds->select.bond_list[pk]);
         k = nbr_k->nbr;
+#ifdef _OPENMP
+        f_k = &(workspace->f_local[tid * system->N + k]);
+#else
+        f_k = &(system->atoms[k].f);
+#endif
 
-        rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbo, nbr_k->bo_data.dBOp );
+        rvec_ScaledAdd( *f_k, -coef.C3dbo, nbr_k->bo_data.dBOp );
         /*3rd, dBO*/
-        rvec_ScaledAdd( system->atoms[k].f, -coef.C3dDelta, nbr_k->bo_data.dBOp );
+        rvec_ScaledAdd( *f_k, -coef.C3dDelta, nbr_k->bo_data.dBOp );
         /*dDelta*/
-        rvec_ScaledAdd( system->atoms[k].f, -coef.C4dbopi, nbr_k->bo_data.dBOp );
+        rvec_ScaledAdd( *f_k, -coef.C4dbopi, nbr_k->bo_data.dBOp );
         /*4th, dBOpi*/
-        rvec_ScaledAdd( system->atoms[k].f, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
+        rvec_ScaledAdd( *f_k, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
         /*4th, dBOpi2*/
     }
 
-    rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbo, bo_ij->dBOp );
+    rvec_ScaledAdd( *f_j, -coef.C1dbo, bo_ij->dBOp );
     /*1st, dBO*/
-    rvec_ScaledAdd( system->atoms[j].f, coef.C3dbo, workspace->dDeltap_self[j] );
+    rvec_ScaledAdd( *f_j, coef.C3dbo, workspace->dDeltap_self[j] );
     /*2nd, dBO*/
 
-    rvec_ScaledAdd( system->atoms[j].f, -coef.C1dDelta, bo_ij->dBOp );
+    rvec_ScaledAdd( *f_j, -coef.C1dDelta, bo_ij->dBOp );
     /*1st, dBO*/
-    rvec_ScaledAdd(system->atoms[j].f, coef.C3dDelta, workspace->dDeltap_self[j]);
+    rvec_ScaledAdd( *f_j, coef.C3dDelta, workspace->dDeltap_self[j] );
     /*2nd, dBO*/
 
-    rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbopi, bo_ij->dln_BOp_pi );
+    rvec_ScaledAdd( *f_j, -coef.C1dbopi, bo_ij->dln_BOp_pi );
     /*1st, dBOpi*/
-    rvec_ScaledAdd( system->atoms[j].f, -coef.C2dbopi, bo_ij->dBOp );
+    rvec_ScaledAdd( *f_j, -coef.C2dbopi, bo_ij->dBOp );
     /*2nd, dBOpi*/
-    rvec_ScaledAdd(system->atoms[j].f, coef.C4dbopi, workspace->dDeltap_self[j]);
+    rvec_ScaledAdd( *f_j, coef.C4dbopi, workspace->dDeltap_self[j] );
     /*3rd, dBOpi*/
 
-    rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+    rvec_ScaledAdd( *f_j, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
     /*1st, dBOpi2*/
-    rvec_ScaledAdd( system->atoms[j].f, -coef.C2dbopi2, bo_ij->dBOp );
+    rvec_ScaledAdd( *f_j, -coef.C2dbopi2, bo_ij->dBOp );
     /*2nd, dBOpi2*/
-    rvec_ScaledAdd(system->atoms[j].f, coef.C4dbopi2, workspace->dDeltap_self[j]);
+    rvec_ScaledAdd( *f_j, coef.C4dbopi2, workspace->dDeltap_self[j] );
     /*3rd, dBOpi2*/
 }
 
@@ -659,8 +664,13 @@ int Locate_Symmetric_Bond( list *bonds, int i, int j )
         /*fprintf( stderr, "\tstart: %d   end: %d   mid: %d\n",
         start, end, mid );*/
         if ( mid_nbr < j )
+        {
             start = mid + 1;
-        else end = mid - 1;
+        }
+        else
+        {
+            end = mid - 1;
+        }
 
         mid = (start + end) / 2;
     }
@@ -703,401 +713,432 @@ int compare_bonds( const void *p1, const void *p2 )
    belonging to a different atom in nbrhoods->nbr_list is sorted in its own.
    This can either be done in the general coordinator function or here */
 void Calculate_Bond_Orders( reax_system *system, control_params *control,
-                            simulation_data *data, static_storage *workspace,
-                            list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
-    int i, j, pj, type_i, type_j;
-    int start_i, end_i;
-    int num_bonds, sym_index;
+    real p_lp1;
     real p_boc1, p_boc2;
-    real val_i, Deltap_i, Deltap_boc_i;
-    real val_j, Deltap_j, Deltap_boc_j;
-    real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
-    real exp_p1i, exp_p2i, exp_p1j, exp_p2j;
-    real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
-    real Cf45_ij, Cf45_ji, p_lp1;
-    real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
-    real explp1;
-    two_body_parameters *twbp;
-    bond_order_data *bo_ij, *bo_ji;
-    single_body_parameters *sbp_i, *sbp_j;
-    list *bonds = (*lists) + BONDS;
-#if defined(TEST_FORCES)
-    int  k, pk, start_j, end_j;
-    int  top_dbo = 0, top_dDelta = 0;
-    dbond_data *pdbo;
-    dDelta_data *ptop_dDelta;
-    list *dDeltas = (*lists) + DDELTA;
-    list *dBOs = (*lists) + DBO;
-#endif
+    list *bonds;
 
-    num_bonds = 0;
+    p_lp1 = system->reaxprm.gp.l[15];
     p_boc1 = system->reaxprm.gp.l[0];
     p_boc2 = system->reaxprm.gp.l[1];
+    bonds = (*lists) + BONDS;
 
-    /* Calculate Deltaprime, Deltaprime_boc values */
-    for ( i = 0; i < system->N; ++i )
+#ifdef _OPENMP
+    #pragma omp parallel default(shared)
+#endif
     {
-        type_i = system->atoms[i].type;
-        sbp_i = &(system->reaxprm.sbp[type_i]);
-        workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency;
-        workspace->Deltap_boc[i] =
-            workspace->total_bond_order[i] - sbp_i->valency_val;
-        workspace->total_bond_order[i] = 0;
-    }
-    // fprintf( stderr, "done with uncorrected bond orders\n" );
-
+        int i, j, pj, type_i, type_j;
+        int start_i, end_i;
+        int sym_index;
+        real val_i, Deltap_i, Deltap_boc_i;
+        real val_j, Deltap_j, Deltap_boc_j;
+        real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
+        real exp_p1i, exp_p2i, exp_p1j, exp_p2j;
+        real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
+        real Cf45_ij, Cf45_ji;
+        real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
+        real explp1;
+        two_body_parameters *twbp;
+        bond_order_data *bo_ij, *bo_ji;
+        single_body_parameters *sbp_i, *sbp_j;
+#if defined(TEST_FORCES)
+        int k, pk, start_j, end_j;
+        int top_dbo, top_dDelta;
+        dbond_data *pdbo;
+        dDelta_data *ptop_dDelta;
+        list *dDeltas;
+        list *dBOs;
+
+        top_dbo = 0;
+        top_dDelta = 0;
+        dDeltas = (*lists) + DDELTA;
+        dBOs = (*lists) + DBO;
+#endif
 
-    /* Corrected Bond Order calculations */
-    for ( i = 0; i < system->N; ++i )
-    {
-        type_i = system->atoms[i].type;
-        sbp_i = &(system->reaxprm.sbp[type_i]);
-        val_i = sbp_i->valency;
-        Deltap_i = workspace->Deltap[i];
-        Deltap_boc_i = workspace->Deltap_boc[i];
-        start_i = Start_Index(i, bonds);
-        end_i = End_Index(i, bonds);
-        //fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n",
-        //       i+1, Deltap_i, Deltap_boc_i, start_i, end_i );
-
-        for ( pj = start_i; pj < end_i; ++pj )
+        /* Calculate Deltaprime, Deltaprime_boc values */
+#ifdef _OPENMP
+        #pragma omp for schedule(static)
+#endif
+        for ( i = 0; i < system->N; ++i )
         {
-            j = bonds->select.bond_list[pj].nbr;
-            type_j = system->atoms[j].type;
-            bo_ij = &( bonds->select.bond_list[pj].bo_data );
-            //fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
+            type_i = system->atoms[i].type;
+            sbp_i = &(system->reaxprm.sbp[type_i]);
+            workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency;
+            workspace->Deltap_boc[i] =
+                workspace->total_bond_order[i] - sbp_i->valency_val;
+            workspace->total_bond_order[i] = 0.0;
+        }
 
-            if ( i < j )
-            {
-                twbp = &( system->reaxprm.tbp[type_i][type_j] );
-#ifdef TEST_FORCES
-                Set_Start_Index( pj, top_dbo, dBOs );
-                /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n",
-                   workspace->reverse_map[i], workspace->reverse_map[j],
-                   twbp->ovc, twbp->v13cor, bo_ij->BO ); */
+        /* wait until initialization is complete */
+#ifdef _OPENMP
+        #pragma omp barrier
+#endif
+
+        /* Corrected Bond Order calculations */
+#ifdef _OPENMP
+        #pragma omp for schedule(guided)
 #endif
-                if ( twbp->ovc < 0.001 && twbp->v13cor < 0.001 )
+        for ( i = 0; i < system->N; ++i )
+        {
+            type_i = system->atoms[i].type;
+            sbp_i = &(system->reaxprm.sbp[type_i]);
+            val_i = sbp_i->valency;
+            Deltap_i = workspace->Deltap[i];
+            Deltap_boc_i = workspace->Deltap_boc[i];
+            start_i = Start_Index(i, bonds);
+            end_i = End_Index(i, bonds);
+
+            for ( pj = start_i; pj < end_i; ++pj )
+            {
+                j = bonds->select.bond_list[pj].nbr;
+                type_j = system->atoms[j].type;
+                bo_ij = &( bonds->select.bond_list[pj].bo_data );
+
+                if ( i < j )
                 {
-                    /* There is no correction to bond orders nor to derivatives of
-                       bond order prime! So we leave bond orders unchanged and
-                       set derivative of bond order coefficients s.t.
-                       dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */
-                    bo_ij->C1dbo = 1.000000;
-                    bo_ij->C2dbo = 0.000000;
-                    bo_ij->C3dbo = 0.000000;
-
-                    bo_ij->C1dbopi = bo_ij->BO_pi;
-                    bo_ij->C2dbopi = 0.000000;
-                    bo_ij->C3dbopi = 0.000000;
-                    bo_ij->C4dbopi = 0.000000;
-
-                    bo_ij->C1dbopi2 = bo_ij->BO_pi2;
-                    bo_ij->C2dbopi2 = 0.000000;
-                    bo_ij->C3dbopi2 = 0.000000;
-                    bo_ij->C4dbopi2 = 0.000000;
+                    twbp = &( system->reaxprm.tbp[type_i][type_j] );
 
 #ifdef TEST_FORCES
-                    pdbo = &(dBOs->select.dbo_list[ top_dbo ]);
-
-                    // compute dBO_ij/dr_i
-                    pdbo->wrt = i;
-                    rvec_Copy( pdbo->dBO, bo_ij->dBOp );
-                    rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi );
-                    rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 );
-
-                    // compute dBO_ij/dr_j
-                    pdbo++;
-                    pdbo->wrt = j;
-                    rvec_Scale( pdbo->dBO, -1.0, bo_ij->dBOp );
-                    rvec_Scale( pdbo->dBOpi, -bo_ij->BO_pi, bo_ij->dln_BOp_pi );
-                    rvec_Scale( pdbo->dBOpi2, -bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 );
-
-                    top_dbo += 2;
+                    Set_Start_Index( pj, top_dbo, dBOs );
+                    /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n",
+                       workspace->reverse_map[i], workspace->reverse_map[j],
+                       twbp->ovc, twbp->v13cor, bo_ij->BO ); */
 #endif
-                }
-                else
-                {
-                    val_j = system->reaxprm.sbp[type_j].valency;
-                    Deltap_j = workspace->Deltap[j];
-                    Deltap_boc_j = workspace->Deltap_boc[j];
 
-                    /* on page 1 */
-                    if ( twbp->ovc >= 0.001 )
+                    if ( twbp->ovc < 0.001 && twbp->v13cor < 0.001 )
                     {
-                        /* Correction for overcoordination */
-                        exp_p1i = EXP( -p_boc1 * Deltap_i );
-                        exp_p2i = EXP( -p_boc2 * Deltap_i );
-                        exp_p1j = EXP( -p_boc1 * Deltap_j );
-                        exp_p2j = EXP( -p_boc2 * Deltap_j );
-
-                        f2 = exp_p1i + exp_p1j;
-                        f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i  + exp_p2j ) );
-                        f1 = 0.5 * ( ( val_i + f2 ) / ( val_i + f2 + f3 ) +
-                                     ( val_j + f2 ) / ( val_j + f2 + f3 ) );
-
-                        /*fprintf( stderr,"%6d%6d\t%g %g   j:%g %g  p_boc:%g %g\n",
-                          i+1, j+1, val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2 );
-                          fprintf( stderr,"\tf:%g  %g  %g, exp:%g %g %g %g\n",
-                          f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/
-
-                        /* Now come the derivates */
-                        /* Bond Order pages 5-7, derivative of f1 */
-                        temp = f2 + f3;
-                        u1_ij = val_i + temp;
-                        u1_ji = val_j + temp;
-                        Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji ));
-                        Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) +
-                                          ( u1_ji - f3 ) / SQR( u1_ji ));
-
-                        //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i +
-                        //          Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j );
-                        Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij -
-                                          ((val_i + f2) / SQR(u1_ij)) *
-                                          ( -p_boc1 * exp_p1i +
-                                            exp_p2i / ( exp_p2i + exp_p2j ) ) +
-                                          -p_boc1 * exp_p1i / u1_ji -
-                                          ((val_j + f2) / SQR(u1_ji)) * ( -p_boc1 * exp_p1i +
-                                                  exp_p2i / ( exp_p2i + exp_p2j ) ));
-
-                        Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j +
-                                 Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j );
-                        //fprintf( stderr, "\tCf1:%g  %g\n", Cf1_ij, Cf1_ji );
+                        /* There is no correction to bond orders nor to derivatives of
+                           bond order prime! So we leave bond orders unchanged and
+                           set the bond order derivative coefficients s.t.
+                           dBO = dBOp & dBOxx = dBOxxp in Add_dBond_to_Forces */
+                        bo_ij->C1dbo = 1.000000;
+                        bo_ij->C2dbo = 0.000000;
+                        bo_ij->C3dbo = 0.000000;
+
+                        bo_ij->C1dbopi = bo_ij->BO_pi;
+                        bo_ij->C2dbopi = 0.000000;
+                        bo_ij->C3dbopi = 0.000000;
+                        bo_ij->C4dbopi = 0.000000;
+
+                        bo_ij->C1dbopi2 = bo_ij->BO_pi2;
+                        bo_ij->C2dbopi2 = 0.000000;
+                        bo_ij->C3dbopi2 = 0.000000;
+                        bo_ij->C4dbopi2 = 0.000000;
+
+#ifdef TEST_FORCES
+                        pdbo = &(dBOs->select.dbo_list[ top_dbo ]);
+
+                        /* compute dBO_ij/dr_i */
+                        pdbo->wrt = i;
+                        rvec_Copy( pdbo->dBO, bo_ij->dBOp );
+                        rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi );
+                        rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 );
+
+                        /* compute dBO_ij/dr_j */
+                        pdbo++;
+                        pdbo->wrt = j;
+                        rvec_Scale( pdbo->dBO, -1.0, bo_ij->dBOp );
+                        rvec_Scale( pdbo->dBOpi, -bo_ij->BO_pi, bo_ij->dln_BOp_pi );
+                        rvec_Scale( pdbo->dBOpi2, -bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 );
+
+                        top_dbo += 2;
+#endif
                     }
                     else
                     {
-                        /* No overcoordination correction! */
-                        f1 = 1.0;
-                        Cf1_ij = Cf1_ji = 0.0;
+                        val_j = system->reaxprm.sbp[type_j].valency;
+                        Deltap_j = workspace->Deltap[j];
+                        Deltap_boc_j = workspace->Deltap_boc[j];
+
+                        /* on page 1 */
+                        if ( twbp->ovc >= 0.001 )
+                        {
+                            /* Correction for overcoordination */
+                            exp_p1i = EXP( -p_boc1 * Deltap_i );
+                            exp_p2i = EXP( -p_boc2 * Deltap_i );
+                            exp_p1j = EXP( -p_boc1 * Deltap_j );
+                            exp_p2j = EXP( -p_boc2 * Deltap_j );
+
+                            f2 = exp_p1i + exp_p1j;
+                            f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i  + exp_p2j ) );
+                            f1 = 0.5 * ( ( val_i + f2 ) / ( val_i + f2 + f3 ) +
+                                    ( val_j + f2 ) / ( val_j + f2 + f3 ) );
+
+                            /* Now come the derivatives */
+                            /* Bond Order pages 5-7, derivative of f1 */
+                            temp = f2 + f3;
+                            u1_ij = val_i + temp;
+                            u1_ji = val_j + temp;
+                            Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji ));
+                            Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) +
+                                    ( u1_ji - f3 ) / SQR( u1_ji ));
+
+                            //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i +
+                            //          Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j );
+                            Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij -
+                                    ((val_i + f2) / SQR(u1_ij)) * ( -p_boc1 * exp_p1i +
+                                    exp_p2i / ( exp_p2i + exp_p2j ) ) + -p_boc1 * exp_p1i / u1_ji -
+                                    ((val_j + f2) / SQR(u1_ji)) * ( -p_boc1 * exp_p1i +
+                                    exp_p2i / ( exp_p2i + exp_p2j ) ));
+
+                            Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j +
+                                Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j );
+                        }
+                        else
+                        {
+                            /* No overcoordination correction! */
+                            f1 = 1.0;
+                            Cf1_ij = Cf1_ji = 0.0;
+                        }
+
+                        if ( twbp->v13cor >= 0.001 )
+                        {
+                            /* Correction for 1-3 bond orders */
+                            exp_f4 = EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
+                                           Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
+                            exp_f5 = EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
+                                           Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
+
+                            f4 = 1. / (1. + exp_f4);
+                            f5 = 1. / (1. + exp_f5);
+                            f4f5 = f4 * f5;
+
+                            /* Bond Order pages 8-9, derivative of f4 and f5 */
+                            /*temp = twbp->p_boc5 -
+                              twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO );
+                              u_ij = temp + twbp->p_boc3 * Deltap_boc_i;
+                              u_ji = temp + twbp->p_boc3 * Deltap_boc_j;
+                              Cf45_ij = Cf45( u_ij, u_ji ) / f4f5;
+                              Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/
+                            Cf45_ij = -f4 * exp_f4;
+                            Cf45_ji = -f5 * exp_f5;
+                        }
+                        else
+                        {
+                            f4 = f5 = f4f5 = 1.0;
+                            Cf45_ij = Cf45_ji = 0.0;
+                        }
+
+                        /* Bond Order page 10, derivative of total bond order */
+                        A0_ij = f1 * f4f5;
+                        A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO *
+                            (Cf45_ij + Cf45_ji);
+                        A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
+                        A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
+                        A3_ij = A2_ij + Cf1_ij / f1;
+                        A3_ji = A2_ji + Cf1_ji / f1;
+
+                        /* find corrected bond order values and their deriv coefs */
+                        bo_ij->BO = bo_ij->BO * A0_ij;
+                        bo_ij->BO_pi = bo_ij->BO_pi * A0_ij * f1;
+                        bo_ij->BO_pi2 = bo_ij->BO_pi2 * A0_ij * f1;
+                        bo_ij->BO_s = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 );
+
+                        bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
+                        bo_ij->C2dbo = bo_ij->BO * A2_ij;
+                        bo_ij->C3dbo = bo_ij->BO * A2_ji;
+
+                        bo_ij->C1dbopi = f1 * f1 * f4 * f5;
+                        bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
+                        bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
+                        bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
+
+                        bo_ij->C1dbopi2 = f1 * f1 * f4 * f5;
+                        bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
+                        bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
+                        bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji;
+
+#ifdef TEST_FORCES
+                        /*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n",
+                          i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/
+
+                        /* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n",
+                        //"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n",
+                        workspace->orig_id[i], workspace->orig_id[j]
+                        A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji
+                        bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s,
+                        bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo,
+                        bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi,
+                        bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2
+                        ); */
+
+                        Calculate_dBO( i, pj, workspace, lists, &top_dbo );
+#endif
                     }
 
-                    if ( twbp->v13cor >= 0.001 )
+                    /* neglect bonds that are < 1e-10 */
+                    if ( bo_ij->BO < 1e-10 )
                     {
-                        /* Correction for 1-3 bond orders */
-                        exp_f4 = EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
-                                       Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
-                        exp_f5 = EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
-                                       Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
-
-                        f4 = 1. / (1. + exp_f4);
-                        f5 = 1. / (1. + exp_f5);
-                        f4f5 = f4 * f5;
-
-                        /* Bond Order pages 8-9, derivative of f4 and f5 */
-                        /*temp = twbp->p_boc5 -
-                          twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO );
-                          u_ij = temp + twbp->p_boc3 * Deltap_boc_i;
-                          u_ji = temp + twbp->p_boc3 * Deltap_boc_j;
-                          Cf45_ij = Cf45( u_ij, u_ji ) / f4f5;
-                          Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/
-                        Cf45_ij = -f4 * exp_f4;
-                        Cf45_ji = -f5 * exp_f5;
+                        bo_ij->BO = 0.0;
                     }
-                    else
+                    if ( bo_ij->BO_s < 1e-10 )
                     {
-                        f4 = f5 = f4f5 = 1.0;
-                        Cf45_ij = Cf45_ji = 0.0;
+                        bo_ij->BO_s = 0.0;
+                    }
+                    if ( bo_ij->BO_pi < 1e-10 )
+                    {
+                        bo_ij->BO_pi = 0.0;
+                    }
+                    if ( bo_ij->BO_pi2 < 1e-10 )
+                    {
+                        bo_ij->BO_pi2 = 0.0;
                     }
 
-                    /* Bond Order page 10, derivative of total bond order */
-                    A0_ij = f1 * f4f5;
-                    A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO *
-                            (Cf45_ij + Cf45_ji);
-                    A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
-                    A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
-                    A3_ij = A2_ij + Cf1_ij / f1;
-                    A3_ji = A2_ji + Cf1_ji / f1;
-
-                    /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f, A2_ij: %f
-                      A2_ji: %f, A3_ij: %f, A3_ji: %f\n",
-                      bo_ij->BO, A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/
-
+                    workspace->total_bond_order[i] += bo_ij->BO; // now keeps total_BO
 
-                    /* find corrected bond order values and their deriv coefs */
-                    bo_ij->BO    = bo_ij->BO    * A0_ij;
-                    bo_ij->BO_pi = bo_ij->BO_pi * A0_ij * f1;
-                    bo_ij->BO_pi2 = bo_ij->BO_pi2 * A0_ij * f1;
-                    bo_ij->BO_s  = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 );
+#ifdef TEST_FORCES
+                    Set_End_Index( pj, top_dbo, dBOs );
+                    Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta );
+#endif
+                }
+            }
 
-                    bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
-                    bo_ij->C2dbo = bo_ij->BO * A2_ij;
-                    bo_ij->C3dbo = bo_ij->BO * A2_ji;
+#ifdef TEST_FORCES
+            Set_Start_Index( i, top_dDelta, dDeltas );
+            ptop_dDelta = &( dDeltas->select.dDelta_list[top_dDelta] );
 
-                    bo_ij->C1dbopi = f1 * f1 * f4 * f5;
-                    bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
-                    bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
-                    bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
+            for ( pj = start_i; pj < end_i; ++pj )
+            {
+                j = bonds->select.bond_list[pj].nbr;
 
-                    bo_ij->C1dbopi2 = f1 * f1 * f4 * f5;
-                    bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
-                    bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
-                    bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji;
+                if ( !rvec_isZero( workspace->dDelta[j] ) )
+                {
+                    ptop_dDelta->wrt = j;
+                    rvec_Copy( ptop_dDelta->dVal, workspace->dDelta[j] );
+                    rvec_MakeZero( workspace->dDelta[j] );
+                    ++top_dDelta, ++ptop_dDelta;
+                }
 
-#ifdef TEST_FORCES
-                    /*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n",
-                      i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/
-
-                    /* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n",
-                    //"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n",
-                    workspace->orig_id[i], workspace->orig_id[j]
-                    A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji
-                    bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s,
-                    bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo,
-                    bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi,
-                    bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2
-                    ); */
-
-                    Calculate_dBO( i, pj, workspace, lists, &top_dbo );
-#endif
+                start_j = Start_Index(j, bonds);
+                end_j = End_Index(j, bonds);
+                for ( pk = start_j; pk < end_j; ++pk )
+                {
+                    k = bonds->select.bond_list[pk].nbr;
+                    if ( !rvec_isZero( workspace->dDelta[k] ) )
+                    {
+                        ptop_dDelta->wrt = k;
+                        rvec_Copy( ptop_dDelta->dVal, workspace->dDelta[k] );
+                        rvec_MakeZero( workspace->dDelta[k] );
+                        ++top_dDelta, ++ptop_dDelta;
+                    }
                 }
+            }
 
-                /* neglect bonds that are < 1e-10 */
-                if ( bo_ij->BO < 1e-10 )
-                    bo_ij->BO = 0.0;
-                if ( bo_ij->BO_s < 1e-10 )
-                    bo_ij->BO_s = 0.0;
-                if ( bo_ij->BO_pi < 1e-10 )
-                    bo_ij->BO_pi = 0.0;
-                if ( bo_ij->BO_pi2 < 1e-10 )
-                    bo_ij->BO_pi2 = 0.0;
-
-                workspace->total_bond_order[i] += bo_ij->BO; // now keeps total_BO
-
-
-                /* fprintf( stderr, "%d %d\t%g %g %g %g\n
-                   Cdbo:\t%g %g %g\n
-                   Cdbopi:\t%g %g %g %g\n
-                   Cdbopi2:%g %g %g %g\n\n",
-                   i+1, j+1, bonds->select.bond_list[ pj ].d,
-                   bo_ij->BO,bo_ij->BO_pi, bo_ij->BO_pi2,
-                   bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo,
-                   bo_ij->C1dbopi, bo_ij->C2dbopi, bo_ij->C3dbopi, bo_ij->C4dbopi,
-                   bo_ij->C1dbopi2, bo_ij->C2dbopi2,
-                   bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */
-
-                /* fprintf( stderr, "%d %d, BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n",
-                   i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 ); */
+            Set_End_Index( i, top_dDelta, dDeltas );
 
-#ifdef TEST_FORCES
-                Set_End_Index( pj, top_dbo, dBOs );
-                Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta );
+            /*for( pj=Start_Index(i,dDeltas); pj<End_Index(i,dDeltas); ++pj )
+              fprintf( stdout, "dDel: %d %d [%g %g %g]\n",
+              i+1, dDeltas->select.dDelta_list[pj].wrt+1,
+              dDeltas->select.dDelta_list[pj].dVal[0],
+              dDeltas->select.dDelta_list[pj].dVal[1],
+              dDeltas->select.dDelta_list[pj].dVal[2] );*/
 #endif
-            }
-            else
-            {
-                /* We only need to update bond orders from bo_ji
-                   everything else is set in uncorrected_bo calculations */
-                sym_index = bonds->select.bond_list[pj].sym_index;
-                bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data);
-                bo_ij->BO = bo_ji->BO;
-                bo_ij->BO_s = bo_ji->BO_s;
-                bo_ij->BO_pi = bo_ji->BO_pi;
-                bo_ij->BO_pi2 = bo_ji->BO_pi2;
-
-                workspace->total_bond_order[i] += bo_ij->BO; // now keeps total_BO
-#ifdef TEST_FORCES
-                Add_dBO( system, lists, j, sym_index, 1.0, workspace->dDelta );
-#endif
-            }
         }
 
-#ifdef TEST_FORCES
-        // fprintf( stderr, "dDelta computations\nj:" );
-        Set_Start_Index( i, top_dDelta, dDeltas );
-        ptop_dDelta = &( dDeltas->select.dDelta_list[top_dDelta] );
+        /* wait for bo_ij to be updated */
+#ifdef _OPENMP
+        #pragma omp barrier
+#endif
 
-        for ( pj = start_i; pj < end_i; ++pj )
+#ifdef _OPENMP
+        #pragma omp for schedule(guided)
+#endif
+        for ( i = 0; i < system->N; ++i )
         {
-            j = bonds->select.bond_list[pj].nbr;
-            // fprintf( stderr, "%d  ", j );
-
-            if ( !rvec_isZero( workspace->dDelta[j] ) )
+            type_i = system->atoms[i].type;
+            if ( type_i < 0 )
             {
-                ptop_dDelta->wrt = j;
-                rvec_Copy( ptop_dDelta->dVal, workspace->dDelta[j] );
-                rvec_MakeZero( workspace->dDelta[j] );
-                ++top_dDelta, ++ptop_dDelta;
+                continue;
             }
+            start_i = Start_Index(i, bonds);
+            end_i = End_Index(i, bonds);
 
-            start_j = Start_Index(j, bonds);
-            end_j = End_Index(j, bonds);
-            for ( pk = start_j; pk < end_j; ++pk )
+            for ( pj = start_i; pj < end_i; ++pj )
             {
-                k = bonds->select.bond_list[pk].nbr;
-                if ( !rvec_isZero( workspace->dDelta[k] ) )
+                j = bonds->select.bond_list[pj].nbr;
+                type_j = system->atoms[j].type;
+                if ( type_j < 0 )
                 {
-                    ptop_dDelta->wrt = k;
-                    rvec_Copy( ptop_dDelta->dVal, workspace->dDelta[k] );
-                    rvec_MakeZero( workspace->dDelta[k] );
-                    ++top_dDelta, ++ptop_dDelta;
+                    continue;
                 }
-            }
-        }
 
-        Set_End_Index( i, top_dDelta, dDeltas );
+                if ( i < j )
+                {
+                    /* computed in previous for-loop */
+                }
+                else
+                {
+                    /* We only need to update bond orders from bo_ji
+                       everything else is set in uncorrected_bo calculations */
+                    sym_index = bonds->select.bond_list[pj].sym_index;
 
-        /*for( pj=Start_Index(i,dDeltas); pj<End_Index(i,dDeltas); ++pj )
-          fprintf( stdout, "dDel: %d %d [%g %g %g]\n",
-          i+1, dDeltas->select.dDelta_list[pj].wrt+1,
-          dDeltas->select.dDelta_list[pj].dVal[0],
-          dDeltas->select.dDelta_list[pj].dVal[1],
-          dDeltas->select.dDelta_list[pj].dVal[2] );*/
-#endif
-    }
+                    bo_ij = &(bonds->select.bond_list[ pj ].bo_data);
+                    bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data);
+                    bo_ij->BO = bo_ji->BO;
+                    bo_ij->BO_s = bo_ji->BO_s;
+                    bo_ij->BO_pi = bo_ji->BO_pi;
+                    bo_ij->BO_pi2 = bo_ji->BO_pi2;
 
-    /*fprintf(stderr,"\tCalculated actual bond orders ...\n" );
-      fprintf(stderr,"%6s%8s%8s%8s%8s%8s%8s%8s\n",
-      "atom", "Delta", "Delta_e", "Delta_boc", "nlp",
-      "Delta_lp", "Clp", "dDelta_lp" );*/
+                    /* now keeps total_BO */
+                    workspace->total_bond_order[i] += bo_ij->BO;
 
-    p_lp1 = system->reaxprm.gp.l[15];
-    /* Calculate some helper variables that are  used at many places
-       throughout force calculations */
-    for ( j = 0; j < system->N; ++j )
-    {
-        type_j = system->atoms[j].type;
-        sbp_j = &(system->reaxprm.sbp[ type_j ]);
-
-        workspace->Delta[j] = workspace->total_bond_order[j] - sbp_j->valency;
-        workspace->Delta_e[j] = workspace->total_bond_order[j] - sbp_j->valency_e;
-        workspace->Delta_boc[j] = workspace->total_bond_order[j] -
-                                  sbp_j->valency_boc;
-
-        workspace->vlpex[j] =  workspace->Delta_e[j] -
-                               2.0 * (int)(workspace->Delta_e[j] / 2.0);
-        explp1 = EXP(-p_lp1 * SQR(2.0 + workspace->vlpex[j]));
-        workspace->nlp[j] = explp1 - (int)(workspace->Delta_e[j] / 2.0);
-        workspace->Delta_lp[j] = sbp_j->nlp_opt - workspace->nlp[j];
-        workspace->Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace->vlpex[j]);
-        /* Adri uses different dDelta_lp values than the ones in notes... */
-        workspace->dDelta_lp[j] = workspace->Clp[j];
-        //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
-        //((fabs(workspace->Delta_e[j]/2.0 -
-        //       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
-
-        if ( sbp_j->mass > 21.0 )
-        {
-            workspace->nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
-            workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
-            workspace->dDelta_lp_temp[j] = 0.;
+#ifdef TEST_FORCES
+                    Add_dBO( system, lists, j, sym_index, 1.0, workspace->dDelta );
+#endif
+                }
+            }
         }
-        else
+
+        /* need to wait for total_bond_order to be accumulated */
+#ifdef _OPENMP
+        #pragma omp barrier
+#endif
+
+        /* Calculate some helper variables that are  used at many places
+           throughout force calculations */
+#ifdef _OPENMP
+        #pragma omp for schedule(guided)
+#endif
+        for ( j = 0; j < system->N; ++j )
         {
-            workspace->nlp_temp[j] = workspace->nlp[j];
-            workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
-            workspace->dDelta_lp_temp[j] = workspace->Clp[j];
+            type_j = system->atoms[j].type;
+            sbp_j = &(system->reaxprm.sbp[ type_j ]);
+
+            workspace->Delta[j] = workspace->total_bond_order[j] - sbp_j->valency;
+            workspace->Delta_e[j] = workspace->total_bond_order[j] - sbp_j->valency_e;
+            workspace->Delta_boc[j] = workspace->total_bond_order[j] -
+                sbp_j->valency_boc;
+
+            workspace->vlpex[j] = workspace->Delta_e[j] -
+                2.0 * (int)(workspace->Delta_e[j] / 2.0);
+            explp1 = EXP(-p_lp1 * SQR(2.0 + workspace->vlpex[j]));
+            workspace->nlp[j] = explp1 - (int)(workspace->Delta_e[j] / 2.0);
+            workspace->Delta_lp[j] = sbp_j->nlp_opt - workspace->nlp[j];
+            workspace->Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace->vlpex[j]);
+            /* Adri uses different dDelta_lp values than the ones in notes... */
+            workspace->dDelta_lp[j] = workspace->Clp[j];
+            //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
+            //((FABS(workspace->Delta_e[j]/2.0 -
+            //       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
+
+            if ( sbp_j->mass > 21.0 )
+            {
+                workspace->nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
+                workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
+                workspace->dDelta_lp_temp[j] = 0.0;
+            }
+            else
+            {
+                workspace->nlp_temp[j] = workspace->nlp[j];
+                workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
+                workspace->dDelta_lp_temp[j] = workspace->Clp[j];
+            }
         }
-
-        //fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n",
-        //j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j],
-        //workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt,
-        //workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] );
     }
 
-    //Print_Bonds( system, bonds, "sbonds.out" );
-
 #if defined(DEBUG)
-    fprintf( stderr, "Number of bonds: %d\n", num_bonds );
     Print_Bond_Orders( system, control, data, workspace, lists, out_control );
 #endif
 }
diff --git a/sPuReMD/src/box.c b/sPuReMD/src/box.c
index 6c2fda0ba797a4472333aceaa7ebc9e19ecad8df..3cddaf2d36f7ad9092540f9e9e47b049d798b614 100644
--- a/sPuReMD/src/box.c
+++ b/sPuReMD/src/box.c
@@ -190,9 +190,13 @@ void Update_Box( rtensor box_tensor, simulation_box* box )
 {
     int i, j;
 
-    for (i = 0; i < 3; i++)
-        for (j = 0; j < 3; j++)
+    for ( i = 0; i < 3; i++ )
+    {
+        for ( j = 0; j < 3; j++ )
+        {
             box->box[i][j] = box_tensor[i][j];
+        }
+    }
 
     Make_Consistent( box );
 }
@@ -240,15 +244,20 @@ void Inc_on_T3( rvec x, rvec dx, simulation_box *box )
     {
         tmp = x[i] + dx[i];
         if ( tmp <= -box->box_norms[i] || tmp >= box->box_norms[i] )
+        {
             tmp = FMOD( tmp, box->box_norms[i] );
+        }
 
-        if ( tmp < 0 ) tmp += box->box_norms[i];
+        if ( tmp < 0 )
+        {
+            tmp += box->box_norms[i];
+        }
         x[i] = tmp;
     }
 }
 
 
-real Sq_Distance_on_T3(rvec x1, rvec x2, simulation_box* box, rvec r)
+real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box, rvec r )
 {
     real norm = 0.0;
     real d, tmp;
@@ -262,9 +271,13 @@ real Sq_Distance_on_T3(rvec x1, rvec x2, simulation_box* box, rvec r)
         if ( tmp >= SQR( box->box_norms[i] / 2.0 ) )
         {
             if (x2[i] > x1[i])
+            {
                 d -= box->box_norms[i];
+            }
             else
+            {
                 d += box->box_norms[i];
+            }
 
             r[i] = d;
             norm += SQR(d);
@@ -323,7 +336,9 @@ real Metric_Product( rvec x1, rvec x2, simulation_box* box )
     {
         tmp = 0.0;
         for ( j = 0; j < 3; j++ )
+        {
             tmp += box->g[i][j] * x2[j];
+        }
         dist += x1[i] * tmp;
     }
 
@@ -332,7 +347,7 @@ real Metric_Product( rvec x1, rvec x2, simulation_box* box )
 
 
 int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
-                       real cutoff, far_neighbor_data *data )
+        real cutoff, far_neighbor_data *data )
 {
     real norm_sqr, d, tmp;
     int i;
@@ -368,9 +383,9 @@ int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
         }
     }
 
-    if ( norm_sqr <= SQR(cutoff) )
+    if ( norm_sqr <= SQR( cutoff ) )
     {
-        data->d = sqrt(norm_sqr);
+        data->d = SQRT( norm_sqr );
         return 1;
     }
 
@@ -382,13 +397,11 @@ int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
    If so, this neighborhood is added to the list of far neighbors.
    Periodic boundary conditions do not apply. */
 void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
-                                    control_params *control,
-                                    far_neighbor_data *new_nbrs, int *count )
+        control_params *control, far_neighbor_data *new_nbrs, int *count )
 {
     real norm_sqr;
 
     rvec_ScaledSum( new_nbrs[0].dvec, 1.0, x2, -1.0, x1 );
-
     norm_sqr = rvec_Norm_Sqr( new_nbrs[0].dvec );
 
     if ( norm_sqr <= SQR( control->vlist_cut ) )
@@ -399,7 +412,10 @@ void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
         ivec_MakeZero( new_nbrs[0].rel_box );
         // rvec_MakeZero( new_nbrs[0].ext_factor );
     }
-    else *count = 0;
+    else
+    {
+        *count = 0;
+    }
 }
 
 
@@ -408,9 +424,7 @@ void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
    If the periodic distance between x1 and x2 is than vlist_cut, this
    neighborhood is added to the list of far neighbors. */
 void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box,
-        control_params *control,
-        far_neighbor_data *periodic_nbrs,
-        int *count )
+        control_params *control, far_neighbor_data *periodic_nbrs, int *count )
 {
     real norm_sqr, d, tmp;
     int i;
@@ -456,7 +470,10 @@ void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box,
         *count = 1;
         periodic_nbrs[0].d = SQRT( norm_sqr );
     }
-    else *count = 0;
+    else
+    {
+        *count = 0;
+    }
 }
 
 
@@ -468,9 +485,7 @@ void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box,
    periodic images of x2 that are two boxs away!!!
 */
 void Get_Periodic_Far_Neighbors_Small_Box( rvec x1, rvec x2, simulation_box *box,
-        control_params *control,
-        far_neighbor_data *periodic_nbrs,
-        int *count )
+        control_params *control, far_neighbor_data *periodic_nbrs, int *count )
 {
     int i, j, k;
     int imax, jmax, kmax;
@@ -489,13 +504,16 @@ void Get_Periodic_Far_Neighbors_Small_Box( rvec x1, rvec x2, simulation_box *box
 
 
     for ( i = -imax; i <= imax; ++i )
-        if (fabs(d_i = ((x2[0] + i * box->box_norms[0]) - x1[0])) <= control->vlist_cut)
+    {
+        if (FABS(d_i = ((x2[0] + i * box->box_norms[0]) - x1[0])) <= control->vlist_cut)
         {
             for ( j = -jmax; j <= jmax; ++j )
-                if (fabs(d_j = ((x2[1] + j * box->box_norms[1]) - x1[1])) <= control->vlist_cut)
+            {
+                if (FABS(d_j = ((x2[1] + j * box->box_norms[1]) - x1[1])) <= control->vlist_cut)
                 {
                     for ( k = -kmax; k <= kmax; ++k )
-                        if (fabs(d_k = ((x2[2] + k * box->box_norms[2]) - x1[2])) <= control->vlist_cut)
+                    {
+                        if (FABS(d_k = ((x2[2] + k * box->box_norms[2]) - x1[2])) <= control->vlist_cut)
                         {
                             sqr_norm = SQR(d_i) + SQR(d_j) + SQR(d_k);
                             if ( sqr_norm <= SQR(control->vlist_cut) )
@@ -533,8 +551,11 @@ void Get_Periodic_Far_Neighbors_Small_Box( rvec x1, rvec x2, simulation_box *box
                                 ++(*count);
                             }
                         }
+                    }
                 }
+            }
         }
+    }
 }
 
 
@@ -586,7 +607,9 @@ void Print_Box( simulation_box* box, FILE *out )
     {
         fprintf( out, "{" );
         for ( j = 0; j < 3; ++j )
+        {
             fprintf( out, "%8.3f ", box->box[i][j] );
+        }
         fprintf( out, "}" );
     }
     fprintf( out, "}\n" );
@@ -600,7 +623,9 @@ void Print_Box( simulation_box* box, FILE *out )
     {
         fprintf( out, "{" );
         for ( j = 0; j < 3; ++j )
+        {
             fprintf( out, "%8.3f ", box->trans[i][j] );
+        }
         fprintf( out, "}" );
     }
     fprintf( out, "}\n" );
@@ -610,7 +635,9 @@ void Print_Box( simulation_box* box, FILE *out )
     {
         fprintf( out, "{" );
         for ( j = 0; j < 3; ++j )
+        {
             fprintf( out, "%8.3f ", box->trans_inv[i][j] );
+        }
         fprintf( out, "}" );
     }
     fprintf( out, "}\n" );
diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c
new file mode 100644
index 0000000000000000000000000000000000000000..07641a6993f93737b01781d8d62a7e7bc0e4b123
--- /dev/null
+++ b/sPuReMD/src/charges.c
@@ -0,0 +1,2839 @@
+/*----------------------------------------------------------------------
+  SerialReax - Reax Force Field Simulator
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "charges.h"
+
+#include "allocate.h"
+#include "list.h"
+#include "lin_alg.h"
+#include "print_utils.h"
+#include "tool_box.h"
+#include "vector.h"
+#if defined(HAVE_SUPERLU_MT)
+#include "slu_mt_ddefs.h"
+#endif
+
+
+typedef struct
+{
+    unsigned int j; /* column index of the nonzero */
+    real val; /* numerical value of the nonzero */
+} sparse_matrix_entry; /* (column, value) pair; scratch element used when sorting a matrix row */
+
+
+#if defined(TEST_MAT)
+/* Build a small, fixed sparse matrix for testing purposes.
+ *
+ * The matrix is the 3x3 symmetric positive definite matrix
+ *
+ *      [   4   12  -16 ]
+ *      [  12   37  -43 ]
+ *      [ -16  -43   98 ]
+ *
+ * of which only the lower half (diagonal included) is stored, row by row
+ *
+ * returns: pointer to the newly allocated matrix
+ */
+static sparse_matrix * create_test_mat( void )
+{
+    /* row r owns the nonzeros row_ptr[r], ..., row_ptr[r + 1] - 1 */
+    static const unsigned int row_ptr[4] = { 0, 1, 3, 6 };
+    static const unsigned int col_idx[6] = { 0, 0, 1, 0, 1, 2 };
+    static const real nz_val[6] = { 4., 12., 37., -16., -43., 98. };
+    unsigned int r, p;
+    sparse_matrix *H_test;
+
+    if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for test matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    for ( r = 0; r < 3; ++r )
+    {
+        H_test->start[r] = row_ptr[r];
+
+        for ( p = row_ptr[r]; p < row_ptr[r + 1]; ++p )
+        {
+            H_test->j[p] = col_idx[p];
+            H_test->val[p] = nz_val[p];
+        }
+    }
+    /* sentinel: one past the last stored nonzero */
+    H_test->start[3] = row_ptr[3];
+
+    return H_test;
+}
+#endif
+
+
+/* Comparison routine for qsort: orders the nonzeros of a sparse matrix row by
+ * increasing column index (compared directly; unsigned subtraction could wrap)
+ * v1/v2: pointers to the entries (sparse_matrix_entry) being compared
+ * returns: -1, 0, or 1 if v1's column index is <, ==, or > v2's, respectively */
+static int compare_matrix_entry(const void *v1, const void *v2)
+{
+    const sparse_matrix_entry * const e1 = v1, * const e2 = v2;
+    return (e1->j > e2->j) - (e1->j < e2->j);
+}
+
+
+/* Sort the nonzeros of every row of a sparse matrix by increasing column index
+ *
+ * Each row is copied into a scratch buffer of (column, value) pairs, sorted
+ * with qsort, and copied back; the OpenMP directives below are commented out,
+ * so the rows are currently processed one after another
+ *
+ * A: sparse matrix whose rows are sorted in place, stored in CSR format */
+static void Sort_Matrix_Rows( sparse_matrix * const A )
+{
+    unsigned int row, k, first, len;
+    sparse_matrix_entry *scratch;
+
+#ifdef _OPENMP
+//    #pragma omp parallel default(none) private(row, k, first, len, scratch) shared(stderr)
+#endif
+    {
+        scratch = (sparse_matrix_entry *) malloc( A->n * sizeof(sparse_matrix_entry) );
+        if ( scratch == NULL )
+        {
+            fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+
+#ifdef _OPENMP
+//        #pragma omp for schedule(guided)
+#endif
+        for ( row = 0; row < A->n; ++row )
+        {
+            first = A->start[row];
+            len = A->start[row + 1] - first;
+
+            /* gather the row into (column, value) pairs */
+            for ( k = 0; k < len; ++k )
+            {
+                scratch[k].j = A->j[first + k];
+                scratch[k].val = A->val[first + k];
+            }
+            qsort( scratch, len, sizeof(sparse_matrix_entry), compare_matrix_entry );
+            /* scatter the sorted pairs back into the row */
+            for ( k = 0; k < len; ++k )
+            {
+                A->j[first + k] = scratch[k].j;
+                A->val[first + k] = scratch[k].val;
+            }
+        }
+
+        free( scratch );
+    }
+}
+
+
+static void Calculate_Droptol( const sparse_matrix * const A, /* A: input matrix; the loops below treat the last entry of each row as the diagonal */
+        real * const droptol, const real dtol ) /* droptol: output, one value per row of A; dtol: factor applied to each row norm */
+{
+    int i, j, k;
+    real val;
+#ifdef _OPENMP
+    static real *droptol_local; /* per-thread partial sums; thread t uses entries t * A->n, ..., (t + 1) * A->n - 1 */
+    unsigned int tid;
+#endif
+
+#ifdef _OPENMP
+    #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr)
+#endif
+    {
+#ifdef _OPENMP
+        tid = omp_get_thread_num();
+
+        #pragma omp master
+        {
+            /* keep droptol_local for program duration to avoid allocate/free overhead per call of this routine;
+             * NOTE(review): allocated once and never resized, so assumes A->n and the thread count never grow -- confirm */
+            if ( droptol_local == NULL )
+            {
+                if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL )
+                {
+                    fprintf( stderr, "Not enough space for droptol. Terminating...\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+        }
+
+        #pragma omp barrier
+#endif
+
+        /* init droptol to 0 (with OpenMP: every thread clears its own slice of droptol_local instead) */
+        for ( i = 0; i < A->n; ++i )
+        {
+#ifdef _OPENMP
+            droptol_local[tid * A->n + i] = 0.0;
+#else
+            droptol[i] = 0.0;
+#endif
+        }
+
+#ifdef _OPENMP
+        #pragma omp barrier
+#endif
+
+        /* calculate square of the norm of each row; an off-diagonal entry (i, j) is added to both row i and row j */
+#ifdef _OPENMP
+        #pragma omp for schedule(static)
+#endif
+        for ( i = 0; i < A->n; ++i )
+        {
+            for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
+            {
+                j = A->j[k];
+                val = A->val[k];
+
+#ifdef _OPENMP
+                droptol_local[tid * A->n + i] += val * val;
+                droptol_local[tid * A->n + j] += val * val;
+#else
+                droptol[i] += val * val;
+                droptol[j] += val * val;
+#endif
+            }
+
+            // diagonal entry: counted once; k is left at the last entry of row i by the loop above (assumes the row is non-empty)
+            val = A->val[k];
+#ifdef _OPENMP
+            droptol_local[tid * A->n + i] += val * val;
+#else
+            droptol[i] += val * val;
+#endif
+        }
+
+#ifdef _OPENMP
+        #pragma omp barrier
+
+        #pragma omp for schedule(static)
+        for ( i = 0; i < A->n; ++i )
+        {
+            droptol[i] = 0.0; /* combine the per-thread partial sums for row i into droptol[i] */
+            for ( k = 0; k < omp_get_num_threads(); ++k )
+            {
+                droptol[i] += droptol_local[k * A->n + i];
+            }
+        }
+
+        #pragma omp barrier
+#endif
+
+        /* calculate local droptol for each row: square root of the accumulated sum, scaled by dtol */
+        //fprintf( stderr, "droptol: " );
+#ifdef _OPENMP
+        #pragma omp for schedule(static)
+#endif
+        for ( i = 0; i < A->n; ++i )
+        {
+            //fprintf( stderr, "%f-->", droptol[i] );
+            droptol[i] = SQRT( droptol[i] ) * dtol;
+            //fprintf( stderr, "%f  ", droptol[i] );
+        }
+        //fprintf( stderr, "\n" );
+    }
+}
+
+
+/* Count the nonzeros of A that survive thresholding: every entry but the last
+ * one of row i is kept if its magnitude exceeds droptol[i], and A->n is added
+ * on top (presumably for the diagonal entries -- confirm); returns this count */
+static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol )
+{
+    int row, p, kept = 0;
+    real entry;
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) private(row, p, entry) reduction(+: kept)
+#endif
+    for ( row = 0; row < A->n; ++row )
+    {
+        /* the last stored entry of the row is not examined here */
+        for ( p = A->start[row]; p < A->start[row + 1] - 1; ++p )
+        {
+            entry = A->val[p];
+            if ( FABS(entry) > droptol[row] )
+            {
+                ++kept;
+            }
+        }
+    }
+
+    return kept + A->n;
+}
+
+
+#if defined(HAVE_SUPERLU_MT)
+static real SuperLU_Factorize( const sparse_matrix * const A,
+                               sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, pj, count, *Ltop, *Utop, r;
+    sparse_matrix *A_t;
+    SuperMatrix A_S, AC_S, L_S, U_S;
+    NCformat *A_S_store;
+    SCPformat *L_S_store;
+    NCPformat *U_S_store;
+    superlumt_options_t superlumt_options;
+    pxgstrf_shared_t pxgstrf_shared;
+    pdgstrf_threadarg_t *pdgstrf_threadarg;
+    int_t nprocs;
+    fact_t fact;
+    trans_t trans;
+    yes_no_t refact, usepr;
+    real u, drop_tol;
+    real *a, *at;
+    int_t *asub, *atsub, *xa, *xat;
+    int_t *perm_c; /* column permutation vector */
+    int_t *perm_r; /* row permutations from partial pivoting */
+    void *work;
+    int_t info, lwork;
+    int_t permc_spec, panel_size, relax;
+    Gstat_t Gstat;
+    flops_t flopcnt;
+
+    /* Default parameters to control factorization. */
+#ifdef _OPENMP
+    //TODO: set as global parameter and use
+    #pragma omp parallel \
+        default(none) shared(nprocs)
+    {
+        #pragma omp master
+        {
+            /* SuperLU_MT spawns threads internally, so set and pass parameter */
+            nprocs = omp_get_num_threads();
+        }
+    }
+#else
+    nprocs = 1;
+#endif
+
+//    fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */
+    fact = DOFACT; /* factor from scratch */
+    trans = NOTRANS;
+    refact = NO; /* first time factorization */
+    //TODO: add to control file and use the value there to set these
+    panel_size = sp_ienv(1); /* # consec. cols treated as unit task */
+    relax = sp_ienv(2); /* # cols grouped as relaxed supernode */
+    u = 1.0; /* diagonal pivoting threshold */
+    usepr = NO;
+    drop_tol = 0.0;
+    work = NULL;
+    lwork = 0;
+
+//#if defined(DEBUG)
+    fprintf( stderr, "nprocs = %d\n", nprocs );
+    fprintf( stderr, "Panel size = %d\n", panel_size );
+    fprintf( stderr, "Relax = %d\n", relax );
+//#endif
+
+    if ( !(perm_r = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for perm_r[].");
+    }
+    if ( !(perm_c = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for perm_c[].");
+    }
+    if ( !(superlumt_options.etree = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for etree[].");
+    }
+    if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for colcnt_h[].");
+    }
+    if ( !(superlumt_options.part_super_h = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for part_super__h[].");
+    }
+    if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL )
+            || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL )
+            || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL )
+            || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL )
+            || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) )
+    {
+        fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* Set up the sparse matrix data structure for A. */
+    Transpose( A, A_t );
+
+    count = 0;
+    for ( i = 0; i < A->n; ++i )
+    {
+        xa[i] = count;
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            a[count] = A->entries[pj].val;
+            asub[count] = A->entries[pj].j;
+            ++count;
+        }
+        for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
+        {
+            a[count] = A_t->entries[pj].val;
+            asub[count] = A_t->entries[pj].j;
+            ++count;
+        }
+    }
+    xa[i] = count;
+
+    dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa,
+                         &at, &atsub, &xat );
+
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6d", asub[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6.1f", a[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i <= A->n; ++i )
+        fprintf( stderr, "%6d", xa[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6d", atsub[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6.1f", at[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i <= A->n; ++i )
+        fprintf( stderr, "%6d", xat[i] );
+    fprintf( stderr, "\n" );
+
+    A_S.Stype = SLU_NC; /* column-wise, no supernode */
+    A_S.Dtype = SLU_D; /* double-precision */
+    A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */
+    A_S.nrow = A->n;
+    A_S.ncol = A->n;
+    A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
+    A_S_store = (NCformat *) A_S.Store;
+    A_S_store->nnz = 2 * A->start[A->n] - A->n;
+    A_S_store->nzval = at;
+    A_S_store->rowind = atsub;
+    A_S_store->colptr = xat;
+
+    /* ------------------------------------------------------------
+       Allocate storage and initialize statistics variables.
+       ------------------------------------------------------------*/
+    StatAlloc( A->n, nprocs, panel_size, relax, &Gstat );
+    StatInit( A->n, nprocs, &Gstat );
+
+    /* ------------------------------------------------------------
+       Get column permutation vector perm_c[], according to permc_spec:
+       permc_spec = 0: natural ordering
+       permc_spec = 1: minimum degree ordering on structure of A'*A
+       permc_spec = 2: minimum degree ordering on structure of A'+A
+       permc_spec = 3: approximate minimum degree for unsymmetric matrices
+       ------------------------------------------------------------*/
+    permc_spec = 0;
+    get_perm_c( permc_spec, &A_S, perm_c );
+
+    /* ------------------------------------------------------------
+       Initialize the option structure superlumt_options using the
+       user-input parameters;
+       Apply perm_c to the columns of original A to form AC.
+       ------------------------------------------------------------*/
+    pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax,
+                  u, usepr, drop_tol, perm_c, perm_r,
+                  work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat );
+
+    for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] );
+    fprintf( stderr, "\n" );
+
+    /* ------------------------------------------------------------
+       Compute the LU factorization of A.
+       The following routine will create nprocs threads.
+       ------------------------------------------------------------*/
+    pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info );
+
+    fprintf( stderr, "INFO: %d\n", info );
+
+    flopcnt = 0;
+    for (i = 0; i < nprocs; ++i)
+    {
+        flopcnt += Gstat.procstat[i].fcops;
+    }
+    Gstat.ops[FACT] = flopcnt;
+
+//#if defined(DEBUG)
+    printf("\n** Result of sparse LU **\n");
+    L_S_store = (SCPformat *) L_S.Store;
+    U_S_store = (NCPformat *) U_S.Store;
+    printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz );
+    printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz );
+    fflush( stdout );
+//#endif
+
+    /* convert L and R from SuperLU formats to CSR */
+    memset( Ltop, 0, (A->n + 1) * sizeof(int) );
+    memset( Utop, 0, (A->n + 1) * sizeof(int) );
+    memset( L->start, 0, (A->n + 1) * sizeof(int) );
+    memset( U->start, 0, (A->n + 1) * sizeof(int) );
+
+    for ( i = 0; i < 2 * L_S_store->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < 2 * U_S_store->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] );
+    fprintf( stderr, "\n" );
+
+    printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper );
+    for ( i = 0; i < A->n; ++i )
+    {
+        fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] );
+        fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] );
+        //TODO: correct for SCPformat for L?
+        //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
+//        for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj )
+//        {
+//            ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1];
+//        }
+        fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] );
+        fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] );
+        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
+        {
+            ++Utop[U_S_store->rowind[pj] + 1];
+            fprintf( stderr, "Utop[%5d]     = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] );
+        }
+    }
+    for ( i = 1; i <= A->n; ++i )
+    {
+//        Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
+        Utop[i] = U->start[i] = Utop[i] + Utop[i - 1];
+//        fprintf( stderr, "Utop[%5d]     = %d\n", i, Utop[i] );
+//        fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] );
+    }
+    for ( i = 0; i < A->n; ++i )
+    {
+//        for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj )
+//        {
+//            r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj];
+//            L->entries[Ltop[r]].j = r;
+//            L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj];
+//            ++Ltop[r];
+//        }
+        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
+        {
+            r = U_S_store->rowind[pj];
+            U->entries[Utop[r]].j = i;
+            U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj];
+            ++Utop[r];
+        }
+    }
+
+    /* ------------------------------------------------------------
+      Deallocate storage after factorization.
+      ------------------------------------------------------------*/
+    pxgstrf_finalize( &superlumt_options, &AC_S );
+    Deallocate_Matrix( A_t );
+    free( xa );
+    free( asub );
+    free( a );
+    SUPERLU_FREE( perm_r );
+    SUPERLU_FREE( perm_c );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval );
+    SUPERLU_FREE( A_S.Store );
+    if ( lwork == 0 )
+    {
+        Destroy_SuperNode_SCP(&L_S);
+        Destroy_CompCol_NCP(&U_S);
+    }
+    else if ( lwork > 0 )
+    {
+        SUPERLU_FREE(work);
+    }
+    StatFree(&Gstat);
+
+    free( Utop );
+    free( Ltop );
+
+    //TODO: return iters
+    return 0.;
+}
+#endif
+
+
+/* Diagonal (Jacobi) preconditioner computation
+ *
+ * H: matrix in CSR format; the last stored entry of each row
+ *   (index start[i + 1] - 1) is read as that row's diagonal
+ * Hdia_inv: (output) reciprocal of each diagonal entry, or 1.0 for
+ *   rows whose stored diagonal is exactly zero
+ *
+ * returns: the value of Get_Timing_Info for the time at entry */
+static real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv )
+{
+    unsigned int i;
+    real t_begin;
+
+    t_begin = Get_Time( );
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) private(i)
+#endif
+    for ( i = 0; i < H->n; ++i )
+    {
+        /* block-local, hence private to each thread */
+        real diag;
+
+        diag = H->val[H->start[i + 1] - 1];
+
+        /* avoid dividing by zero: fall back to the identity scaling */
+        Hdia_inv[i] = ( diag != 0.0 ) ? ( 1.0 / diag ) : 1.0;
+    }
+
+    return Get_Timing_Info( t_begin );
+}
+
+
+/* Incomplete Cholesky factorization with dual thresholding
+ *
+ * A: matrix to factor, CSR format; each row must store its diagonal as
+ *   the last entry of the row (checked below, terminates otherwise)
+ * droptol: row-wise tolerances used for dropping
+ * L: (output) lower triangular factor, CSR format
+ * U: (output) filled by Transpose( L, U ), i.e., U = L^T
+ *
+ * returns: the value of Get_Timing_Info for the time at entry
+ *
+ * NOTE(review): A is presumably symmetric and half-stored (lower
+ * triangular) like the input of the other factorizations -- confirm. */
+static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
+        sparse_matrix * const L, sparse_matrix * const U )
+{
+    int *tmp_j;
+    real *tmp_val;
+    int i, j, pj, k1, k2, tmptop, Ltop;
+    real val, start;
+
+    start = Get_Time( );
+
+    /* scratch buffers (A->n entries each) holding the row of L being built */
+    if ( ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL ||
+            ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "[ICHOLT] Not enough memory for preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    // clear variables
+    Ltop = 0;
+    tmptop = 0;
+    memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) );
+    memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
+
+    for ( i = 0; i < A->n; ++i )
+    {
+        L->start[i] = Ltop;
+        tmptop = 0;
+
+        /* off-diagonal entries of row i (the diagonal is handled afterwards) */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            j = A->j[pj];
+            val = A->val[pj];
+
+            /* first dropping rule: skip small entries of A */
+            if ( FABS(val) > droptol[i] )
+            {
+                /* subtract the sparse dot product of the partial row i
+                 * (in tmp_j / tmp_val) and the finished row j of L */
+                k1 = 0;
+                k2 = L->start[j];
+                while ( k1 < tmptop && k2 < L->start[j + 1] )
+                {
+                    if ( tmp_j[k1] < L->j[k2] )
+                    {
+                        ++k1;
+                    }
+                    else if ( tmp_j[k1] > L->j[k2] )
+                    {
+                        ++k2;
+                    }
+                    else
+                    {
+                        val -= (tmp_val[k1++] * L->val[k2++]);
+                    }
+                }
+
+                // L matrix is lower triangular,
+                // so right before the start of next row comes jth diagonal
+                val /= L->val[L->start[j + 1] - 1];
+
+                tmp_j[tmptop] = j;
+                tmp_val[tmptop] = val;
+                ++tmptop;
+            }
+        }
+
+        // sanity check: pj now indexes the last entry of row i, which must be the diagonal
+        if ( A->j[pj] != i )
+        {
+            fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i );
+            exit( NUMERIC_BREAKDOWN );
+        }
+
+        // compute the ith diagonal in L
+        val = A->val[pj];
+        for ( k1 = 0; k1 < tmptop; ++k1 )
+        {
+            val -= (tmp_val[k1] * tmp_val[k1]);
+        }
+
+#if defined(DEBUG)
+        if ( val < 0.0 )
+        {
+            fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i );
+            exit( NUMERIC_BREAKDOWN );
+
+        }
+#endif
+
+        tmp_j[tmptop] = i;
+        tmp_val[tmptop] = SQRT( val );
+
+        // apply the dropping rule once again, now relative to the new diagonal
+        for ( k1 = 0; k1 < tmptop; ++k1 )
+        {
+            if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] )
+            {
+                L->j[Ltop] = tmp_j[k1];
+                L->val[Ltop] = tmp_val[k1];
+                U->start[tmp_j[k1] + 1]++;
+                ++Ltop;
+            }
+        }
+        // keep the diagonal in any case (k1 == tmptop here)
+        L->j[Ltop] = tmp_j[k1];
+        L->val[Ltop] = tmp_val[k1];
+        ++Ltop;
+    }
+
+    /* i == A->n here: close the last row */
+    L->start[i] = Ltop;
+
+    /* U = L^T (Cholesky factorization) */
+    Transpose( L, U );
+
+    free( tmp_val );
+    free( tmp_j );
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Fine-grained (parallel) incomplete Cholesky factorization
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp.
+ *
+ * A: matrix to factor, CSR format, with the diagonal stored as the last
+ *   entry of each row (assumed symmetric and stored lower triangular)
+ * sweeps: number of loops over non-zeros for computation
+ * U_t: (output) factor U^T, using the sparsity pattern of A
+ * U: (output) filled by Transpose( U_t, U )
+ *
+ * returns: the value of Get_Timing_Info for the time at entry */
+#if defined(TESTING)
+static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
+                       sparse_matrix * const U_t, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
+            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* D_inv = SQRT of the diagonal of A, D = its reciprocal */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(D_inv, D) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
+        D[i] = 1. / D_inv[i];
+    }
+
+    memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) );
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * transformation DAD, where D = D(1./SQRT(D(A))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(guided) \
+        default(none) shared(DAD, D_inv, D) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.;
+    }
+
+    /* initial guesses for U^T,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) );
+    memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) );
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
+#endif
+        for ( j = 0; j < A->start[A->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero:
+             * after the search, k - 1 is the row containing entry j */
+            x = 0;
+            ei_x = 0;
+            for ( k = 0; k <= A->n; ++k )
+            {
+                if ( U_t->start[k] > j )
+                {
+                    x = U_t->start[k - 1];
+                    ei_x = U_t->start[k];
+                    break;
+                }
+            }
+            /* column bounds of current nonzero */
+            y = U_t->start[U_t->j[j]];
+            ei_y = U_t->start[U_t->j[j] + 1];
+
+            /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
+            while ( U_t->j[x] < U_t->j[j] &&
+                    U_t->j[y] < U_t->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U_t->j[x] == U_t->j[y] )
+                {
+                    sum += (U_t->val[x] * U_t->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U_t->j[x] < U_t->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            sum = DAD->val[j] - sum;
+
+            /* diagonal entries */
+            if ( (k - 1) == U_t->j[j] )
+            {
+                /* sanity check */
+                if ( sum < ZERO )
+                {
+                    fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n");
+#if defined(DEBUG_FOCUS)
+                    /* read the entry through the same j / val arrays
+                     * used everywhere else in this function */
+                    fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
+                             k - 1, A->j[j], A->val[j] );
+                    fprintf( stderr, "sum = %10.3f\n", sum);
+#endif
+                    exit(NUMERIC_BREAKDOWN);
+                }
+
+                U_t->val[j] = SQRT( sum );
+            }
+            /* non-diagonal entries: divide by the diagonal of the column's row */
+            else
+            {
+                U_t->val[j] = sum / U_t->val[ei_y - 1];
+            }
+        }
+    }
+
+    /* apply inverse transformation D^{-1}U^{T},
+     * since DAD \approx U^{T}U, so
+     * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(guided) \
+        default(none) shared(D_inv) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            U_t->val[pj] *= D_inv[i];
+        }
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
+#endif
+
+    /* transpose U^{T} and copy into U */
+    Transpose( U_t, U );
+
+#if defined(DEBUG_FOCUS)
+    /* the row pointer past the last row holds the number of stored entries */
+    fprintf( stderr, "nnz(U): %d, max: %d\n", U->start[U->n], U->n * 50 );
+#endif
+
+    Deallocate_Matrix( DAD );
+    free(D_inv);
+    free(D);
+
+    return Get_Timing_Info( start );
+}
+#endif
+
+
+/* Fine-grained (parallel) incomplete LU factorization
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp.
+ *
+ * A: symmetric, half-stored (lower triangular), CSR format,
+ *   with the diagonal stored as the last entry of each row
+ * sweeps: number of loops over non-zeros for computation
+ * L / U: factorized triangular matrices (A \approx LU), CSR format
+ *
+ * returns: the value of Get_Timing_Info for the time at entry
+ *
+ * During the sweeps U holds U^T (same pattern as L) so it can be accessed
+ * row-wise; it is transposed in place by Transpose_I before returning. */
+static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
+        sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x, y, ei_x, ei_y;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
+            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* D_inv = SQRT(abs(diagonal of A)), D = its reciprocal */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(D, D_inv) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
+        D[i] = 1.0 / D_inv[i];
+    }
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * transformation DAD, where D = D(1./SQRT(abs(D(A)))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, D) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.0;
+    }
+
+    /* initial guesses for L and U,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+    /* store U^T in CSR for row-wise access and transpose later */
+    memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+
+    /* L has unit diagonal, by convention */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) default(none) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        L->val[L->start[i + 1] - 1] = 1.0;
+    }
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero in L */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
+            while ( L->j[x] < L->j[j] &&
+                    L->j[y] < L->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( L->j[x] == L->j[y] )
+                {
+                    sum += (L->val[x] * U->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( L->j[x] < L->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            /* skip the last entry of the row: the unit diagonal of L is kept */
+            if ( j != ei_x - 1 )
+            {
+                L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1];
+            }
+        }
+
+        /* for each nonzero in U (stored as U^T) */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
+            while ( U->j[x] < U->j[j] &&
+                    U->j[y] < U->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U->j[x] == U->j[y] )
+                {
+                    sum += (L->val[y] * U->val[x]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U->j[x] < U->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            U->val[j] = DAD->val[j] - sum;
+        }
+    }
+
+    /* apply inverse transformation:
+     * since DAD \approx LU, then
+     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, D_inv) private(i, pj)
+#endif
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
+        {
+            L->val[pj] = D_inv[i] * L->val[pj];
+            /* currently storing U^T, so use row index instead of column index */
+            U->val[pj] = U->val[pj] * D_inv[i];
+        }
+    }
+
+    Transpose_I( U );
+
+#if defined(DEBUG_FOCUS)
+    /* the row pointer past the last row holds the number of stored entries */
+    fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 );
+    fprintf( stderr, "nnz(U): %d, max: %d\n", U->start[U->n], U->n * 50 );
+#endif
+
+    Deallocate_Matrix( DAD );
+    free( D_inv );
+    free( D );
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Fine-grained (parallel) incomplete LU factorization with thresholding
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp.
+ *
+ * A: symmetric, half-stored (lower triangular), CSR format
+ * droptol: row-wise tolerances used for dropping
+ * sweeps: number of loops over non-zeros for computation
+ * L / U: factorized triangular matrices (A \approx LU), CSR format
+ *
+ * returns: the value of Get_Timing_Info for the time at entry
+ *
+ * The factors are first computed on the full pattern of A in the
+ * temporaries L_temp / U_temp (U_temp holds U^T); entries that pass the
+ * dropping rule are then copied into L / U, and U is transposed in place. */
+static real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
+                      const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD, *L_temp, *U_temp;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
+            Allocate_Matrix( &L_temp, A->n, A->m ) == FAILURE ||
+            Allocate_Matrix( &U_temp, A->n, A->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* D_inv = SQRT(abs(diagonal of A)), D = its reciprocal;
+     * the diagonal is read as the last stored entry of each row */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(D, D_inv) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
+        D[i] = 1.0 / D_inv[i];
+    }
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * transformation DAD, where D = D(1./SQRT(D(A))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, D) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.0;
+    }
+
+    /* initial guesses for L and U,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+    /* store U^T in CSR for row-wise access and transpose later */
+    memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+
+    /* L has unit diagonal, by convention */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) private(i) shared(L_temp)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        L_temp->val[L_temp->start[i + 1] - 1] = 1.0;
+    }
+
+    /* each sweep updates L_temp / U_temp in place inside the parallel
+     * loops, reading values that other iterations may be writing;
+     * NOTE(review): presumably the asynchronous update scheme of the
+     * referenced paper -- confirm before reordering anything here */
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero in L */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
+            while ( L_temp->j[x] < L_temp->j[j] &&
+                    L_temp->j[y] < L_temp->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( L_temp->j[x] == L_temp->j[y] )
+                {
+                    sum += (L_temp->val[x] * U_temp->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( L_temp->j[x] < L_temp->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            /* skip the last entry of the row: the unit diagonal of L is kept */
+            if ( j != ei_x - 1 )
+            {
+                L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1];
+            }
+        }
+
+        /* for each nonzero in U (stored as U^T) */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
+            while ( U_temp->j[x] < U_temp->j[j] &&
+                    U_temp->j[y] < U_temp->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U_temp->j[x] == U_temp->j[y] )
+                {
+                    sum += (L_temp->val[y] * U_temp->val[x]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U_temp->j[x] < U_temp->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            U_temp->val[j] = DAD->val[j] - sum;
+        }
+    }
+
+    /* apply inverse transformation:
+     * since DAD \approx LU, then
+     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj)
+#endif
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
+        {
+            L_temp->val[pj] = D_inv[i] * L_temp->val[pj];
+            /* currently storing U^T, so use row index instead of column index */
+            U_temp->val[pj] = U_temp->val[pj] * D_inv[i];
+        }
+    }
+
+    /* apply the dropping rule:
+     * copy the surviving entries of L_temp / U_temp into L / U row by row,
+     * rebuilding the row pointers as the rows shrink */
+    Ltop = 0;
+    Utop = 0;
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        L->start[i] = Ltop;
+        U->start[i] = Utop;
+
+        /* off-diagonals of row i of L: keep entries larger in magnitude
+         * than droptol[i] divided by the row's last (diagonal) entry */
+        for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj )
+        {
+            if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) )
+            {
+                L->j[Ltop] = L_temp->j[pj];
+                L->val[Ltop] = L_temp->val[pj];
+                ++Ltop;
+            }
+        }
+
+        /* diagonal (always kept; pj indexes the last entry of the row here) */
+        L->j[Ltop] = L_temp->j[pj];
+        L->val[Ltop] = L_temp->val[pj];
+        ++Ltop;
+
+        /* same rule for row i of U^T */
+        for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj )
+        {
+            if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) )
+            {
+                U->j[Utop] = U_temp->j[pj];
+                U->val[Utop] = U_temp->val[pj];
+                ++Utop;
+            }
+        }
+
+        /* diagonal (always kept) */
+        U->j[Utop] = U_temp->j[pj];
+        U->val[Utop] = U_temp->val[pj];
+        ++Utop;
+    }
+
+    /* i == DAD->n here: close the last row */
+    L->start[i] = Ltop;
+    U->start[i] = Utop;
+
+    /* U currently holds U^T; transpose it in place */
+    Transpose_I( U );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d\n", L->start[L->n] );
+    fprintf( stderr, "nnz(U): %d\n", U->start[U->n] );
+#endif
+
+    Deallocate_Matrix( U_temp );
+    Deallocate_Matrix( L_temp );
+    Deallocate_Matrix( DAD );
+    free( D_inv );
+    free( D );
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Extrapolate new leading values for the QEq history vectors s and t
+ *
+ * For each of the system->N_cm entries, a new value is formed from the
+ * stored history (cubic formula for s, quadratic formula for t), the
+ * history slots workspace->s[0..4] / workspace->t[0..4] are shifted back
+ * by one, and the new value is written to slot 0.
+ *
+ * control and data are not read by this routine. */
+static void Extrapolate_Charges_QEq( const reax_system * const system,
+        const control_params * const control,
+        simulation_data * const data, static_storage * const workspace )
+{
+    int i;
+    real s_next, t_next;
+
+    /* extrapolation for s & t */
+    //TODO: good candidate for vectorization, avoid moving data with head pointer and circular buffer
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) private(i, s_next, t_next)
+#endif
+    for ( i = 0; i < system->N_cm; ++i )
+    {
+        /* slot counter; block-local, hence private to each thread */
+        int slot;
+
+        /* available predictors for a history x (only the ones below are active):
+         *   none:      x[0]
+         *   linear:    2 * x[0] - x[1]
+         *   quadratic: x[2] + 3 * (x[0] - x[1])
+         *   cubic:     4 * (x[0] + x[2]) - (6 * x[1] + x[3])
+         *   4th order: 5 * (x[0] - x[3]) + 10 * (-x[1] + x[2]) + x[4] */
+
+        /* t: quadratic */
+        t_next = workspace->t[2][i] + 3 * (workspace->t[0][i] - workspace->t[1][i]);
+
+        /* s: cubic */
+        s_next = 4 * (workspace->s[0][i] + workspace->s[2][i]) -
+                (6 * workspace->s[1][i] + workspace->s[3][i] );
+
+        /* age the s history by one slot (oldest first), then store the new value */
+        for ( slot = 4; slot > 0; --slot )
+        {
+            workspace->s[slot][i] = workspace->s[slot - 1][i];
+        }
+        workspace->s[0][i] = s_next;
+
+        /* same for the t history */
+        for ( slot = 4; slot > 0; --slot )
+        {
+            workspace->t[slot][i] = workspace->t[slot - 1][i];
+        }
+        workspace->t[0][i] = t_next;
+    }
+}
+
+
+/* Extrapolate a new leading value for the history vector s
+ *
+ * For each of the system->N_cm entries, a new value is formed from the
+ * stored history with the cubic formula, the history slots
+ * workspace->s[0..4] are shifted back by one, and the new value is
+ * written to slot 0.
+ *
+ * control and data are not read by this routine. */
+static void Extrapolate_Charges_EE( const reax_system * const system,
+        const control_params * const control,
+        simulation_data * const data, static_storage * const workspace )
+{
+    int i;
+    real s_next;
+
+    /* extrapolation for s */
+    //TODO: good candidate for vectorization, avoid moving data with head pointer and circular buffer
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) private(i, s_next)
+#endif
+    for ( i = 0; i < system->N_cm; ++i )
+    {
+        /* slot counter; block-local, hence private to each thread */
+        int slot;
+
+        /* available predictors for the history s (only the cubic one is active):
+         *   none:      s[0]
+         *   linear:    2 * s[0] - s[1]
+         *   quadratic: s[2] + 3 * (s[0] - s[1])
+         *   cubic:     4 * (s[0] + s[2]) - (6 * s[1] + s[3])
+         *   4th order: 5 * (s[0] - s[3]) + 10 * (-s[1] + s[2]) + s[4] */
+
+        /* cubic */
+        s_next = 4 * (workspace->s[0][i] + workspace->s[2][i]) -
+                (6 * workspace->s[1][i] + workspace->s[3][i] );
+
+        /* age the history by one slot (oldest first), then store the new value */
+        for ( slot = 4; slot > 0; --slot )
+        {
+            workspace->s[slot][i] = workspace->s[slot - 1][i];
+        }
+        workspace->s[0][i] = s_next;
+    }
+}
+
+
+/* Compute preconditioner for QEq
+ *
+ * The matrix handed to the preconditioner routines is workspace->H_sp when
+ * control->cm_domain_sparsify_enabled is TRUE and workspace->H otherwise.
+ * When the preconditioner is applied with TRI_SOLVE_GC_PA, the matrix returned
+ * by setup_graph_coloring( ) is used instead and its rows are sorted.
+ *
+ * system: unused here
+ * control: selects the matrix, the computation method
+ *  (cm_solver_pre_comp_type) and the method parameters
+ * data: the timers cm_sort_mat_rows and cm_solver_pre_comp are incremented
+ * workspace: provides H / H_sp and droptol; Hdia_inv or L and U are handed
+ *  to the selected routine
+ * far_nbrs: unused here
+ *
+ * Terminates the program with INVALID_INPUT for an unrecognized method or
+ * when SuperLU MT is requested but was not compiled in.
+ */
+static void Compute_Preconditioner_QEq( const reax_system * const system,
+        const control_params * const control,
+        simulation_data * const data, static_storage * const workspace,
+        const list * const far_nbrs )
+{
+    real time;
+    sparse_matrix *Hptr;
+
+    if ( control->cm_domain_sparsify_enabled == TRUE )
+    {
+        Hptr = workspace->H_sp;
+    }
+    else
+    {
+        Hptr = workspace->H;
+    }
+
+    /* the graph coloring setup and the row sort are charged to the sort timer */
+    time = Get_Time( );
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        if ( control->cm_domain_sparsify_enabled == TRUE )
+        {
+            Hptr = setup_graph_coloring( workspace->H_sp );
+        }
+        else
+        {
+            Hptr = setup_graph_coloring( workspace->H );
+        }
+
+        Sort_Matrix_Rows( Hptr );
+    }
+    data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
+
+#if defined(TEST_MAT)
+    Hptr = create_test_mat( );
+#endif
+
+    /* the value returned by each routine is accumulated in the
+     * preconditioner computation timer */
+    switch ( control->cm_solver_pre_comp_type )
+    {
+        case NONE_PC:
+            break;
+
+        case DIAG_PC:
+            data->timing.cm_solver_pre_comp +=
+                diag_pre_comp( Hptr, workspace->Hdia_inv );
+            break;
+
+        case ICHOLT_PC:
+            data->timing.cm_solver_pre_comp +=
+                ICHOLT( Hptr, workspace->droptol, workspace->L, workspace->U );
+            break;
+
+        case ILU_PAR_PC:
+            data->timing.cm_solver_pre_comp +=
+                ILU_PAR( Hptr, control->cm_solver_pre_comp_sweeps, workspace->L, workspace->U );
+            break;
+
+        case ILUT_PAR_PC:
+            data->timing.cm_solver_pre_comp +=
+                ILUT_PAR( Hptr, workspace->droptol, control->cm_solver_pre_comp_sweeps,
+                        workspace->L, workspace->U );
+            break;
+
+        case ILU_SUPERLU_MT_PC:
+#if defined(HAVE_SUPERLU_MT)
+            data->timing.cm_solver_pre_comp +=
+                SuperLU_Factorize( Hptr, workspace->L, workspace->U );
+#else
+            fprintf( stderr, "SuperLU MT support disabled. Re-compile before enabling. Terminating...\n" );
+            exit( INVALID_INPUT );
+#endif
+            break;
+
+        default:
+            fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+    }
+
+#if defined(DEBUG)
+    /* only the methods that were handed L and U above are inspected */
+    if ( control->cm_solver_pre_comp_type != NONE_PC &&
+            control->cm_solver_pre_comp_type != DIAG_PC )
+    {
+        fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) );
+
+#if defined(DEBUG_FOCUS)
+        {
+            /* block-local buffer for the dump file names; snprintf truncates
+             * instead of writing past the end for long simulation names */
+            char fname[1024];
+
+            snprintf( fname, sizeof(fname), "%s.L%d.out", control->sim_name, data->step );
+            Print_Sparse_Matrix2( workspace->L, fname, NULL );
+            snprintf( fname, sizeof(fname), "%s.U%d.out", control->sim_name, data->step );
+            Print_Sparse_Matrix2( workspace->U, fname, NULL );
+        }
+#endif
+    }
+#endif
+}
+
+
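+/* Compute preconditioner for EE -- disabled (commented-out) variant that works
+ * on workspace->H_EE, L_EE and U_EE; the active implementation is defined after it
+ */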
+//static void Compute_Preconditioner_EE( const reax_system * const system,
+//        const control_params * const control,
+//        simulation_data * const data, static_storage * const workspace,
+//        const list * const far_nbrs )
+//{
+//    int i, top;
+//    static real * ones = NULL, * x = NULL, * y = NULL;
+//    sparse_matrix *Hptr;
+//
+//    Hptr = workspace->H_EE;
+//
+//#if defined(TEST_MAT)
+//    Hptr = create_test_mat( );
+//#endif
+//
+//    if ( ones == NULL )
+//    {
+//        if ( ( ones = (real*) malloc( system->N * sizeof(real)) ) == NULL ||
+//            ( x = (real*) malloc( system->N * sizeof(real)) ) == NULL ||
+//            ( y = (real*) malloc( system->N * sizeof(real)) ) == NULL )
+//        {
+//            fprintf( stderr, "Not enough space for preconditioner computation. Terminating...\n" );
+//            exit( INSUFFICIENT_MEMORY );
+//        }
+//
+//        for ( i = 0; i < system->N; ++i )
+//        {
+//            ones[i] = 1.0;
+//        }
+//    }
+//
+//    switch ( control->cm_solver_pre_comp_type )
+//    {
+//    case DIAG_PC:
+//        data->timing.cm_solver_pre_comp +=
+//            diag_pre_comp( Hptr, workspace->Hdia_inv );
+//        break;
+//
+//    case ICHOLT_PC:
+//        data->timing.cm_solver_pre_comp +=
+//            ICHOLT( Hptr, workspace->droptol, workspace->L_EE, workspace->U_EE );
+//        break;
+//
+//    case ILU_PAR_PC:
+//        data->timing.cm_solver_pre_comp +=
+//            ILU_PAR( Hptr, control->cm_solver_pre_comp_sweeps, workspace->L_EE, workspace->U_EE );
+//        break;
+//
+//    case ILUT_PAR_PC:
+//        data->timing.cm_solver_pre_comp +=
+//            ILUT_PAR( Hptr, workspace->droptol, control->cm_solver_pre_comp_sweeps,
+//                    workspace->L_EE, workspace->U_EE );
+//        break;
+//
+//    case ILU_SUPERLU_MT_PC:
+//#if defined(HAVE_SUPERLU_MT)
+//        data->timing.cm_solver_pre_comp +=
+//            SuperLU_Factorize( Hptr, workspace->L_EE, workspace->U_EE );
+//#else
+//        fprintf( stderr, "SuperLU MT support disabled. Re-compile before enabling. Terminating...\n" );
+//        exit( INVALID_INPUT );
+//#endif
+//        break;
+//
+//    default:
+//        fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
+//        exit( INVALID_INPUT );
+//        break;
+//    }
+//
+//    if ( control->cm_solver_pre_comp_type != DIAG_PC )
+//    {
+//        switch ( control->cm_solver_pre_app_type )
+//        {
+//            case TRI_SOLVE_PA:
+//                tri_solve( workspace->L_EE, ones, x, workspace->L_EE->n, LOWER );
+//                Transpose_I( workspace->U_EE );
+//                tri_solve( workspace->U_EE, ones, y, workspace->U_EE->n, LOWER );
+//                Transpose_I( workspace->U_EE );
+//
+//                memcpy( workspace->L->start, workspace->L_EE->start, sizeof(unsigned int) * (system->N + 1) );
+//                memcpy( workspace->L->j, workspace->L_EE->j, sizeof(unsigned int) * workspace->L_EE->start[workspace->L_EE->n] );
+//                memcpy( workspace->L->val, workspace->L_EE->val, sizeof(real) * workspace->L_EE->start[workspace->L_EE->n] );
+//
+//                top = workspace->L->start[system->N];
+//                for ( i = 0; i < system->N; ++i )
+//                {
+//                    workspace->L->j[top] = i;
+//                    workspace->L->val[top] = x[i];
+//                    ++top;
+//                }
+//
+//                workspace->L->j[top] = system->N_cm - 1;
+//                workspace->L->val[top] = 1.0;
+//                ++top;
+//
+//                workspace->L->start[system->N_cm] = top;
+//
+//                top = 0;
+//                for ( i = 0; i < system->N; ++i )
+//                {
+//                    workspace->U->start[i] = top;
+//                    memcpy( workspace->U->j + top, workspace->U_EE->j + workspace->U_EE->start[i],
+//                            sizeof(unsigned int) * (workspace->U_EE->start[i + 1] - workspace->U_EE->start[i]) );
+//                    memcpy( workspace->U->val + top, workspace->U_EE->val + workspace->U_EE->start[i],
+//                            sizeof(real) * (workspace->U_EE->start[i + 1] - workspace->U_EE->start[i]) );
+//                    top += (workspace->U_EE->start[i + 1] - workspace->U_EE->start[i]);
+//
+//                    workspace->U->j[top] = system->N_cm - 1;
+//                    workspace->U->val[top] = y[i];
+//                    ++top;
+//                }
+//
+//                workspace->U->start[system->N_cm - 1] = top;
+//
+//                workspace->U->j[top] = system->N_cm - 1;
+//                workspace->U->val[top] = -Dot( x, y, system->N );
+//                ++top;
+//
+//                workspace->U->start[system->N_cm] = top;
+//                break;
+//
+//            case TRI_SOLVE_LEVEL_SCHED_PA:
+//                tri_solve_level_sched( workspace->L_EE, ones, x, workspace->L_EE->n, LOWER, TRUE );
+//                Transpose_I( workspace->U_EE );
+//                tri_solve_level_sched( workspace->U_EE, ones, y, workspace->U_EE->n, LOWER, TRUE );
+//                Transpose_I( workspace->U_EE );
+//
+//                memcpy( workspace->L->start, workspace->L_EE->start, sizeof(unsigned int) * (system->N + 1) );
+//                memcpy( workspace->L->j, workspace->L_EE->j, sizeof(unsigned int) * workspace->L_EE->start[workspace->L_EE->n] );
+//                memcpy( workspace->L->val, workspace->L_EE->val, sizeof(real) * workspace->L_EE->start[workspace->L_EE->n] );
+//
+//                top = workspace->L->start[system->N];
+//                for ( i = 0; i < system->N; ++i )
+//                {
+//                    workspace->L->j[top] = i;
+//                    workspace->L->val[top] = x[i];
+//                    ++top;
+//                }
+//
+//                workspace->L->j[top] = system->N_cm - 1;
+//                workspace->L->val[top] = 1.0;
+//                ++top;
+//
+//                workspace->L->start[system->N_cm] = top;
+//
+//                top = 0;
+//                for ( i = 0; i < system->N; ++i )
+//                {
+//                    workspace->U->start[i] = top;
+//                    memcpy( workspace->U->j + top, workspace->U_EE->j + workspace->U_EE->start[i],
+//                            sizeof(unsigned int) * (workspace->U_EE->start[i + 1] - workspace->U_EE->start[i]) );
+//                    memcpy( workspace->U->val + top, workspace->U_EE->val + workspace->U_EE->start[i],
+//                            sizeof(real) * (workspace->U_EE->start[i + 1] - workspace->U_EE->start[i]) );
+//                    top += (workspace->U_EE->start[i + 1] - workspace->U_EE->start[i]);
+//
+//                    workspace->U->j[top] = system->N_cm - 1;
+//                    workspace->U->val[top] = y[i];
+//                    ++top;
+//                }
+//
+//                workspace->U->start[system->N_cm - 1] = top;
+//
+//                workspace->U->j[top] = system->N_cm - 1;
+//                workspace->U->val[top] = -Dot( x, y, system->N );
+//                ++top;
+//
+//                workspace->U->start[system->N_cm] = top;
+//                break;
+//
+//            //TODO: add Jacobi iter, etc.?
+//            default:
+//                tri_solve( workspace->L_EE, ones, x, workspace->L_EE->n, LOWER );
+//                Transpose_I( workspace->U_EE );
+//                tri_solve( workspace->U_EE, ones, y, workspace->U_EE->n, LOWER );
+//                Transpose_I( workspace->U_EE );
+//
+//                memcpy( workspace->L->start, workspace->L_EE->start, sizeof(unsigned int) * (system->N + 1) );
+//                memcpy( workspace->L->j, workspace->L_EE->j, sizeof(unsigned int) * workspace->L_EE->start[workspace->L_EE->n] );
+//                memcpy( workspace->L->val, workspace->L_EE->val, sizeof(real) * workspace->L_EE->start[workspace->L_EE->n] );
+//
+//                top = workspace->L->start[system->N];
+//                for ( i = 0; i < system->N; ++i )
+//                {
+//                    workspace->L->j[top] = i;
+//                    workspace->L->val[top] = x[i];
+//                    ++top;
+//                }
+//
+//                workspace->L->j[top] = system->N_cm - 1;
+//                workspace->L->val[top] = 1.0;
+//                ++top;
+//
+//                workspace->L->start[system->N_cm] = top;
+//
+//                top = 0;
+//                for ( i = 0; i < system->N; ++i )
+//                {
+//                    workspace->U->start[i] = top;
+//                    memcpy( workspace->U->j + top, workspace->U_EE->j + workspace->U_EE->start[i],
+//                            sizeof(unsigned int) * (workspace->U_EE->start[i + 1] - workspace->U_EE->start[i]) );
+//                    memcpy( workspace->U->val + top, workspace->U_EE->val + workspace->U_EE->start[i],
+//                            sizeof(real) * (workspace->U_EE->start[i + 1] - workspace->U_EE->start[i]) );
+//                    top += (workspace->U_EE->start[i + 1] - workspace->U_EE->start[i]);
+//
+//                    workspace->U->j[top] = system->N_cm - 1;
+//                    workspace->U->val[top] = y[i];
+//                    ++top;
+//                }
+//
+//                workspace->U->start[system->N_cm - 1] = top;
+//
+//                workspace->U->j[top] = system->N_cm - 1;
+//                workspace->U->val[top] = -Dot( x, y, system->N );
+//                ++top;
+//
+//                workspace->U->start[system->N_cm] = top;
+//                break;
+//        }
+//    }
+//
+//#if defined(DEBUG)
+//    if ( control->cm_solver_pre_comp_type != DIAG_PC )
+//    {
+//        fprintf( stderr, "condest = %f\n", condest(workspace->L) );
+//
+//#if defined(DEBUG_FOCUS)
+//        sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+//        Print_Sparse_Matrix2( workspace->L, fname, NULL );
+//        sprintf( fname, "%s.U%d.out", control->sim_name, data->step );
+//        Print_Sparse_Matrix2( workspace->U, fname, NULL );
+//
+//        fprintf( stderr, "icholt-" );
+//        sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+//        Print_Sparse_Matrix2( workspace->L, fname, NULL );
+//        Print_Sparse_Matrix( U );
+//#endif
+//    }
+//#endif
+//}
+
+
+/* Compute preconditioner for EE
+ *
+ * The matrix handed to the preconditioner routines is workspace->H_sp when
+ * control->cm_domain_sparsify_enabled is TRUE and workspace->H otherwise.
+ * When the preconditioner is applied with TRI_SOLVE_GC_PA, the matrix returned
+ * by setup_graph_coloring( ) is used instead and its rows are sorted.
+ *
+ * While the preconditioner is computed, the last stored entry of row
+ * system->N (index start[N + 1] - 1) is set to 1.0; afterwards it is set
+ * to 0.0.
+ *
+ * system: supplies N, used to locate the temporarily modified entry
+ * control: selects the matrix, the computation method
+ *  (cm_solver_pre_comp_type) and the method parameters
+ * data: the timers cm_sort_mat_rows and cm_solver_pre_comp are incremented
+ * workspace: provides H / H_sp and droptol; Hdia_inv or L and U are handed
+ *  to the selected routine
+ * far_nbrs: unused here
+ *
+ * Terminates the program with INVALID_INPUT for an unrecognized method or
+ * when SuperLU MT is requested but was not compiled in.
+ */
+static void Compute_Preconditioner_EE( const reax_system * const system,
+        const control_params * const control,
+        simulation_data * const data, static_storage * const workspace,
+        const list * const far_nbrs )
+{
+    real time;
+    sparse_matrix *Hptr;
+
+    if ( control->cm_domain_sparsify_enabled == TRUE )
+    {
+        Hptr = workspace->H_sp;
+    }
+    else
+    {
+        Hptr = workspace->H;
+    }
+
+    /* the graph coloring setup and the row sort are charged to the sort timer */
+    time = Get_Time( );
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        if ( control->cm_domain_sparsify_enabled == TRUE )
+        {
+            Hptr = setup_graph_coloring( workspace->H_sp );
+        }
+        else
+        {
+            Hptr = setup_graph_coloring( workspace->H );
+        }
+
+        Sort_Matrix_Rows( Hptr );
+    }
+    data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
+
+#if defined(TEST_MAT)
+    Hptr = create_test_mat( );
+#endif
+
+    /* temporary value for the last entry of row N
+     * (presumably the diagonal of the constraint row -- TODO confirm) */
+    Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
+
+    /* the value returned by each routine is accumulated in the
+     * preconditioner computation timer */
+    switch ( control->cm_solver_pre_comp_type )
+    {
+        case NONE_PC:
+            break;
+
+        case DIAG_PC:
+            data->timing.cm_solver_pre_comp +=
+                diag_pre_comp( Hptr, workspace->Hdia_inv );
+            break;
+
+        case ICHOLT_PC:
+            data->timing.cm_solver_pre_comp +=
+                ICHOLT( Hptr, workspace->droptol, workspace->L, workspace->U );
+            break;
+
+        case ILU_PAR_PC:
+            data->timing.cm_solver_pre_comp +=
+                ILU_PAR( Hptr, control->cm_solver_pre_comp_sweeps, workspace->L, workspace->U );
+            break;
+
+        case ILUT_PAR_PC:
+            data->timing.cm_solver_pre_comp +=
+                ILUT_PAR( Hptr, workspace->droptol, control->cm_solver_pre_comp_sweeps,
+                        workspace->L, workspace->U );
+            break;
+
+        case ILU_SUPERLU_MT_PC:
+#if defined(HAVE_SUPERLU_MT)
+            data->timing.cm_solver_pre_comp +=
+                SuperLU_Factorize( Hptr, workspace->L, workspace->U );
+#else
+            fprintf( stderr, "SuperLU MT support disabled. Re-compile before enabling. Terminating...\n" );
+            exit( INVALID_INPUT );
+#endif
+            break;
+
+        default:
+            fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+    }
+
+    /* reset the entry modified above */
+    Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+
+#if defined(DEBUG)
+    /* only the methods that were handed L and U above are inspected */
+    if ( control->cm_solver_pre_comp_type != NONE_PC &&
+            control->cm_solver_pre_comp_type != DIAG_PC )
+    {
+        fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) );
+
+#if defined(DEBUG_FOCUS)
+        {
+            /* block-local buffer for the dump file names; snprintf truncates
+             * instead of writing past the end for long simulation names */
+            char fname[1024];
+
+            snprintf( fname, sizeof(fname), "%s.L%d.out", control->sim_name, data->step );
+            Print_Sparse_Matrix2( workspace->L, fname, NULL );
+            snprintf( fname, sizeof(fname), "%s.U%d.out", control->sim_name, data->step );
+            Print_Sparse_Matrix2( workspace->U, fname, NULL );
+        }
+#endif
+    }
+#endif
+}
+
+
+/* Compute preconditioner for ACKS2
+ *
+ * The matrix handed to the preconditioner routines is workspace->H_sp when
+ * control->cm_domain_sparsify_enabled is TRUE and workspace->H otherwise.
+ * When the preconditioner is applied with TRI_SOLVE_GC_PA, the matrix returned
+ * by setup_graph_coloring( ) is used instead and its rows are sorted.
+ *
+ * While the preconditioner is computed, the last stored entries of row
+ * system->N (index start[N + 1] - 1) and of row system->N_cm - 1
+ * (index start[N_cm] - 1) are set to 1.0; afterwards both are set to 0.0.
+ *
+ * system: supplies N and N_cm, used to locate the temporarily modified entries
+ * control: selects the matrix, the computation method
+ *  (cm_solver_pre_comp_type) and the method parameters
+ * data: the timers cm_sort_mat_rows and cm_solver_pre_comp are incremented
+ * workspace: provides H / H_sp and droptol; Hdia_inv or L and U are handed
+ *  to the selected routine
+ * far_nbrs: unused here
+ *
+ * Terminates the program with INVALID_INPUT for an unrecognized method or
+ * when SuperLU MT is requested but was not compiled in.
+ */
+static void Compute_Preconditioner_ACKS2( const reax_system * const system,
+        const control_params * const control,
+        simulation_data * const data, static_storage * const workspace,
+        const list * const far_nbrs )
+{
+    real time;
+    sparse_matrix *Hptr;
+
+    if ( control->cm_domain_sparsify_enabled == TRUE )
+    {
+        Hptr = workspace->H_sp;
+    }
+    else
+    {
+        Hptr = workspace->H;
+    }
+
+    /* the graph coloring setup and the row sort are charged to the sort timer */
+    time = Get_Time( );
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        if ( control->cm_domain_sparsify_enabled == TRUE )
+        {
+            Hptr = setup_graph_coloring( workspace->H_sp );
+        }
+        else
+        {
+            Hptr = setup_graph_coloring( workspace->H );
+        }
+
+        Sort_Matrix_Rows( Hptr );
+    }
+    data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
+
+#if defined(TEST_MAT)
+    Hptr = create_test_mat( );
+#endif
+
+    /* temporary values for the last entries of rows N and N_cm - 1
+     * (presumably the diagonals of the constraint rows -- TODO confirm) */
+    Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
+    Hptr->val[Hptr->start[system->N_cm] - 1] = 1.0;
+
+    /* the value returned by each routine is accumulated in the
+     * preconditioner computation timer */
+    switch ( control->cm_solver_pre_comp_type )
+    {
+        case NONE_PC:
+            break;
+
+        case DIAG_PC:
+            data->timing.cm_solver_pre_comp +=
+                diag_pre_comp( Hptr, workspace->Hdia_inv );
+            break;
+
+        case ICHOLT_PC:
+            data->timing.cm_solver_pre_comp +=
+                ICHOLT( Hptr, workspace->droptol, workspace->L, workspace->U );
+            break;
+
+        case ILU_PAR_PC:
+            data->timing.cm_solver_pre_comp +=
+                ILU_PAR( Hptr, control->cm_solver_pre_comp_sweeps, workspace->L, workspace->U );
+            break;
+
+        case ILUT_PAR_PC:
+            data->timing.cm_solver_pre_comp +=
+                ILUT_PAR( Hptr, workspace->droptol, control->cm_solver_pre_comp_sweeps,
+                        workspace->L, workspace->U );
+            break;
+
+        case ILU_SUPERLU_MT_PC:
+#if defined(HAVE_SUPERLU_MT)
+            data->timing.cm_solver_pre_comp +=
+                SuperLU_Factorize( Hptr, workspace->L, workspace->U );
+#else
+            fprintf( stderr, "SuperLU MT support disabled. Re-compile before enabling. Terminating...\n" );
+            exit( INVALID_INPUT );
+#endif
+            break;
+
+        default:
+            fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+    }
+
+    /* reset the entries modified above */
+    Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+    Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0;
+
+#if defined(DEBUG)
+    /* both tests must hold (logical AND): NONE_PC and DIAG_PC were not handed
+     * L and U above, so those factors are not inspected for them */
+    if ( control->cm_solver_pre_comp_type != NONE_PC &&
+            control->cm_solver_pre_comp_type != DIAG_PC )
+    {
+        fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) );
+
+#if defined(DEBUG_FOCUS)
+        {
+            /* block-local buffer for the dump file names; snprintf truncates
+             * instead of writing past the end for long simulation names */
+            char fname[1024];
+
+            snprintf( fname, sizeof(fname), "%s.L%d.out", control->sim_name, data->step );
+            Print_Sparse_Matrix2( workspace->L, fname, NULL );
+            snprintf( fname, sizeof(fname), "%s.U%d.out", control->sim_name, data->step );
+            Print_Sparse_Matrix2( workspace->U, fname, NULL );
+        }
+#endif
+    }
+#endif
+}
+
+
+/* Setup routines before computing the preconditioner for QEq
+ *
+ * Sorts the rows of workspace->H (and of workspace->H_sp when
+ * control->cm_domain_sparsify_enabled is TRUE), then prepares what the
+ * selected preconditioner needs: drop tolerances for ICHOLT_PC and
+ * ILUT_PAR_PC, and storage (Hdia_inv, or L and U) the first time it is
+ * needed, i.e. while the corresponding pointer is still NULL.
+ *
+ * system: unused here
+ * control: selects the matrix, the computation method
+ *  (cm_solver_pre_comp_type) and the drop tolerance parameter
+ * data: the timer cm_sort_mat_rows is incremented
+ * workspace: provides H / H_sp and droptol; Hdia_inv, L and U are allocated
+ * far_nbrs: unused here
+ *
+ * Terminates the program with INSUFFICIENT_MEMORY if an allocation fails and
+ * with INVALID_INPUT for an unrecognized method.
+ */
+static void Setup_Preconditioner_QEq( const reax_system * const system,
+        const control_params * const control,
+        simulation_data * const data, static_storage * const workspace,
+        const list * const far_nbrs )
+{
+    int fillin;
+    real time;
+    sparse_matrix *Hptr;
+
+    if ( control->cm_domain_sparsify_enabled == TRUE )
+    {
+        Hptr = workspace->H_sp;
+    }
+    else
+    {
+        Hptr = workspace->H;
+    }
+
+    /* sort H needed for SpMV's in linear solver, H or H_sp needed for preconditioning */
+    time = Get_Time( );
+    Sort_Matrix_Rows( workspace->H );
+    if ( control->cm_domain_sparsify_enabled == TRUE )
+    {
+        Sort_Matrix_Rows( workspace->H_sp );
+    }
+    data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
+
+#if defined(DEBUG)
+    fprintf( stderr, "H matrix sorted\n" );
+#endif
+
+    switch ( control->cm_solver_pre_comp_type )
+    {
+        case NONE_PC:
+            break;
+
+        case DIAG_PC:
+            if ( workspace->Hdia_inv == NULL )
+            {
+                if ( ( workspace->Hdia_inv = (real *) calloc( Hptr->n, sizeof( real ) ) ) == NULL )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+            break;
+
+        case ICHOLT_PC:
+            Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
+
+#if defined(DEBUG_FOCUS)
+            fprintf( stderr, "drop tolerances calculated\n" );
+#endif
+
+            fillin = Estimate_LU_Fill( Hptr, workspace->droptol );
+
+#if defined(DEBUG)
+            fprintf( stderr, "fillin = %d\n", fillin );
+            /* the size expression has type size_t, so it is converted
+             * explicitly to match the conversion specifier */
+            fprintf( stderr, "allocated memory: L = U = %luMB\n",
+                     (unsigned long) (fillin * (sizeof(real) + sizeof(unsigned int)) / (1024 * 1024)) );
+#endif
+
+            if ( workspace->L == NULL )
+            {
+                if ( Allocate_Matrix( &(workspace->L), Hptr->n, fillin ) == FAILURE ||
+                        Allocate_Matrix( &(workspace->U), Hptr->n, fillin ) == FAILURE )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+
+            }
+            else
+            {
+                //TODO: reallocate
+            }
+            break;
+
+        case ILUT_PAR_PC:
+            Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
+
+#if defined(DEBUG_FOCUS)
+            fprintf( stderr, "drop tolerances calculated\n" );
+#endif
+
+            /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */
+            /* fall through */
+        case ILU_PAR_PC:
+        case ILU_SUPERLU_MT_PC:
+            if ( workspace->L == NULL )
+            {
+                /* factors have sparsity pattern as H */
+                if ( Allocate_Matrix( &(workspace->L), Hptr->n, Hptr->m ) == FAILURE ||
+                        Allocate_Matrix( &(workspace->U), Hptr->n, Hptr->m ) == FAILURE )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+            else
+            {
+                //TODO: reallocate
+            }
+            break;
+
+        default:
+            fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+    }
+}
+
+
+/* Setup routines before computing the preconditioner for EE
+ *
+ * Sorts the rows of workspace->H (and of workspace->H_sp when
+ * control->cm_domain_sparsify_enabled is TRUE), then prepares what the
+ * selected preconditioner needs: drop tolerances for ICHOLT_PC and
+ * ILUT_PAR_PC, and storage (Hdia_inv, or L and U) the first time it is
+ * needed, i.e. while the corresponding pointer is still NULL.
+ *
+ * During the setup, the last stored entry of row system->N
+ * (index start[N + 1] - 1) is set to 1.0; it is set to 0.0 before returning.
+ *
+ * system: supplies N (modified entry) and N_cm (allocation sizes)
+ * control: selects the matrix, the computation method
+ *  (cm_solver_pre_comp_type) and the drop tolerance parameter
+ * data: the timer cm_sort_mat_rows is incremented
+ * workspace: provides H / H_sp and droptol; Hdia_inv, L and U are allocated
+ * far_nbrs: unused here
+ *
+ * Terminates the program with INSUFFICIENT_MEMORY if an allocation fails and
+ * with INVALID_INPUT for an unrecognized method.
+ */
+static void Setup_Preconditioner_EE( const reax_system * const system,
+        const control_params * const control,
+        simulation_data * const data, static_storage * const workspace,
+        const list * const far_nbrs )
+{
+    int fillin;
+    real time;
+    sparse_matrix *Hptr;
+
+    if ( control->cm_domain_sparsify_enabled == TRUE )
+    {
+        Hptr = workspace->H_sp;
+    }
+    else
+    {
+        Hptr = workspace->H;
+    }
+
+    /* sorted H needed for SpMV's in linear solver, H or H_sp needed for preconditioning */
+    time = Get_Time( );
+    Sort_Matrix_Rows( workspace->H );
+    if ( control->cm_domain_sparsify_enabled == TRUE )
+    {
+        Sort_Matrix_Rows( workspace->H_sp );
+    }
+    data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
+
+    /* temporary value for the last entry of row N, written after the sort
+     * (presumably the diagonal of the constraint row -- TODO confirm) */
+    Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
+
+#if defined(DEBUG)
+    fprintf( stderr, "H matrix sorted\n" );
+#endif
+
+    switch ( control->cm_solver_pre_comp_type )
+    {
+        case NONE_PC:
+            break;
+
+        case DIAG_PC:
+            if ( workspace->Hdia_inv == NULL )
+            {
+                if ( ( workspace->Hdia_inv = (real *) calloc( system->N_cm, sizeof( real ) ) ) == NULL )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+            break;
+
+        case ICHOLT_PC:
+            Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
+
+#if defined(DEBUG_FOCUS)
+            fprintf( stderr, "drop tolerances calculated\n" );
+#endif
+
+            fillin = Estimate_LU_Fill( Hptr, workspace->droptol );
+
+#if defined(DEBUG)
+            fprintf( stderr, "fillin = %d\n", fillin );
+            /* the size expression has type size_t, so it is converted
+             * explicitly to match the conversion specifier */
+            fprintf( stderr, "allocated memory: L = U = %luMB\n",
+                     (unsigned long) (fillin * (sizeof(real) + sizeof(unsigned int)) / (1024 * 1024)) );
+#endif
+
+            if ( workspace->L == NULL )
+            {
+                /* N_cm rows, and N_cm entries on top of the fill-in estimate */
+                if ( Allocate_Matrix( &(workspace->L), system->N_cm, fillin + system->N_cm ) == FAILURE ||
+                        Allocate_Matrix( &(workspace->U), system->N_cm, fillin + system->N_cm ) == FAILURE )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+
+            }
+            else
+            {
+                //TODO: reallocate
+            }
+            break;
+
+        case ILUT_PAR_PC:
+            Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
+
+#if defined(DEBUG_FOCUS)
+            fprintf( stderr, "drop tolerances calculated\n" );
+#endif
+
+            /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */
+            /* fall through */
+        case ILU_PAR_PC:
+        case ILU_SUPERLU_MT_PC:
+            if ( workspace->L == NULL )
+            {
+                /* factors have sparsity pattern as H */
+                if ( Allocate_Matrix( &(workspace->L), Hptr->n, Hptr->m ) == FAILURE ||
+                        Allocate_Matrix( &(workspace->U), Hptr->n, Hptr->m ) == FAILURE )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+            else
+            {
+                //TODO: reallocate
+            }
+            break;
+
+        default:
+            fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+    }
+
+    /* reset the entry modified above */
+    Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+}
+
+
+/* Setup routines before computing the preconditioner for ACKS2: sorts the matrix rows, computes drop
+ * tolerances where the preconditioner type uses them, and allocates not-yet-allocated storage (far_nbrs is unused) */
+static void Setup_Preconditioner_ACKS2( const reax_system * const system,
+        const control_params * const control,
+        simulation_data * const data, static_storage * const workspace,
+        const list * const far_nbrs )
+{
+    int fillin;
+    real time;
+    sparse_matrix *Hptr;
+    /* Hptr: matrix the preconditioner storage is sized from (H_sp if sparsification is enabled, else H) */
+    if ( control->cm_domain_sparsify_enabled == TRUE )
+    {
+        Hptr = workspace->H_sp;
+    }
+    else
+    {
+        Hptr = workspace->H;
+    }
+
+    /* sort H needed for SpMV's in linear solver, H or H_sp needed for preconditioning */
+    time = Get_Time( );
+    Sort_Matrix_Rows( workspace->H );
+    if ( control->cm_domain_sparsify_enabled == TRUE )
+    {
+        Sort_Matrix_Rows( workspace->H_sp );
+    }
+    data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
+    /* temporarily set entries start[N + 1] - 1 and start[N_cm] - 1 to 1.0 (presumably the last entries of rows N and N_cm - 1); reset to 0.0 on exit */
+    Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
+    Hptr->val[Hptr->start[system->N_cm] - 1] = 1.0;
+
+#if defined(DEBUG)
+    fprintf( stderr, "H matrix sorted\n" );
+#endif
+
+    switch ( control->cm_solver_pre_comp_type )
+    {
+        case NONE_PC:
+            break;
+
+        case DIAG_PC:
+            if ( workspace->Hdia_inv == NULL )
+            {
+                if ( ( workspace->Hdia_inv = (real *) calloc( Hptr->n, sizeof( real ) ) ) == NULL )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+            break;
+
+        case ICHOLT_PC:
+            Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
+
+#if defined(DEBUG_FOCUS)
+            fprintf( stderr, "drop tolerances calculated\n" );
+#endif
+
+            fillin = Estimate_LU_Fill( Hptr, workspace->droptol );
+            /* the size expression printed below has type size_t, hence the cast to match %ld */
+#if defined(DEBUG)
+            fprintf( stderr, "fillin = %d\n", fillin );
+            fprintf( stderr, "allocated memory: L = U = %ldMB\n",
+                     (long) (fillin * (sizeof(real) + sizeof(unsigned int)) / (1024 * 1024)) );
+#endif
+
+            if ( workspace->L == NULL )
+            {
+                if ( Allocate_Matrix( &(workspace->L), Hptr->n, fillin ) == FAILURE ||
+                        Allocate_Matrix( &(workspace->U), Hptr->n, fillin ) == FAILURE )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+            else
+            {
+                //TODO: reallocate (existing L / U are reused unchanged for now, even if fillin differs)
+            }
+            break;
+
+        case ILU_PAR_PC:
+            if ( workspace->L == NULL )
+            {
+                /* factors have sparsity pattern as H */
+                if ( Allocate_Matrix( &(workspace->L), Hptr->n, Hptr->m ) == FAILURE ||
+                        Allocate_Matrix( &(workspace->U), Hptr->n, Hptr->m ) == FAILURE )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+            else
+            {
+                //TODO: reallocate (existing L / U are reused unchanged for now)
+            }
+            break;
+
+        case ILUT_PAR_PC:
+            Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
+
+#if defined(DEBUG_FOCUS)
+            fprintf( stderr, "drop tolerances calculated\n" );
+#endif
+
+            if ( workspace->L == NULL )
+            {
+                /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */
+                if ( Allocate_Matrix( &(workspace->L), Hptr->n, Hptr->m ) == FAILURE ||
+                        Allocate_Matrix( &(workspace->U), Hptr->n, Hptr->m ) == FAILURE )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+            else
+            {
+                //TODO: reallocate (existing L / U are reused unchanged for now)
+            }
+            break;
+
+        case ILU_SUPERLU_MT_PC:
+            if ( workspace->L == NULL )
+            {
+                /* factors have sparsity pattern as H */
+                if ( Allocate_Matrix( &(workspace->L), Hptr->n, Hptr->m ) == FAILURE ||
+                        Allocate_Matrix( &(workspace->U), Hptr->n, Hptr->m ) == FAILURE )
+                {
+                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+            else
+            {
+                //TODO: reallocate (existing L / U are reused unchanged for now)
+            }
+            break;
+
+        default:
+            fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+    }
+    /* restore the two entries that were temporarily set to 1.0 above */
+    Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+    Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0;
+}
+
+
+/* Combine the fictitious charge vectors s and t into the atomic charges q for
+ * the QEq method: q_i = s_i - (sum(s) / sum(t)) * t_i, sums over the N_cm entries */
+static void Calculate_Charges_QEq( const reax_system * const system,
+        static_storage * const workspace )
+{
+    int k;
+    real ratio, sum_s = 0., sum_t = 0.;
+
+    /* single pass accumulating the sums of both solution vectors */
+    for ( k = 0; k < system->N_cm; ++k )
+    {
+        sum_s += workspace->s[0][k];
+        sum_t += workspace->t[0][k];
+    }
+    ratio = sum_s / sum_t;
+
+    /* scale t by the ratio of the sums and subtract it from s */
+    for ( k = 0; k < system->N_cm; ++k )
+    {
+        system->atoms[k].q = workspace->s[0][k] - ratio * workspace->t[0][k];
+#if defined(DEBUG_FOCUS)
+        printf("atom %4d: %f\n", k, system->atoms[k].q);
+        printf("  x[0]: %10.5f, x[1]: %10.5f, x[2]:  %10.5f\n",
+                system->atoms[k].x[0], system->atoms[k].x[1], system->atoms[k].x[2]);
+#endif
+    }
+}
+
+
+/* Set the atomic charges q for the EE method: the first N entries of the
+ * vector s[0] are copied, one per atom, into system->atoms[].q */
+static void Calculate_Charges_EE( const reax_system * const system,
+        static_storage * const workspace )
+{
+    int k = system->N;
+
+    while ( k-- > 0 )
+    {
+        system->atoms[k].q = workspace->s[0][k];
+    }
+}
+
+
+/* Main driver method for QEq kernel
+ *
+ * Rough outline:
+ *  1) init / setup routines for preconditioning of linear solver
+ *  2) compute preconditioner
+ *  3) extrapolate charges
+ *  4) perform 2 linear solves (right-hand side b_s with s[0], then b_t with t[0])
+ *  5) compute atomic charges based on output of (4)
+ *
+ * Steps (1)-(2) run only on refactoring steps (cm_solver_pre_comp_refactor > 0 and
+ * dividing the steps since data->prev_steps); only the first solve receives that flag.
+ * Solver iterations are accumulated into data->timing.cm_solver_iters. out_control is
+ * unused. Exits with INVALID_INPUT on an unrecognized cm_solver_type.
+ */
+static void QEq( reax_system * const system, control_params * const control,
+        simulation_data * const data, static_storage * const workspace,
+        const list * const far_nbrs, const output_controls * const out_control )
+{
+    int iters, refactor;
+
+    /* evaluated once so the preconditioner computation and the solver calls cannot disagree */
+    refactor = (control->cm_solver_pre_comp_refactor > 0 &&
+            ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0)) ? TRUE : FALSE;
+
+    if ( refactor == TRUE )
+    {
+        Setup_Preconditioner_QEq( system, control, data, workspace, far_nbrs );
+        Compute_Preconditioner_QEq( system, control, data, workspace, far_nbrs );
+    }
+
+    Extrapolate_Charges_QEq( system, control, data, workspace );
+
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+        iters = GMRES( workspace, control, data, workspace->H,
+                workspace->b_s, control->cm_solver_q_err, workspace->s[0], refactor );
+        iters += GMRES( workspace, control, data, workspace->H,
+                workspace->b_t, control->cm_solver_q_err, workspace->t[0], FALSE );
+        break;
+
+    case GMRES_H_S:
+        iters = GMRES_HouseHolder( workspace, control, data, workspace->H,
+                workspace->b_s, control->cm_solver_q_err, workspace->s[0], refactor );
+        iters += GMRES_HouseHolder( workspace, control, data, workspace->H,
+                workspace->b_t, control->cm_solver_q_err, workspace->t[0], FALSE );
+        break;
+
+    case CG_S:
+        iters = CG( workspace, control, workspace->H, workspace->b_s, control->cm_solver_q_err,
+                workspace->s[0], refactor ) + 1;
+        iters += CG( workspace, control, workspace->H, workspace->b_t, control->cm_solver_q_err,
+                workspace->t[0], FALSE ) + 1;
+        break;
+
+    case SDM_S:
+        iters = SDM( workspace, control, workspace->H, workspace->b_s, control->cm_solver_q_err,
+                workspace->s[0], refactor ) + 1;
+        iters += SDM( workspace, control, workspace->H, workspace->b_t, control->cm_solver_q_err,
+                workspace->t[0], FALSE ) + 1;
+        break;
+
+    default:
+        fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+
+    data->timing.cm_solver_iters += iters;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "linsolve-" );
+#endif
+
+    Calculate_Charges_QEq( system, workspace );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n", data->step,
+       workspace->s[0][0], workspace->t[0][0],
+       workspace->s[0][1], workspace->t[0][1],
+       workspace->s[0][2], workspace->t[0][2] );
+    if( data->step == control->nsteps )
+    {
+        Print_Charges( system, control, workspace, data->step );
+    }
+#endif
+}
+
+
+/* Main driver method for EE kernel
+ *
+ * Rough outline:
+ *  1) init / setup routines for preconditioning of linear solver
+ *  2) compute preconditioner
+ *  3) extrapolate charges
+ *  4) perform 1 linear solve
+ *  5) compute atomic charges based on output of (4)
+ *
+ * Steps (1)-(2) run only on refactoring steps (cm_solver_pre_comp_refactor > 0 and dividing
+ * the steps since data->prev_steps); the solve receives the same flag. Iterations are added
+ * to data->timing.cm_solver_iters. out_control is unused. Exits on unknown cm_solver_type.
+ */
+static void EE( reax_system * const system, control_params * const control,
+        simulation_data * const data, static_storage * const workspace,
+        const list * const far_nbrs, const output_controls * const out_control )
+{
+    int iters, refactor;
+
+    /* evaluated once so the preconditioner computation and the solver call cannot disagree */
+    refactor = (control->cm_solver_pre_comp_refactor > 0 &&
+            ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0)) ? TRUE : FALSE;
+
+    if ( refactor == TRUE )
+    {
+        Setup_Preconditioner_EE( system, control, data, workspace, far_nbrs );
+        Compute_Preconditioner_EE( system, control, data, workspace, far_nbrs );
+    }
+
+    Extrapolate_Charges_EE( system, control, data, workspace );
+
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+        iters = GMRES( workspace, control, data, workspace->H,
+                workspace->b_s, control->cm_solver_q_err, workspace->s[0], refactor );
+        break;
+
+    case GMRES_H_S:
+        iters = GMRES_HouseHolder( workspace, control, data, workspace->H,
+                workspace->b_s, control->cm_solver_q_err, workspace->s[0], refactor );
+        break;
+
+    case CG_S:
+        iters = CG( workspace, control, workspace->H, workspace->b_s, control->cm_solver_q_err,
+                workspace->s[0], refactor ) + 1;
+        break;
+
+    case SDM_S:
+        iters = SDM( workspace, control, workspace->H, workspace->b_s, control->cm_solver_q_err,
+                workspace->s[0], refactor ) + 1;
+        break;
+
+    default:
+        fprintf( stderr, "Unrecognized EE solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+
+    data->timing.cm_solver_iters += iters;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "linsolve-" );
+#endif
+
+    Calculate_Charges_EE( system, workspace );
+
+    // if( data->step == control->nsteps )
+    //Print_Charges( system, control, workspace, data->step );
+}
+
+
+/* Main driver method for ACKS2 kernel
+ *
+ * Rough outline:
+ *  1) init / setup routines for preconditioning of linear solver
+ *  2) compute preconditioner
+ *  3) extrapolate charges
+ *  4) perform 1 linear solve
+ *  5) compute atomic charges based on output of (4)
+ *
+ * Steps (1)-(2) run only on refactoring steps (cm_solver_pre_comp_refactor > 0 and dividing
+ * the steps since data->prev_steps); the solve receives the same flag. Steps (3) and (5) reuse
+ * the EE routines. Iterations are added to data->timing.cm_solver_iters. out_control is unused.
+ */
+static void ACKS2( reax_system * const system, control_params * const control,
+        simulation_data * const data, static_storage * const workspace,
+        const list * const far_nbrs, const output_controls * const out_control )
+{
+    int iters, refactor;
+
+    /* evaluated once so the preconditioner computation and the solver call cannot disagree */
+    refactor = (control->cm_solver_pre_comp_refactor > 0 &&
+            ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0)) ? TRUE : FALSE;
+
+    if ( refactor == TRUE )
+    {
+        Setup_Preconditioner_ACKS2( system, control, data, workspace, far_nbrs );
+        Compute_Preconditioner_ACKS2( system, control, data, workspace, far_nbrs );
+    }
+
+//   Print_Linear_System( system, control, workspace, data->step );
+
+    Extrapolate_Charges_EE( system, control, data, workspace );
+
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+        iters = GMRES( workspace, control, data, workspace->H,
+                workspace->b_s, control->cm_solver_q_err, workspace->s[0], refactor );
+        break;
+
+    case GMRES_H_S:
+        iters = GMRES_HouseHolder( workspace, control, data, workspace->H,
+                workspace->b_s, control->cm_solver_q_err, workspace->s[0], refactor );
+        break;
+
+    case CG_S:
+        iters = CG( workspace, control, workspace->H, workspace->b_s, control->cm_solver_q_err,
+                workspace->s[0], refactor ) + 1;
+        break;
+
+    case SDM_S:
+        iters = SDM( workspace, control, workspace->H, workspace->b_s, control->cm_solver_q_err,
+                workspace->s[0], refactor ) + 1;
+        break;
+
+    default:
+        fprintf( stderr, "Unrecognized ACKS2 solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+
+    data->timing.cm_solver_iters += iters;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "linsolve-" );
+#endif
+
+    Calculate_Charges_EE( system, workspace );
+}
+
+
+/* Compute atomic charges with the charge method selected by control->charge_method (QEq, EE, or ACKS2) */
+void Compute_Charges( reax_system * const system, control_params * const control, simulation_data * const data,
+        static_storage * const workspace, const list * const far_nbrs, const output_controls * const out_control )
+{
+#if defined(DEBUG_FOCUS)
+    char fname[200];
+    FILE * fp;
+    /* debug dump of s / t as of entry (step >= 100); names are bounded by snprintf, fopen() is still unchecked */
+    if ( data->step >= 100 )
+    {
+        snprintf( fname, sizeof(fname), "s_%d_%s.out", data->step, control->sim_name );
+        fp = fopen( fname, "w" );
+        Vector_Print( fp, NULL, workspace->s[0], system->N_cm );
+        fclose( fp );
+
+        snprintf( fname, sizeof(fname), "t_%d_%s.out", data->step, control->sim_name );
+        fp = fopen( fname, "w" );
+        Vector_Print( fp, NULL, workspace->t[0], system->N_cm );
+        fclose( fp );
+    }
+#endif
+
+    switch ( control->charge_method )
+    {
+    case QEQ_CM:
+        QEq( system, control, data, workspace, far_nbrs, out_control );
+        break;
+
+    case EE_CM:
+        EE( system, control, data, workspace, far_nbrs, out_control );
+        break;
+
+    case ACKS2_CM:
+        ACKS2( system, control, data, workspace, far_nbrs, out_control );
+        break;
+
+    default:
+        fprintf( stderr, "Invalid charge method. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+
+#if defined(DEBUG_FOCUS)
+    if ( data->step >= 100 )
+    {
+        snprintf( fname, sizeof(fname), "H_%d_%s.out", data->step, control->sim_name );
+        Print_Sparse_Matrix2( workspace->H, fname, NULL );
+//        Print_Sparse_Matrix_Binary( workspace->H, fname );
+
+        snprintf( fname, sizeof(fname), "b_s_%d_%s.out", data->step, control->sim_name );
+        fp = fopen( fname, "w" );
+        Vector_Print( fp, NULL, workspace->b_s, system->N_cm );
+        fclose( fp );
+
+        snprintf( fname, sizeof(fname), "b_t_%d_%s.out", data->step, control->sim_name );
+        fp = fopen( fname, "w" );
+        Vector_Print( fp, NULL, workspace->b_t, system->N_cm );
+        fclose( fp );
+    }
+#endif
+}
diff --git a/sPuReMD/src/QEq.h b/sPuReMD/src/charges.h
similarity index 88%
rename from sPuReMD/src/QEq.h
rename to sPuReMD/src/charges.h
index c7186aaee64b35ab67a6fe590de525d99db08c32..50f563c3e94fc5e8a5c324d2d3467572c6ea930d 100644
--- a/sPuReMD/src/QEq.h
+++ b/sPuReMD/src/charges.h
@@ -19,12 +19,12 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __QEq_H_
-#define __QEq_H_
+#ifndef __CHARGES_H_
+#define __CHARGES_H_
 
 #include "mytypes.h"
 
-void QEq( reax_system* const, control_params* const, simulation_data* const,
+void Compute_Charges( reax_system* const, control_params* const, simulation_data* const,
           static_storage* const, const list* const,
           const output_controls* const );
 
diff --git a/sPuReMD/src/control.c b/sPuReMD/src/control.c
index 41f744969f1615ba621d85db852d998e92719b86..ad2d3d21a159fe0bf739a7910b24a795a3dfd866 100644
--- a/sPuReMD/src/control.c
+++ b/sPuReMD/src/control.c
@@ -66,20 +66,24 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
     control->max_far_nbrs = 1000;
     control->bo_cut = 0.01;
     control->thb_cut = 0.001;
-    control->hb_cut = 7.50;
+    control->hb_cut = 0.0;
 
     control->tabulate = 0;
 
-    control->qeq_solver_type = GMRES_S;
-    control->qeq_solver_q_err = 0.000001;
-    control->qeq_domain_sparsify_enabled = FALSE;
-    control->qeq_domain_sparsity = 1.0;
-    control->pre_comp_type = ICHOLT_PC;
-    control->pre_comp_sweeps = 3;
-    control->pre_comp_refactor = 100;
-    control->pre_comp_droptol = 0.01;
-    control->pre_app_type = TRI_SOLVE_PA;
-    control->pre_app_jacobi_iters = 50;
+    control->charge_method = QEQ_CM;
+    control->cm_q_net = 0.0;
+    control->cm_solver_type = GMRES_S;
+    control->cm_solver_max_iters = 100;
+    control->cm_solver_restart = 50;
+    control->cm_solver_q_err = 0.000001;
+    control->cm_domain_sparsify_enabled = FALSE;
+    control->cm_domain_sparsity = 1.0;
+    control->cm_solver_pre_comp_type = ICHOLT_PC;
+    control->cm_solver_pre_comp_sweeps = 3;
+    control->cm_solver_pre_comp_refactor = 100;
+    control->cm_solver_pre_comp_droptol = 0.01;
+    control->cm_solver_pre_app_type = TRI_SOLVE_PA;
+    control->cm_solver_pre_app_jacobi_iters = 50;
 
     control->T_init = 0.;
     control->T_final = 300.;
@@ -91,9 +95,9 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
     control->P[0] = 0.000101325;
     control->P[1] = 0.000101325;
     control->P[2] = 0.000101325;
-    control->Tau_P[0]  = 500.0;
-    control->Tau_P[1]  = 500.0;
-    control->Tau_P[2]  = 500.0;
+    control->Tau_P[0] = 500.0;
+    control->Tau_P[1] = 500.0;
+    control->Tau_P[2] = 500.0;
     control->Tau_PT = 500.0;
     control->compressibility = 1.0;
     control->press_mode = 0;
@@ -101,9 +105,9 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
     control->remove_CoM_vel = 25;
 
     out_control->debug_level = 0;
-    out_control->energy_update_freq = 10;
+    out_control->energy_update_freq = 0;
 
-    out_control->write_steps = 100;
+    out_control->write_steps = 0;
     out_control->traj_compress = 0;
     out_control->write = fprintf;
     out_control->traj_format = 0;
@@ -135,8 +139,10 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
     /* memory allocations */
     s = (char*) malloc(sizeof(char) * MAX_LINE);
     tmp = (char**) malloc(sizeof(char*)*MAX_TOKENS);
-    for (i = 0; i < MAX_TOKENS; i++)
+    for ( i = 0; i < MAX_TOKENS; i++ )
+    {
         tmp[i] = (char*) malloc(sizeof(char) * MAX_LINE);
+    }
 
     /* read control parameters file */
     while (fgets(s, MAX_LINE, fp))
@@ -240,51 +246,74 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
             val = atof( tmp[1] );
             control->hb_cut = val;
         }
-        else if ( strcmp(tmp[0], "qeq_solver_type") == 0 )
+        else if ( strcmp(tmp[0], "charge_method") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->charge_method = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_q_net") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->cm_q_net = val;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_type") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_type = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_max_iters") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_max_iters = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_restart") == 0 )
         {
             ival = atoi( tmp[1] );
-            control->qeq_solver_type = ival;
+            control->cm_solver_restart = ival;
         }
-        else if ( strcmp(tmp[0], "qeq_solver_q_err") == 0 )
+        else if ( strcmp(tmp[0], "cm_solver_q_err") == 0 )
         {
             val = atof( tmp[1] );
-            control->qeq_solver_q_err = val;
+            control->cm_solver_q_err = val;
         }
-        else if ( strcmp(tmp[0], "qeq_domain_sparsity") == 0 )
+        else if ( strcmp(tmp[0], "cm_domain_sparsity") == 0 )
         {
             val = atof( tmp[1] );
-            control->qeq_domain_sparsity = val;
-            control->qeq_domain_sparsify_enabled = TRUE;
+            control->cm_domain_sparsity = val;
+            if ( val < 1.0 )
+            {
+                control->cm_domain_sparsify_enabled = TRUE;
+            }
         }
-        else if ( strcmp(tmp[0], "pre_comp_type") == 0 )
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_type") == 0 )
         {
             ival = atoi( tmp[1] );
-            control->pre_comp_type = ival;
+            control->cm_solver_pre_comp_type = ival;
         }
-        else if ( strcmp(tmp[0], "pre_comp_refactor") == 0 )
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_refactor") == 0 )
         {
             ival = atoi( tmp[1] );
-            control->pre_comp_refactor = ival;
+            control->cm_solver_pre_comp_refactor = ival;
         }
-        else if ( strcmp(tmp[0], "pre_comp_droptol") == 0 )
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_droptol") == 0 )
         {
             val = atof( tmp[1] );
-            control->pre_comp_droptol = val;
+            control->cm_solver_pre_comp_droptol = val;
         }
-        else if ( strcmp(tmp[0], "pre_comp_sweeps") == 0 )
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_sweeps") == 0 )
         {
             ival = atoi( tmp[1] );
-            control->pre_comp_sweeps = ival;
+            control->cm_solver_pre_comp_sweeps = ival;
         }
-        else if ( strcmp(tmp[0], "pre_app_type") == 0 )
+        else if ( strcmp(tmp[0], "cm_solver_pre_app_type") == 0 )
         {
             ival = atoi( tmp[1] );
-            control->pre_app_type = ival;
+            control->cm_solver_pre_app_type = ival;
         }
-        else if ( strcmp(tmp[0], "pre_app_jacobi_iters") == 0 )
+        else if ( strcmp(tmp[0], "cm_solver_pre_app_jacobi_iters") == 0 )
         {
             ival = atoi( tmp[1] );
-            control->pre_app_jacobi_iters = ival;
+            control->cm_solver_pre_app_jacobi_iters = ival;
         }
         else if ( strcmp(tmp[0], "temp_init") == 0 )
         {
@@ -292,7 +321,9 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
             control->T_init = val;
 
             if ( control->T_init < 0.001 )
+            {
                 control->T_init = 0.001;
+            }
         }
         else if ( strcmp(tmp[0], "temp_final") == 0 )
         {
@@ -300,12 +331,15 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
             control->T_final = val;
 
             if ( control->T_final < 0.1 )
+            {
                 control->T_final = 0.1;
+            }
         }
         else if ( strcmp(tmp[0], "t_mass") == 0 )
         {
             val = atof(tmp[1]);
-            control->Tau_T = val * 1.e-3;    // convert t_mass from fs to ps
+            /* convert t_mass from fs to ps */
+            control->Tau_T = val * 1.e-3;
         }
         else if ( strcmp(tmp[0], "t_mode") == 0 )
         {
@@ -522,15 +556,19 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
 
     /* determine target T */
     if ( control->T_mode == 0 )
+    {
         control->T = control->T_final;
-    else control->T = control->T_init;
-
+    }
+    else
+    {
+        control->T = control->T_init;
+    }
 
     /* near neighbor and far neighbor cutoffs */
     control->bo_cut = 0.01 * system->reaxprm.gp.l[29];
     control->r_low  = system->reaxprm.gp.l[11];
     control->r_cut  = system->reaxprm.gp.l[12];
-    control->r_sp_cut  = control->r_cut * control->qeq_domain_sparsity;
+    control->r_sp_cut  = control->r_cut * control->cm_domain_sparsity;
     control->vlist_cut += control->r_cut;
 
     system->g.cell_size = control->vlist_cut / 2.;
diff --git a/sPuReMD/src/ffield.c b/sPuReMD/src/ffield.c
index 088e1acbe2aa7588cb6e40b19626038d4257e56e..a39aacf1588d6a53e0587cb1d616a1282e98b472 100644
--- a/sPuReMD/src/ffield.c
+++ b/sPuReMD/src/ffield.c
@@ -33,11 +33,12 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
     int c, i, j, k, l, m, n, o, p, cnt;
     real val;
 
-    s = (char*) malloc(sizeof(char) * MAX_LINE);
-    tmp = (char**) malloc(sizeof(char*)*MAX_TOKENS);
+    s = (char*) malloc( sizeof(char) * MAX_LINE );
+    tmp = (char**) malloc( sizeof(char*) * MAX_TOKENS );
     for (i = 0; i < MAX_TOKENS; i++)
-        tmp[i] = (char*) malloc(sizeof(char) * MAX_TOKEN_LEN);
-
+    {
+        tmp[i] = (char*) malloc( sizeof(char) * MAX_TOKEN_LEN );
+    }
 
     /* reading first header comment */
     fgets( s, MAX_LINE, fp );
@@ -56,7 +57,7 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
     }
 
     reax->gp.n_global = n;
-    reax->gp.l = (real*) malloc(sizeof(real) * n);
+    reax->gp.l = (real*) malloc( sizeof(real) * n );
 
     /* see mytypes.h for mapping between l[i] and the lambdas used in ff */
     for (i = 0; i < n; i++)
@@ -206,6 +207,7 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         val = atof(tmp[5]);
         reax->sbp[i].b_o_133    = val;
         val = atof(tmp[6]);
+        reax->sbp[i].b_s_acks2  = val;
         val = atof(tmp[7]);
 
         /* line 4  */
@@ -396,55 +398,55 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
                                    (reax->sbp[j].r_pi_pi + reax->sbp[i].r_pi_pi);
 
             reax->tbp[i][j].p_boc3 =
-                sqrt(reax->sbp[i].b_o_132 *
+                SQRT(reax->sbp[i].b_o_132 *
                      reax->sbp[j].b_o_132);
 
             reax->tbp[j][i].p_boc3 =
-                sqrt(reax->sbp[j].b_o_132 *
+                SQRT(reax->sbp[j].b_o_132 *
                      reax->sbp[i].b_o_132);
 
             reax->tbp[i][j].p_boc4 =
-                sqrt(reax->sbp[i].b_o_131 *
+                SQRT(reax->sbp[i].b_o_131 *
                      reax->sbp[j].b_o_131);
             reax->tbp[j][i].p_boc4 =
-                sqrt(reax->sbp[j].b_o_131 *
+                SQRT(reax->sbp[j].b_o_131 *
                      reax->sbp[i].b_o_131);
 
             reax->tbp[i][j].p_boc5 =
-                sqrt(reax->sbp[i].b_o_133 *
+                SQRT(reax->sbp[i].b_o_133 *
                      reax->sbp[j].b_o_133);
             reax->tbp[j][i].p_boc5 =
-                sqrt(reax->sbp[j].b_o_133 *
+                SQRT(reax->sbp[j].b_o_133 *
                      reax->sbp[i].b_o_133);
 
             reax->tbp[i][j].D =
-                sqrt(reax->sbp[i].epsilon *
+                SQRT(reax->sbp[i].epsilon *
                      reax->sbp[j].epsilon);
 
             reax->tbp[j][i].D =
-                sqrt(reax->sbp[j].epsilon *
+                SQRT(reax->sbp[j].epsilon *
                      reax->sbp[i].epsilon);
 
             reax->tbp[i][j].alpha =
-                sqrt(reax->sbp[i].alpha *
+                SQRT(reax->sbp[i].alpha *
                      reax->sbp[j].alpha);
 
             reax->tbp[j][i].alpha =
-                sqrt(reax->sbp[j].alpha *
+                SQRT(reax->sbp[j].alpha *
                      reax->sbp[i].alpha);
 
             reax->tbp[i][j].r_vdW =
-                2.0 * sqrt(reax->sbp[i].r_vdw * reax->sbp[j].r_vdw);
+                2.0 * SQRT(reax->sbp[i].r_vdw * reax->sbp[j].r_vdw);
 
             reax->tbp[j][i].r_vdW =
-                2.0 * sqrt(reax->sbp[j].r_vdw * reax->sbp[i].r_vdw);
+                2.0 * SQRT(reax->sbp[j].r_vdw * reax->sbp[i].r_vdw);
 
             reax->tbp[i][j].gamma_w =
-                sqrt(reax->sbp[i].gamma_w *
+                SQRT(reax->sbp[i].gamma_w *
                      reax->sbp[j].gamma_w);
 
             reax->tbp[j][i].gamma_w =
-                sqrt(reax->sbp[j].gamma_w *
+                SQRT(reax->sbp[j].gamma_w *
                      reax->sbp[i].gamma_w);
 
             reax->tbp[i][j].gamma =
@@ -716,7 +718,9 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
 
     /* deallocate helper storage */
     for ( i = 0; i < MAX_TOKENS; i++ )
+    {
         free( tmp[i] );
+    }
     free( tmp );
     free( s );
 
@@ -726,7 +730,9 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         for ( j = 0; j < reax->num_atom_types; j++ )
         {
             for ( k = 0; k < reax->num_atom_types; k++ )
+            {
                 free( tor_flag[i][j][k] );
+            }
 
             free( tor_flag[i][j] );
         }
@@ -734,6 +740,8 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         free( tor_flag[i] );
     }
 
+    free( tor_flag );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "force field read\n" );
 #endif
diff --git a/sPuReMD/src/forces.c b/sPuReMD/src/forces.c
index 9108a8dd4026e001716f707302e40d671d48b838..b5c1a8304ed6c0f907b47b12d1f421fce6320284 100644
--- a/sPuReMD/src/forces.c
+++ b/sPuReMD/src/forces.c
@@ -20,6 +20,7 @@
   ----------------------------------------------------------------------*/
 
 #include "forces.h"
+
 #include "box.h"
 #include "bond_orders.h"
 #include "single_body_interactions.h"
@@ -29,13 +30,24 @@
 #include "list.h"
 #include "print_utils.h"
 #include "system_props.h"
-#include "QEq.h"
+#include "charges.h"
 #include "vector.h"
 
 
+/* File scope variables */
+static interaction_function Interaction_Functions[NO_OF_INTERACTIONS];
+
+/* Position of a matrix entry relative to the main diagonal (NOTE(review): users are outside this hunk) */
+typedef enum
+{
+    DIAGONAL = 0, /* entry on the diagonal */
+    OFF_DIAGONAL = 1, /* entry off the diagonal */
+} MATRIX_ENTRY_POSITION;
+
+
 void Dummy_Interaction( reax_system *system, control_params *control,
-                        simulation_data *data, static_storage *workspace,
-                        list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
 }
 
@@ -48,9 +60,14 @@ void Init_Bonded_Force_Functions( control_params *control )
     //*/Dummy_Interaction;
     Interaction_Functions[3] = Three_Body_Interactions; //*/Dummy_Interaction;
     Interaction_Functions[4] = Four_Body_Interactions;  //*/Dummy_Interaction;
-    if ( control->hb_cut > 0 )
+    if ( control->hb_cut > 0.0 )
+    {
         Interaction_Functions[5] = Hydrogen_Bonds; //*/Dummy_Interaction;
-    else Interaction_Functions[5] = Dummy_Interaction;
+    }
+    else
+    {
+        Interaction_Functions[5] = Dummy_Interaction;
+    }
     Interaction_Functions[6] = Dummy_Interaction; //empty
     Interaction_Functions[7] = Dummy_Interaction; //empty
     Interaction_Functions[8] = Dummy_Interaction; //empty
@@ -59,12 +76,12 @@ void Init_Bonded_Force_Functions( control_params *control )
 
 
 void Compute_Bonded_Forces( reax_system *system, control_params *control,
-                            simulation_data *data, static_storage *workspace,
-                            list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
 
     int i;
-    // real t_start, t_end, t_elapsed;
+    //real t_start, t_end, t_elapsed;
 
 #ifdef TEST_ENERGY
     /* Mark beginning of a new timestep in each energy file */
@@ -99,11 +116,13 @@ void Compute_Bonded_Forces( reax_system *system, control_params *control,
     /* Implement all the function calls as function pointers */
     for ( i = 0; i < NO_OF_INTERACTIONS; i++ )
     {
-        (Interaction_Functions[i])(system, control, data, workspace,
-                                   lists, out_control);
+        (Interaction_Functions[i])( system, control, data, workspace,
+                lists, out_control );
+
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "f%d-", i );
 #endif
+
 #ifdef TEST_FORCES
         (Print_Interactions[i])(system, control, data, workspace,
                                 lists, out_control);
@@ -113,10 +132,11 @@ void Compute_Bonded_Forces( reax_system *system, control_params *control,
 
 
 void Compute_NonBonded_Forces( reax_system *system, control_params *control,
-                               simulation_data *data, static_storage *workspace,
-                               list** lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list** lists, output_controls *out_control )
 {
     real t_start, t_elapsed;
+
 #ifdef TEST_ENERGY
     fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n",
              data->step, "atom1", "atom2", "r12", "evdw", "total" );
@@ -125,9 +145,10 @@ void Compute_NonBonded_Forces( reax_system *system, control_params *control,
 #endif
 
     t_start = Get_Time( );
-    QEq( system, control, data, workspace, lists[FAR_NBRS], out_control );
+    Compute_Charges( system, control, data, workspace, lists[FAR_NBRS], out_control );
     t_elapsed = Get_Timing_Info( t_start );
-    data->timing.QEq += t_elapsed;
+    data->timing.cm += t_elapsed;
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "qeq - " );
 #endif
@@ -139,7 +160,7 @@ void Compute_NonBonded_Forces( reax_system *system, control_params *control,
     else
     {
         Tabulated_vdW_Coulomb_Energy( system, control, data, workspace,
-                                      lists, out_control );
+                lists, out_control );
     }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "nonb forces - " );
@@ -147,7 +168,7 @@ void Compute_NonBonded_Forces( reax_system *system, control_params *control,
 
 #ifdef TEST_FORCES
     Print_vdW_Coulomb_Forces( system, control, data, workspace,
-                              lists, out_control );
+            lists, out_control );
 #endif
 }
 
@@ -155,26 +176,61 @@ void Compute_NonBonded_Forces( reax_system *system, control_params *control,
 /* This version of Compute_Total_Force computes forces from coefficients
    accumulated by all interaction functions. Saves enormous time & space! */
 void Compute_Total_Force( reax_system *system, control_params *control,
-                          simulation_data *data, static_storage *workspace,
-                          list **lists )
+        simulation_data *data, static_storage *workspace, list **lists )
 {
-    int i, pj;
-    list *bonds = (*lists) + BONDS;
+    int i;
+    list *bonds;
 
-    for ( i = 0; i < system->N; ++i )
-        for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
-            if ( i < bonds->select.bond_list[pj].nbr )
+    bonds = (*lists) + BONDS;
+    /* when built with OpenMP the braces below form one parallel region; otherwise they are a plain serial block */
+#ifdef _OPENMP
+    #pragma omp parallel default(shared)
+#endif
+    {
+        int pj;
+#ifdef _OPENMP
+        int j;
+#endif
+        /* pass 1: handle a bond entry only when i < its neighbor index, using the routine that matches the ensemble */
+#ifdef _OPENMP
+        #pragma omp for schedule(static)
+#endif
+        for ( i = 0; i < system->N; ++i )
+        {
+            for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
             {
-                if ( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT)
-                    Add_dBond_to_Forces( i, pj, system, data, workspace, lists );
-                else
-                    Add_dBond_to_Forces_NPT( i, pj, system, data, workspace, lists );
+                if ( i < bonds->select.bond_list[pj].nbr )
+                {
+                    if ( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT)
+                    {
+                        Add_dBond_to_Forces( i, pj, system, data, workspace, lists );
+                    }
+                    else /* every other ensemble takes the NPT variant */
+                    {
+                        Add_dBond_to_Forces_NPT( i, pj, system, data, workspace, lists );
+                    }
+                }
+            }
+        }
+        /* pass 2 (OpenMP only): for each atom i, pass every thread's f_local slot together with system->atoms[i].f to rvec_Add */
+#ifdef _OPENMP
+        #pragma omp barrier
+        /* f_local is addressed as [thread j][atom i], flattened to j * system->N + i. NOTE(review): presumably pass 1 filled f_local -- confirm */
+        #pragma omp for schedule(static)
+        for ( i = 0; i < system->N; ++i )
+        {
+            for ( j = 0; j < control->num_threads; ++j )
+            {
+                rvec_Add( system->atoms[i].f, workspace->f_local[j * system->N + i] );
             }
+        }
+#endif
+    }
 }
 
 
 void Validate_Lists( static_storage *workspace, list **lists, int step, int n,
-                     int Hmax, int Htop, int num_bonds, int num_hbonds )
+        int Hmax, int Htop, int num_bonds, int num_hbonds )
 {
     int i, flag;
     list *bonds, *hbonds;
@@ -199,12 +255,16 @@ void Validate_Lists( static_storage *workspace, list **lists, int step, int n,
     flag = -1;
     workspace->realloc.num_bonds = num_bonds;
     for ( i = 0; i < n - 1; ++i )
+    {
         if ( End_Index(i, bonds) >= Start_Index(i + 1, bonds) - 2 )
         {
             workspace->realloc.bonds = 1;
             if ( End_Index(i, bonds) > Start_Index(i + 1, bonds) )
+            {
                 flag = i;
+            }
         }
+    }
 
     if ( flag > -1 )
     {
@@ -232,13 +292,17 @@ void Validate_Lists( static_storage *workspace, list **lists, int step, int n,
         flag = -1;
         workspace->realloc.num_hbonds = num_hbonds;
         for ( i = 0; i < workspace->num_H - 1; ++i )
+        {
             if ( Num_Entries(i, hbonds) >=
                     (Start_Index(i + 1, hbonds) - Start_Index(i, hbonds)) * DANGER_ZONE )
             {
                 workspace->realloc.hbonds = 1;
                 if ( End_Index(i, hbonds) > Start_Index(i + 1, hbonds) )
+                {
                     flag = i;
+                }
             }
+        }
 
         if ( flag > -1 )
         {
@@ -263,9 +327,379 @@ void Validate_Lists( static_storage *workspace, list **lists, int step, int n,
 }
 
 
+static inline real Init_Charge_Matrix_Entry_Tab( reax_system *system,
+        control_params *control, int i, int j,
+        real r_ij, MATRIX_ENTRY_POSITION pos )
+{
+    int r;
+    real base, dif, val, ret = 0.0;
+    LR_lookup_table *t;
+    /* Lookup-table variant of the charge-matrix entry for atoms i and j at distance r_ij; returns 0.0 unless a case below assigns ret. */
+    switch ( control->charge_method )
+    {
+    case QEQ_CM:
+        switch ( pos )
+        {
+            case OFF_DIAGONAL:
+                t = &( LR
+                        [MIN( system->atoms[i].type, system->atoms[j].type )]
+                        [MAX( system->atoms[i].type, system->atoms[j].type )] );
+                /* LR is indexed with the smaller atom type first */
+                /* cubic spline interpolation: pick table interval r, then evaluate the cubic in dif = r_ij - base by Horner's rule */
+                r = (int)(r_ij * t->inv_dx);
+                if ( r == 0 )  ++r; /* interval 0 is never used */
+                base = (real)(r + 1) * t->dx;
+                dif = r_ij - base;
+                val = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) * dif +
+                      t->ele[r].a;
+                val *= EV_to_KCALpMOL / C_ele;
+
+                ret = ((i == j) ? 0.5 : 1.0) * val; /* a self pair (i == j) gets half weight */
+            break;
+            case DIAGONAL:
+                ret = system->reaxprm.sbp[system->atoms[i].type].eta; /* r_ij and j are not read here */
+            break;
+            default:
+                fprintf( stderr, "[Init_forces] Invalid matrix position. Terminating...\n" );
+                exit( INVALID_INPUT );
+            break;
+        }
+        break;
+
+    case EE_CM:
+        //TODO: tabulated EE entries are not implemented; both valid positions leave ret at 0.0
+        switch ( pos )
+        {
+            case OFF_DIAGONAL:
+            break;
+            case DIAGONAL:
+            break;
+            default:
+                fprintf( stderr, "[Init_forces] Invalid matrix position. Terminating...\n" );
+                exit( INVALID_INPUT );
+            break;
+        }
+        break;
+
+    case ACKS2_CM:
+        //TODO: tabulated ACKS2 entries are not implemented; both valid positions leave ret at 0.0
+        switch ( pos )
+        {
+            case OFF_DIAGONAL:
+            break;
+            case DIAGONAL:
+            break;
+            default:
+                fprintf( stderr, "[Init_forces] Invalid matrix position. Terminating...\n" );
+                exit( INVALID_INPUT );
+            break;
+        }
+        break;
+
+    default: /* unknown charge method: report and exit */
+        fprintf( stderr, "Invalid charge method. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+
+    return ret;
+}
+
+/* Returns the charge-matrix value for atoms i and j under control->charge_method: sbp eta on the diagonal, a Tap-weighted shielded term of r_ij off it. */
+static inline real Init_Charge_Matrix_Entry( reax_system *system,
+        control_params *control, int i, int j,
+        real r_ij, MATRIX_ENTRY_POSITION pos )
+{
+    real Tap, gamij, dr3gamij_1, dr3gamij_3, ret;
+
+    ret = 0.0;
+    /* ret keeps this 0.0 whenever a branch below decides not to assign it */
+    switch ( control->charge_method )
+    {
+    case QEQ_CM:
+        switch ( pos )
+        {
+            case OFF_DIAGONAL: /* Horner evaluation of the degree-7 polynomial with coefficients control->Tap7 .. Tap0 */
+                Tap = control->Tap7 * r_ij + control->Tap6;
+                Tap = Tap * r_ij + control->Tap5;
+                Tap = Tap * r_ij + control->Tap4;
+                Tap = Tap * r_ij + control->Tap3;
+                Tap = Tap * r_ij + control->Tap2;
+                Tap = Tap * r_ij + control->Tap1;
+                Tap = Tap * r_ij + control->Tap0;
+
+                /* shielding: the cube root of (r_ij^3 + pairwise tbp gamma) is used as the divisor */
+                dr3gamij_1 = ( r_ij * r_ij * r_ij +
+                        system->reaxprm.tbp[system->atoms[i].type][system->atoms[j].type].gamma );
+                dr3gamij_3 = POW( dr3gamij_1 , 1.0 / 3.0 );
+
+                ret = ((i == j) ? 0.5 : 1.0) * Tap * EV_to_KCALpMOL / dr3gamij_3; /* a self pair (i == j) gets half weight */
+            break;
+
+            case DIAGONAL:
+                ret = system->reaxprm.sbp[system->atoms[i].type].eta; /* r_ij and j are not read here */
+            break;
+
+            default:
+                fprintf( stderr, "[Init_forces] Invalid matrix position. Terminating...\n" );
+                exit( INVALID_INPUT );
+            break;
+        }
+        break;
+
+    case EE_CM:
+        switch ( pos )
+        {
+            case OFF_DIAGONAL: /* outside 0.001 < r_ij < control->r_cut the entry stays 0.0 */
+                if ( r_ij < control->r_cut && r_ij > 0.001 )
+                {
+                    Tap = control->Tap7 * r_ij + control->Tap6;
+                    Tap = Tap * r_ij + control->Tap5;
+                    Tap = Tap * r_ij + control->Tap4;
+                    Tap = Tap * r_ij + control->Tap3;
+                    Tap = Tap * r_ij + control->Tap2;
+                    Tap = Tap * r_ij + control->Tap1;
+                    Tap = Tap * r_ij + control->Tap0;
+
+                    gamij = SQRT( system->reaxprm.sbp[system->atoms[i].type].gamma
+                            * system->reaxprm.sbp[system->atoms[j].type].gamma );
+                    /* shielding: gamij is the geometric mean of the two per-atom gammas; unlike QEQ_CM there is no (i == j) half weight */
+                    dr3gamij_1 = POW( r_ij, 3.0 ) + 1.0 / POW( gamij, 3.0 );
+                    dr3gamij_3 = POW( dr3gamij_1 , 1.0 / 3.0 );
+
+                    ret = Tap * EV_to_KCALpMOL / dr3gamij_3;
+                }
+            break;
+
+            case DIAGONAL:
+                ret = system->reaxprm.sbp[system->atoms[i].type].eta; /* r_ij and j are not read here */
+            break;
+
+            default:
+                fprintf( stderr, "[Init_forces] Invalid matrix position. Terminating...\n" );
+                exit( INVALID_INPUT );
+            break;
+        }
+        break;
+    /* the ACKS2 entries below are computed by the same statements as the EE_CM case above */
+    case ACKS2_CM:
+        switch ( pos )
+        {
+            case OFF_DIAGONAL: /* outside 0.001 < r_ij < control->r_cut the entry stays 0.0 */
+                if ( r_ij < control->r_cut && r_ij > 0.001 )
+                {
+                    Tap = control->Tap7 * r_ij + control->Tap6;
+                    Tap = Tap * r_ij + control->Tap5;
+                    Tap = Tap * r_ij + control->Tap4;
+                    Tap = Tap * r_ij + control->Tap3;
+                    Tap = Tap * r_ij + control->Tap2;
+                    Tap = Tap * r_ij + control->Tap1;
+                    Tap = Tap * r_ij + control->Tap0;
+
+                    gamij = SQRT( system->reaxprm.sbp[system->atoms[i].type].gamma
+                            * system->reaxprm.sbp[system->atoms[j].type].gamma );
+                    /* shielding: gamij is the geometric mean of the two per-atom gammas */
+                    dr3gamij_1 = POW( r_ij, 3.0 ) + 1.0 / POW( gamij, 3.0 );
+                    dr3gamij_3 = POW( dr3gamij_1 , 1.0 / 3.0 );
+
+                    ret = Tap * EV_to_KCALpMOL / dr3gamij_3;
+                }
+            break;
+
+            case DIAGONAL:
+                ret = system->reaxprm.sbp[system->atoms[i].type].eta; /* r_ij and j are not read here */
+            break;
+
+            default:
+                fprintf( stderr, "[Init_forces] Invalid matrix position. Terminating...\n" );
+                exit( INVALID_INPUT );
+            break;
+        }
+        break;
+
+    default: /* unknown charge method: report and exit */
+        fprintf( stderr, "Invalid charge method. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+
+    return ret;
+}
+
+/* Appends the charge-matrix rows that follow the per-atom rows (none for QEq, extra rows for EE and ACKS2) to H and H_sp, advancing *Htop and *H_sp_top per stored entry. */
+static void Init_Charge_Matrix_Remaining_Entries( reax_system *system,
+        control_params *control, list *far_nbrs,
+        sparse_matrix * H, sparse_matrix * H_sp,
+        int * Htop, int * H_sp_top )
+{
+    int i, j, pj;
+    real d, xcut, bond_softness, * X_diag;
+    /* every entry is written identically into H and H_sp; each matrix keeps its own running entry counter */
+    switch ( control->charge_method )
+    {
+        case QEQ_CM: /* nothing to append */
+            break;
+
+        case EE_CM: /* one extra row, N_cm - 1: 1.0 in every earlier column, then 0.0 on the diagonal */
+            H->start[system->N_cm - 1] = *Htop;
+            H_sp->start[system->N_cm - 1] = *H_sp_top;
+
+            for ( i = 0; i < system->N_cm - 1; ++i )
+            {
+                H->j[*Htop] = i;
+                H->val[*Htop] = 1.0;
+                *Htop = *Htop + 1;
+
+                H_sp->j[*H_sp_top] = i;
+                H_sp->val[*H_sp_top] = 1.0;
+                *H_sp_top = *H_sp_top + 1;
+            }
+
+            H->j[*Htop] = system->N_cm - 1;
+            H->val[*Htop] = 0.0;
+            *Htop = *Htop + 1;
+
+            H_sp->j[*H_sp_top] = system->N_cm - 1;
+            H_sp->val[*H_sp_top] = 0.0;
+            *H_sp_top = *H_sp_top + 1;
+            break;
+
+        case ACKS2_CM:
+            if ( (X_diag = (real*) calloc(system->N, sizeof(real))) == NULL ) /* zeroed accumulator, one slot per atom */
+            {
+                fprintf( stderr, "not enough memory for charge matrix. terminating.\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+            /* row N: -1.0 in each atom column, then 0.0 on the diagonal */
+            H->start[system->N] = *Htop;
+            H_sp->start[system->N] = *H_sp_top;
+
+            for ( i = 0; i < system->N; ++i )
+            {
+                H->j[*Htop] = i;
+                H->val[*Htop] = -1.0;
+                *Htop = *Htop + 1;
+
+                H_sp->j[*H_sp_top] = i;
+                H_sp->val[*H_sp_top] = -1.0;
+                *H_sp_top = *H_sp_top + 1;
+            }
+
+            H->j[*Htop] = system->N;
+            H->val[*Htop] = 0.0;
+            *Htop = *Htop + 1;
+
+            H_sp->j[*H_sp_top] = system->N;
+            H_sp->val[*H_sp_top] = 0.0;
+            *H_sp_top = *H_sp_top + 1;
+            /* rows N+1 .. 2N, one per atom i: -1.0 in column i, bond-softness couplings, then a diagonal placeholder */
+            for ( i = 0; i < system->N; ++i )
+            {
+                H->start[system->N + i + 1] = *Htop;
+                H_sp->start[system->N + i + 1] = *H_sp_top;
+
+                H->j[*Htop] = i;
+                H->val[*Htop] = -1.0;
+                *Htop = *Htop + 1;
+
+                H_sp->j[*H_sp_top] = i;
+                H_sp->val[*H_sp_top] = -1.0;
+                *H_sp_top = *H_sp_top + 1;
+                /* couplings come from the far neighbors of atom i that lie within control->r_cut */
+                for ( pj = Start_Index(i, far_nbrs); pj < End_Index(i, far_nbrs); ++pj )
+                {
+                    if ( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut )
+                    {
+                        j = far_nbrs->select.far_nbr_list[pj].nbr;
+
+                        xcut = ( system->reaxprm.sbp[ system->atoms[i].type ].b_s_acks2
+                                + system->reaxprm.sbp[ system->atoms[j].type ].b_s_acks2 )
+                            / 2.0;
+                        /* an entry is stored only for 0.001 < d < xcut, where xcut is the mean of the two atoms' b_s_acks2 */
+                        if ( far_nbrs->select.far_nbr_list[pj].d < xcut &&
+                                far_nbrs->select.far_nbr_list[pj].d > 0.001 )
+                        {
+                            d = far_nbrs->select.far_nbr_list[pj].d / xcut;
+                            bond_softness = system->reaxprm.gp.l[34] * POW( d, 3.0 ) * POW( 1.0 - d, 6.0 );
+
+                            H->j[*Htop] = system->N + j + 1;
+                            H->val[*Htop] = MAX( 0.0, bond_softness );
+                            *Htop = *Htop + 1;
+
+                            H_sp->j[*H_sp_top] = system->N + j + 1;
+                            H_sp->val[*H_sp_top] = MAX( 0.0, bond_softness );
+                            *H_sp_top = *H_sp_top + 1;
+                            /* the unclamped softness is subtracted from the diagonal accumulators of both atoms */
+                            X_diag[i] -= bond_softness;
+                            X_diag[j] -= bond_softness;
+                        }
+                    }
+                }
+                /* diagonal placeholder; rows N+1 .. N_cm-2 get their X_diag value in the fix-up loop below */
+                H->j[*Htop] = system->N + i + 1;
+                H->val[*Htop] = 0.0;
+                *Htop = *Htop + 1;
+
+                H_sp->j[*H_sp_top] = system->N + i + 1;
+                H_sp->val[*H_sp_top] = 0.0;
+                *H_sp_top = *H_sp_top + 1;
+            }
+            /* start offset of the final row, N_cm - 1 */
+            H->start[system->N_cm - 1] = *Htop;
+            H_sp->start[system->N_cm - 1] = *H_sp_top;
+            /* fix-up: X_diag is complete only now, so locate each row's diagonal entry and store the accumulated value */
+            for ( i = system->N + 1; i < system->N_cm - 1; ++i )
+            {
+                for ( pj = H->start[i]; pj < H->start[i + 1]; ++pj )
+                {
+                    if ( H->j[pj] == i )
+                    {
+                        H->val[pj] = X_diag[i - system->N - 1];
+                        break;
+                    }
+                }
+
+                for ( pj = H_sp->start[i]; pj < H_sp->start[i + 1]; ++pj )
+                {
+                    if ( H_sp->j[pj] == i )
+                    {
+                        H_sp->val[pj] = X_diag[i - system->N - 1];
+                        break;
+                    }
+                }
+            }
+            /* final row: -1.0 in columns N+1 .. N_cm-2, then 0.0 on the diagonal */
+            for ( i = system->N + 1; i < system->N_cm - 1; ++i )
+            {
+                H->j[*Htop] = i;
+                H->val[*Htop] = -1.0;
+                *Htop = *Htop + 1;
+
+                H_sp->j[*H_sp_top] = i;
+                H_sp->val[*H_sp_top] = -1.0;
+                *H_sp_top = *H_sp_top + 1;
+            }
+
+            H->j[*Htop] = system->N_cm - 1;
+            H->val[*Htop] = 0.0;
+            *Htop = *Htop + 1;
+
+            H_sp->j[*H_sp_top] = system->N_cm - 1;
+            H_sp->val[*H_sp_top] = 0.0;
+            *H_sp_top = *H_sp_top + 1;
+
+            free( X_diag );
+            break;
+
+        default: /* other charge methods append nothing here */
+            break;
+    }
+}
+
+
 void Init_Forces( reax_system *system, control_params *control,
-                  simulation_data *data, static_storage *workspace,
-                  list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
     int i, j, pj;
     int start_i, end_i;
@@ -273,19 +707,15 @@ void Init_Forces( reax_system *system, control_params *control,
     int Htop, H_sp_top, btop_i, btop_j, num_bonds, num_hbonds;
     int ihb, jhb, ihb_top, jhb_top;
     int flag, flag_sp;
-    real r_ij, r2, self_coef;
-    real dr3gamij_1, dr3gamij_3, Tap;
-    //real val, dif, base;
+    real r_ij, r2;
     real C12, C34, C56;
     real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
     real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2;
     sparse_matrix *H, *H_sp;
     list *far_nbrs, *bonds, *hbonds;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
     far_neighbor_data *nbr_pj;
-    //LR_lookup_table *t;
     reax_atom *atom_i, *atom_j;
     bond_data *ibond, *jbond;
     bond_order_data *bo_ij, *bo_ji;
@@ -293,7 +723,6 @@ void Init_Forces( reax_system *system, control_params *control,
     far_nbrs = *lists + FAR_NBRS;
     bonds = *lists + BONDS;
     hbonds = *lists + HBONDS;
-
     H = workspace->H;
     H_sp = workspace->H_sp;
     Htop = 0;
@@ -301,8 +730,6 @@ void Init_Forces( reax_system *system, control_params *control,
     num_bonds = 0;
     num_hbonds = 0;
     btop_i = btop_j = 0;
-    p_boc1 = system->reaxprm.gp.l[0];
-    p_boc2 = system->reaxprm.gp.l[1];
 
     for ( i = 0; i < system->N; ++i )
     {
@@ -315,8 +742,11 @@ void Init_Forces( reax_system *system, control_params *control,
         btop_i = End_Index( i, bonds );
         sbp_i = &(system->reaxprm.sbp[type_i]);
         ihb = ihb_top = -1;
+
         if ( control->hb_cut > 0 && (ihb = sbp_i->p_hbond) == 1 )
+        {
             ihb_top = End_Index( workspace->hbond_index[i], hbonds );
+        }
 
         for ( pj = start_i; pj < end_i; ++pj )
         {
@@ -359,22 +789,10 @@ void Init_Forces( reax_system *system, control_params *control,
                 r_ij = nbr_pj->d;
                 sbp_j = &(system->reaxprm.sbp[type_j]);
                 twbp = &(system->reaxprm.tbp[type_i][type_j]);
-                self_coef = (i == j) ? 0.5 : 1.0;
-
-                /* H matrix entry */
-                Tap = control->Tap7 * r_ij + control->Tap6;
-                Tap = Tap * r_ij + control->Tap5;
-                Tap = Tap * r_ij + control->Tap4;
-                Tap = Tap * r_ij + control->Tap3;
-                Tap = Tap * r_ij + control->Tap2;
-                Tap = Tap * r_ij + control->Tap1;
-                Tap = Tap * r_ij + control->Tap0;
-
-                dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-                dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
 
                 H->j[Htop] = j;
-                H->val[Htop] = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3;
+                H->val[Htop] = Init_Charge_Matrix_Entry( system, control, i, j, 
+                        r_ij, OFF_DIAGONAL );
                 ++Htop;
 
                 /* H_sp matrix entry */
@@ -413,28 +831,40 @@ void Init_Forces( reax_system *system, control_params *control,
                 /* uncorrected bond orders */
                 if ( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut )
                 {
-                    r2 = SQR(r_ij);
+                    r2 = SQR( r_ij );
 
                     if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
                     {
                         C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
                         BO_s = (1.0 + control->bo_cut) * EXP( C12 );
                     }
-                    else BO_s = C12 = 0.0;
+                    else
+                    {
+                        BO_s = 0.0;
+                        C12 = 0.0;
+                    }
 
                     if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
                     {
                         C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
                         BO_pi = EXP( C34 );
                     }
-                    else BO_pi = C34 = 0.0;
+                    else
+                    {
+                        BO_pi = 0.0;
+                        C34 = 0.0;
+                    }
 
                     if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
                     {
                         C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
                         BO_pi2 = EXP( C56 );
                     }
-                    else BO_pi2 = C56 = 0.0;
+                    else
+                    {
+                        BO_pi2 = 0.0;
+                        C56 = 0.0;
+                    }
 
                     /* Initially BO values are the uncorrected ones, page 1 */
                     BO = BO_s + BO_pi + BO_pi2;
@@ -476,13 +906,13 @@ void Init_Forces( reax_system *system, control_params *control,
 
                         /* Only dln_BOp_xx wrt. dr_i is stored here, note that
                            dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-                        rvec_Scale(bo_ij->dln_BOp_s, -bo_ij->BO_s * Cln_BOp_s, ibond->dvec);
-                        rvec_Scale(bo_ij->dln_BOp_pi, -bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec);
-                        rvec_Scale(bo_ij->dln_BOp_pi2,
-                                   -bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec);
-                        rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
-                        rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
-                        rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
+                        rvec_Scale( bo_ij->dln_BOp_s, -bo_ij->BO_s * Cln_BOp_s, ibond->dvec );
+                        rvec_Scale( bo_ij->dln_BOp_pi, -bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec );
+                        rvec_Scale( bo_ij->dln_BOp_pi2,
+                                   -bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec );
+                        rvec_Scale( bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s );
+                        rvec_Scale( bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
+                        rvec_Scale( bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
 
                         /* Only dBOp wrt. dr_i is stored here, note that
                            dBOp/dr_i = -dBOp/dr_j and all others are 0 */
@@ -501,8 +931,12 @@ void Init_Forces( reax_system *system, control_params *control,
                         bo_ji->BO -= control->bo_cut;
                         workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp
                         workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp
-                        bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
-                        bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
+                        bo_ij->Cdbo = 0.0;
+                        bo_ij->Cdbopi = 0.0;
+                        bo_ij->Cdbopi2 = 0.0;
+                        bo_ji->Cdbo = 0.0;
+                        bo_ji->Cdbopi = 0.0;
+                        bo_ji->Cdbopi2 = 0.0;
 
                         /*fprintf( stderr, "%d %d %g %g %g\n",
                           i+1, j+1, bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2 );*/
@@ -541,30 +975,31 @@ void Init_Forces( reax_system *system, control_params *control,
 
         /* diagonal entry */
         H->j[Htop] = i;
-        H->val[Htop] = system->reaxprm.sbp[type_i].eta;
+        H->val[Htop] = Init_Charge_Matrix_Entry( system, control, i, i,
+                r_ij, DIAGONAL );
         ++Htop;
 
-        /* diagonal entry */
         H_sp->j[H_sp_top] = i;
         H_sp->val[H_sp_top] = H->val[Htop - 1];
         ++H_sp_top;
 
         Set_End_Index( i, btop_i, bonds );
         if ( ihb == 1 )
+        {
             Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
-        //fprintf( stderr, "%d bonds start: %d, end: %d\n",
-        //     i, Start_Index( i, bonds ), End_Index( i, bonds ) );
+        }
     }
 
-//    printf("Htop = %d\n", Htop);
-//    printf("H_sp_top = %d\n", H_sp_top);
+    Init_Charge_Matrix_Remaining_Entries( system, control, far_nbrs,
+            H, H_sp, &Htop, &H_sp_top );
 
     // mark the end of j list
-    H->start[i] = Htop;
-    H_sp->start[i] = H_sp_top;
+    H->start[system->N_cm] = Htop;
+    H_sp->start[system->N_cm] = H_sp_top;
+
     /* validate lists - decide if reallocation is required! */
     Validate_Lists( workspace, lists,
-                    data->step, system->N, H->m, Htop, num_bonds, num_hbonds );
+            data->step, system->N, H->m, Htop, num_bonds, num_hbonds );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n",
@@ -575,28 +1010,24 @@ void Init_Forces( reax_system *system, control_params *control,
 
 
 void Init_Forces_Tab( reax_system *system, control_params *control,
-                      simulation_data *data, static_storage *workspace,
-                      list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
     int Htop, H_sp_top, btop_i, btop_j, num_bonds, num_hbonds;
-    int tmin, tmax, r;
     int ihb, jhb, ihb_top, jhb_top;
     int flag, flag_sp;
-    real r_ij, r2, self_coef;
-    real val, dif, base;
+    real r_ij, r2;
     real C12, C34, C56;
     real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
     real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2;
     sparse_matrix *H, *H_sp;
     list *far_nbrs, *bonds, *hbonds;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
     far_neighbor_data *nbr_pj;
-    LR_lookup_table *t;
     reax_atom *atom_i, *atom_j;
     bond_data *ibond, *jbond;
     bond_order_data *bo_ij, *bo_ji;
@@ -612,8 +1043,6 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
     num_bonds = 0;
     num_hbonds = 0;
     btop_i = btop_j = 0;
-    p_boc1 = system->reaxprm.gp.l[0];
-    p_boc2 = system->reaxprm.gp.l[1];
 
     for ( i = 0; i < system->N; ++i )
     {
@@ -660,7 +1089,7 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                 {
                     flag_sp = 1;
                 }
-                nbr_pj->d = sqrt(nbr_pj->d);
+                nbr_pj->d = SQRT( nbr_pj->d );
                 flag = 1;
             }
 
@@ -670,22 +1099,10 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                 r_ij = nbr_pj->d;
                 sbp_j = &(system->reaxprm.sbp[type_j]);
                 twbp = &(system->reaxprm.tbp[type_i][type_j]);
-                self_coef = (i == j) ? 0.5 : 1.0;
-                tmin  = MIN( type_i, type_j );
-                tmax  = MAX( type_i, type_j );
-                t = &( LR[tmin][tmax] );
-
-                /* cubic spline interpolation */
-                r = (int)(r_ij * t->inv_dx);
-                if ( r == 0 )  ++r;
-                base = (real)(r + 1) * t->dx;
-                dif = r_ij - base;
-                val = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) * dif +
-                      t->ele[r].a;
-                val *= EV_to_KCALpMOL / C_ele;
 
                 H->j[Htop] = j;
-                H->val[Htop] = self_coef * val;
+                H->val[Htop] = Init_Charge_Matrix_Entry_Tab( system, control, i, j, 
+                        r_ij, OFF_DIAGONAL );
                 ++Htop;
 
                 /* H_sp matrix entry */
@@ -724,28 +1141,40 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                 /* uncorrected bond orders */
                 if ( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut )
                 {
-                    r2 = SQR(r_ij);
+                    r2 = SQR( r_ij );
 
                     if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
                     {
                         C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
                         BO_s = (1.0 + control->bo_cut) * EXP( C12 );
                     }
-                    else BO_s = C12 = 0.0;
+                    else
+                    {
+                        BO_s = 0.0;
+                        C12 = 0.0;
+                    }
 
                     if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
                     {
                         C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
                         BO_pi = EXP( C34 );
                     }
-                    else BO_pi = C34 = 0.0;
+                    else
+                    {
+                        BO_pi = 0.0;
+                        C34 = 0.0;
+                    }
 
                     if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
                     {
                         C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
                         BO_pi2 = EXP( C56 );
                     }
-                    else BO_pi2 = C56 = 0.0;
+                    else
+                    {
+                        BO_pi2 = 0.0;
+                        C56 = 0.0;
+                    }
 
                     /* Initially BO values are the uncorrected ones, page 1 */
                     BO = BO_s + BO_pi + BO_pi2;
@@ -788,13 +1217,13 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
 
                         /* Only dln_BOp_xx wrt. dr_i is stored here, note that
                            dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-                        rvec_Scale(bo_ij->dln_BOp_s, -bo_ij->BO_s * Cln_BOp_s, ibond->dvec);
-                        rvec_Scale(bo_ij->dln_BOp_pi, -bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec);
-                        rvec_Scale(bo_ij->dln_BOp_pi2,
-                                   -bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec);
-                        rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
-                        rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
-                        rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
+                        rvec_Scale( bo_ij->dln_BOp_s, -bo_ij->BO_s * Cln_BOp_s, ibond->dvec );
+                        rvec_Scale( bo_ij->dln_BOp_pi, -bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec );
+                        rvec_Scale( bo_ij->dln_BOp_pi2,
+                                   -bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec );
+                        rvec_Scale( bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s );
+                        rvec_Scale( bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
+                        rvec_Scale( bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
 
                         /* Only dBOp wrt. dr_i is stored here, note that
                            dBOp/dr_i = -dBOp/dr_j and all others are 0 */
@@ -813,8 +1242,12 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                         bo_ji->BO -= control->bo_cut;
                         workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp
                         workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp
-                        bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
-                        bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
+                        bo_ij->Cdbo = 0.0;
+                        bo_ij->Cdbopi = 0.0;
+                        bo_ij->Cdbopi2 = 0.0;
+                        bo_ji->Cdbo = 0.0;
+                        bo_ji->Cdbopi = 0.0;
+                        bo_ji->Cdbopi2 = 0.0;
 
                         Set_End_Index( j, btop_j + 1, bonds );
                     }
@@ -824,48 +1257,48 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
 
         /* diagonal entry */
         H->j[Htop] = i;
-        H->val[Htop] = system->reaxprm.sbp[type_i].eta;
+        H->val[Htop] = Init_Charge_Matrix_Entry_Tab( system, control, i, j,
+                r_ij, DIAGONAL );
         ++Htop;
 
-        /* diagonal entry */
         H_sp->j[H_sp_top] = i;
         H_sp->val[H_sp_top] = H->val[Htop - 1];
         ++H_sp_top;
 
         Set_End_Index( i, btop_i, bonds );
         if ( ihb == 1 )
+        {
             Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
+        }
     }
 
     // mark the end of j list
     H->start[i] = Htop;
     H_sp->start[i] = H_sp_top;
     /* validate lists - decide if reallocation is required! */
-    Validate_Lists( workspace, lists,
-                    data->step, system->N, H->m, Htop, num_bonds, num_hbonds );
+    Validate_Lists( workspace, lists, data->step, system->N,
+            H->m, Htop, num_bonds, num_hbonds );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n",
              data->step, Htop, num_bonds, num_hbonds );
     //Print_Bonds( system, bonds, "sbonds.out" );
     //Print_Bond_List2( system, bonds, "sbonds.out" );
-    //Print_Sparse_Matrix2( H, "H.out" );
+    //Print_Sparse_Matrix2( H, "H.out", NULL );
 #endif
 }
 
 
 void Estimate_Storage_Sizes( reax_system *system, control_params *control,
-                             list **lists, int *Htop, int *hb_top,
-                             int *bond_top, int *num_3body )
+        list **lists, int *Htop, int *hb_top, int *bond_top, int *num_3body )
 {
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
     int ihb, jhb;
-    real r_ij, r2;
+    real r_ij;
     real C12, C34, C56;
     real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2;
     list *far_nbrs;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
@@ -873,8 +1306,6 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
     reax_atom *atom_i, *atom_j;
 
     far_nbrs = *lists + FAR_NBRS;
-    p_boc1 = system->reaxprm.gp.l[0];
-    p_boc2 = system->reaxprm.gp.l[1];
 
     for ( i = 0; i < system->N; ++i )
     {
@@ -913,7 +1344,6 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
                 if ( nbr_pj->d <= control->nbr_cut )
                 {
                     r_ij = nbr_pj->d;
-                    r2 = SQR(r_ij);
 
                     if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
                     {
@@ -962,8 +1392,8 @@ void Estimate_Storage_Sizes( reax_system *system, control_params *control,
 
 
 void Compute_Forces( reax_system *system, control_params *control,
-                     simulation_data *data, static_storage *workspace,
-                     list** lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list** lists, output_controls *out_control )
 {
     real t_start, t_elapsed;
 
@@ -978,6 +1408,7 @@ void Compute_Forces( reax_system *system, control_params *control,
     }
     t_elapsed = Get_Timing_Info( t_start );
     data->timing.init_forces += t_elapsed;
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "init_forces - ");
 #endif
@@ -986,21 +1417,23 @@ void Compute_Forces( reax_system *system, control_params *control,
     Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
     t_elapsed = Get_Timing_Info( t_start );
     data->timing.bonded += t_elapsed;
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "bonded_forces - ");
 #endif
 
     t_start = Get_Time( );
     Compute_NonBonded_Forces( system, control, data, workspace,
-                              lists, out_control );
+            lists, out_control );
     t_elapsed = Get_Timing_Info( t_start );
     data->timing.nonb += t_elapsed;
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "nonbondeds - ");
 #endif
 
     Compute_Total_Force( system, control, data, workspace, lists );
-    //Print_Total_Force( system, control, data, workspace, lists, out_control );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "totalforces - ");
     //Print_Total_Force( system, control, data, workspace, lists, out_control );
@@ -1010,6 +1443,7 @@ void Compute_Forces( reax_system *system, control_params *control,
     Print_Total_Force( system, control, data, workspace, lists, out_control );
     Compare_Total_Forces( system, control, data, workspace, lists, out_control );
 #endif
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "forces - ");
 #endif
diff --git a/sPuReMD/src/four_body_interactions.c b/sPuReMD/src/four_body_interactions.c
index 1fc7e99c5e0b7d3738ac04cd23bccbbfba397b52..e0010e8cfc412f1e5224df36dd322eacbc95b64e 100644
--- a/sPuReMD/src/four_body_interactions.c
+++ b/sPuReMD/src/four_body_interactions.c
@@ -29,13 +29,14 @@
 
 #define MIN_SINE 1e-10
 
+
 real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
-                      rvec dvec_kl, real r_kl, rvec dvec_li, real r_li,
-                      three_body_interaction_data *p_ijk,
-                      three_body_interaction_data *p_jkl,
-                      rvec dcos_omega_di, rvec dcos_omega_dj,
-                      rvec dcos_omega_dk, rvec dcos_omega_dl,
-                      output_controls *out_control )
+        rvec dvec_kl, real r_kl, rvec dvec_li, real r_li,
+        three_body_interaction_data *p_ijk,
+        three_body_interaction_data *p_jkl,
+        rvec dcos_omega_di, rvec dcos_omega_dj,
+        rvec dcos_omega_dk, rvec dcos_omega_dl,
+        output_controls *out_control )
 {
     real unnorm_cos_omega, unnorm_sin_omega, omega;
     real sin_ijk, cos_ijk, sin_jkl, cos_jkl;
@@ -50,7 +51,7 @@ real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
 
     /* omega */
     unnorm_cos_omega = -rvec_Dot( dvec_ij, dvec_jk ) * rvec_Dot( dvec_jk, dvec_kl ) +
-                       SQR( r_jk ) *  rvec_Dot( dvec_ij, dvec_kl );
+        SQR( r_jk ) *  rvec_Dot( dvec_ij, dvec_kl );
     rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl );
     unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl );
     omega = atan2( unnorm_sin_omega, unnorm_cos_omega );
@@ -72,16 +73,24 @@ real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
 
 
     poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl;
-    if ( poem < 1e-20 ) poem = 1e-20;
+    if ( poem < 1e-20 )
+    {
+        poem = 1e-20;
+    }
 
     tel  = (SQR(r_ij) + SQR(r_jk) + SQR(r_kl) - SQR(r_li)) -
-           2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl +
-                   r_jk * r_kl * cos_jkl );
+        2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl +
+                r_jk * r_kl * cos_jkl );
 
     arg  = tel / poem;
-    if ( arg >  1.0 ) arg =  1.0;
-    if ( arg < -1.0 ) arg = -1.0;
-
+    if ( arg >  1.0 )
+    {
+        arg =  1.0;
+    }
+    if ( arg < -1.0 )
+    {
+        arg = -1.0;
+    }
 
     /*fprintf( out_control->etor,
       "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
@@ -143,525 +152,613 @@ real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
 }
 
 
-
-
-
 void Four_Body_Interactions( reax_system *system, control_params *control,
-                             simulation_data *data, static_storage *workspace,
-                             list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
-    int i, j, k, l, pi, pj, pk, pl, pij, plk;
-    int type_i, type_j, type_k, type_l;
-    int start_j, end_j, start_k, end_k;
-    int start_pj, end_pj, start_pk, end_pk;
-    int num_frb_intrs = 0;
-
-    real Delta_j, Delta_k;
-    real r_ij, r_jk, r_kl, r_li;
-    real BOA_ij, BOA_jk, BOA_kl;
-
-    real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl;
-    real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv;
-    real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl;
-    real fn10, f11_DjDk, dfn11, fn12;
-
-    real theta_ijk, theta_jkl;
-    real sin_ijk, sin_jkl;
-    real cos_ijk, cos_jkl;
-    real tan_ijk_i, tan_jkl_i;
-
-    real omega, cos_omega, cos2omega, cos3omega;
-    rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl;
-
-    real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4;
-    real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
-    real Cconj, CEconj1, CEconj2, CEconj3;
-    real CEconj4, CEconj5, CEconj6;
-
-    real e_tor, e_con;
-    rvec dvec_li;
-    rvec force, ext_press;
-    ivec rel_box_jl;
-    // rtensor total_rtensor, temp_rtensor;
-
-    four_body_header *fbh;
-    four_body_parameters *fbp;
-    bond_data *pbond_ij, *pbond_jk, *pbond_kl;
-    bond_order_data *bo_ij, *bo_jk, *bo_kl;
-    three_body_interaction_data *p_ijk, *p_jkl;
-
-    real p_tor2 = system->reaxprm.gp.l[23];
-    real p_tor3 = system->reaxprm.gp.l[24];
-    real p_tor4 = system->reaxprm.gp.l[25];
-    real p_cot2 = system->reaxprm.gp.l[27];
-
-    list *bonds = (*lists) + BONDS;
-    list *thb_intrs = (*lists) + THREE_BODIES;
-
-
-    for ( j = 0; j < system->N; ++j )
+    real p_tor2, p_tor3, p_tor4, p_cot2;
+    list *bonds, *thb_intrs;
+    real e_tor_total, e_con_total;
+
+    p_tor2 = system->reaxprm.gp.l[23];
+    p_tor3 = system->reaxprm.gp.l[24];
+    p_tor4 = system->reaxprm.gp.l[25];
+    p_cot2 = system->reaxprm.gp.l[27];
+    bonds = (*lists) + BONDS;
+    thb_intrs = (*lists) + THREE_BODIES;
+    e_tor_total = 0.0;
+    e_con_total = 0.0;
+
+#ifdef _OPENMP
+    #pragma omp parallel default(shared) reduction(+: e_tor_total, e_con_total)
+#endif
     {
-        type_j = system->atoms[j].type;
-        Delta_j = workspace->Delta_boc[j];
-        start_j = Start_Index(j, bonds);
-        end_j = End_Index(j, bonds);
-
-
-        for ( pk = start_j; pk < end_j; ++pk )
+        int i, j, k, l, pi, pj, pk, pl, pij, plk;
+        int type_i, type_j, type_k, type_l;
+        int start_j, end_j;
+        int start_pj, end_pj, start_pk, end_pk;
+#ifdef TEST_FORCES
+        int num_frb_intrs = 0;
+#endif
+        real Delta_j, Delta_k;
+        real r_ij, r_jk, r_kl, r_li;
+        real BOA_ij, BOA_jk, BOA_kl;
+        real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl;
+        real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv;
+        real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl;
+        real fn10, f11_DjDk, dfn11, fn12;
+        real theta_ijk, theta_jkl;
+        real sin_ijk, sin_jkl;
+        real cos_ijk, cos_jkl;
+        real tan_ijk_i, tan_jkl_i;
+        real omega, cos_omega, cos2omega, cos3omega;
+        rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl;
+        real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4;
+        real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
+        real Cconj, CEconj1, CEconj2, CEconj3;
+        real CEconj4, CEconj5, CEconj6;
+        real e_tor, e_con;
+        rvec dvec_li;
+        rvec force, ext_press;
+        ivec rel_box_jl;
+        //rtensor total_rtensor, temp_rtensor;
+        four_body_header *fbh;
+        four_body_parameters *fbp;
+        bond_data *pbond_ij, *pbond_jk, *pbond_kl;
+        bond_order_data *bo_ij, *bo_jk, *bo_kl;
+        three_body_interaction_data *p_ijk, *p_jkl;
+        rvec *f_i, *f_j, *f_k, *f_l;
+#ifdef _OPENMP
+        int tid = omp_get_thread_num( );
+
+        #pragma omp for schedule(static)
+#endif
+        for ( j = 0; j < system->N; ++j )
         {
-            pbond_jk = &( bonds->select.bond_list[pk] );
-            k = pbond_jk->nbr;
-            bo_jk = &( pbond_jk->bo_data );
-            BOA_jk = bo_jk->BO - control->thb_cut;
-
-            /* see if there are any 3-body interactions involving j&k
-            where j is the central atom. Otherwise there is no point in
-             trying to form a 4-body interaction out of this neighborhood */
-            if ( j < k && bo_jk->BO > control->thb_cut/*0*/ &&
-                    Num_Entries(pk, thb_intrs) )
+            type_j = system->atoms[j].type;
+            Delta_j = workspace->Delta_boc[j];
+            start_j = Start_Index(j, bonds);
+            end_j = End_Index(j, bonds);
+#ifdef _OPENMP
+            f_j = &(workspace->f_local[tid * system->N + j]);
+#else
+            f_j = &(system->atoms[j].f);
+#endif
+
+            for ( pk = start_j; pk < end_j; ++pk )
             {
-                start_k = Start_Index(k, bonds);
-                end_k = End_Index(k, bonds);
-                pj = pbond_jk->sym_index; // pj points to j on k's list
+                pbond_jk = &( bonds->select.bond_list[pk] );
+                k = pbond_jk->nbr;
+                bo_jk = &( pbond_jk->bo_data );
+                BOA_jk = bo_jk->BO - control->thb_cut;
+#ifdef _OPENMP
+                f_k = &(workspace->f_local[tid * system->N + k]);
+#else
+                f_k = &(system->atoms[k].f);
+#endif
 
-                /* do the same check as above: are there any 3-body interactions
-                   involving k&j where k is the central atom */
-                if ( Num_Entries(pj, thb_intrs) )
+                /* Check whether there are any 3-body interactions involving
+                 * j and k with j as the central atom; otherwise there is no point
+                 * in trying to form a 4-body interaction out of this neighborhood */
+                if ( j < k && bo_jk->BO > control->thb_cut &&
+                        Num_Entries(pk, thb_intrs) )
                 {
-                    type_k = system->atoms[k].type;
-                    Delta_k = workspace->Delta_boc[k];
-                    r_jk = pbond_jk->d;
+                    pj = pbond_jk->sym_index; // pj points to j on k's list
 
-                    start_pk = Start_Index(pk, thb_intrs );
-                    end_pk = End_Index(pk, thb_intrs );
-                    start_pj = Start_Index(pj, thb_intrs );
-                    end_pj = End_Index(pj, thb_intrs );
-
-                    exp_tor2_jk = EXP( -p_tor2 * BOA_jk );
-                    exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) );
-                    exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) );
-                    exp_tor4_DjDk = EXP( p_tor4  * (Delta_j + Delta_k) );
-                    exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk);
-                    f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv;
+                    /* Same check as above with the roles swapped: are there any
+                       3-body interactions involving k and j with k as the central atom? */
+                    if ( Num_Entries(pj, thb_intrs) )
+                    {
+                        type_k = system->atoms[k].type;
+                        Delta_k = workspace->Delta_boc[k];
+                        r_jk = pbond_jk->d;
+
+                        start_pk = Start_Index(pk, thb_intrs );
+                        end_pk = End_Index(pk, thb_intrs );
+                        start_pj = Start_Index(pj, thb_intrs );
+                        end_pj = End_Index(pj, thb_intrs );
+
+                        exp_tor2_jk = EXP( -p_tor2 * BOA_jk );
+                        exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) );
+                        exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) );
+                        exp_tor4_DjDk = EXP( p_tor4  * (Delta_j + Delta_k) );
+                        exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk);
+                        f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv;
+
+                        /* pick i up from j-k interaction where j is the centre atom */
+                        for ( pi = start_pk; pi < end_pk; ++pi )
+                        {
+                            p_ijk = &( thb_intrs->select.three_body_list[pi] );
+                            pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
+                            pbond_ij = &( bonds->select.bond_list[pij] );
+                            bo_ij = &( pbond_ij->bo_data );
 
+                            if ( bo_ij->BO > control->thb_cut/*0*/ )
+                            {
+                                i = p_ijk->thb;
+                                type_i = system->atoms[i].type;
+                                r_ij = pbond_ij->d;
+                                BOA_ij = bo_ij->BO - control->thb_cut;
+
+#ifdef _OPENMP
+                                f_i = &(workspace->f_local[tid * system->N + i]);
+#else
+                                f_i = &(system->atoms[i].f);
+#endif
 
-                    /* pick i up from j-k interaction where j is the centre atom */
-                    for ( pi = start_pk; pi < end_pk; ++pi )
-                    {
-                        p_ijk = &( thb_intrs->select.three_body_list[pi] );
-                        pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
-                        pbond_ij = &( bonds->select.bond_list[pij] );
-                        bo_ij = &( pbond_ij->bo_data );
+                                theta_ijk = p_ijk->theta;
+                                sin_ijk = SIN( theta_ijk );
+                                cos_ijk = COS( theta_ijk );
+                                //tan_ijk_i = 1. / TAN( theta_ijk );
+                                if ( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
+                                {
+                                    tan_ijk_i = cos_ijk / MIN_SINE;
+                                }
+                                else if ( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
+                                {
+                                    tan_ijk_i = cos_ijk / -MIN_SINE;
+                                }
+                                else
+                                {
+                                    tan_ijk_i = cos_ijk / sin_ijk;
+                                }
 
+                                exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
+                                exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij - 1.5) );
 
-                        if ( bo_ij->BO > control->thb_cut/*0*/ )
-                        {
-                            i = p_ijk->thb;
-                            type_i = system->atoms[i].type;
-                            r_ij = pbond_ij->d;
-                            BOA_ij = bo_ij->BO - control->thb_cut;
-
-                            theta_ijk = p_ijk->theta;
-                            sin_ijk = SIN( theta_ijk );
-                            cos_ijk = COS( theta_ijk );
-                            //tan_ijk_i = 1. / TAN( theta_ijk );
-                            if ( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
-                                tan_ijk_i = cos_ijk / MIN_SINE;
-                            else if ( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
-                                tan_ijk_i = cos_ijk / -MIN_SINE;
-                            else tan_ijk_i = cos_ijk / sin_ijk;
-
-                            exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
-                            exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij - 1.5) );
-
-                            /* pick l up from j-k intr. where k is the centre */
-                            for ( pl = start_pj; pl < end_pj; ++pl )
-                            {
-                                p_jkl = &( thb_intrs->select.three_body_list[pl] );
-                                l = p_jkl->thb;
-                                plk = p_jkl->pthb; //pointer to l on k's bond_list!
-                                pbond_kl = &( bonds->select.bond_list[plk] );
-                                bo_kl = &( pbond_kl->bo_data );
-                                type_l = system->atoms[l].type;
-                                fbh = &(system->reaxprm.fbp[type_i][type_j][type_k][type_l]);
-                                fbp = &(system->reaxprm.fbp[type_i][type_j]
-                                        [type_k][type_l].prm[0]);
-
-                                if ( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ &&
-                                        bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ )
+                                /* pick l up from the j-k interaction where k is the centre atom */
+                                for ( pl = start_pj; pl < end_pj; ++pl )
                                 {
-                                    ++num_frb_intrs;
-                                    r_kl = pbond_kl->d;
-                                    BOA_kl = bo_kl->BO - control->thb_cut;
-
-                                    theta_jkl = p_jkl->theta;
-                                    sin_jkl = SIN( theta_jkl );
-                                    cos_jkl = COS( theta_jkl );
-                                    //tan_jkl_i = 1. / TAN( theta_jkl );
-                                    if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
-                                        tan_jkl_i = cos_jkl / MIN_SINE;
-                                    else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
-                                        tan_jkl_i = cos_jkl / -MIN_SINE;
-                                    else tan_jkl_i = cos_jkl / sin_jkl;
-
-                                    Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x,
-                                                       &(system->box), dvec_li );
-                                    r_li = rvec_Norm( dvec_li );
-
-
-                                    /* omega and its derivative */
-                                    //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec,
-                                    omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec,
-                                                            r_jk, pbond_kl->dvec, r_kl,
-                                                            dvec_li, r_li, p_ijk, p_jkl,
-                                                            dcos_omega_di, dcos_omega_dj,
-                                                            dcos_omega_dk, dcos_omega_dl,
-                                                            out_control);
-                                    cos_omega = COS( omega );
-                                    cos2omega = COS( 2. * omega );
-                                    cos3omega = COS( 3. * omega );
-                                    /* end omega calculations */
-
-                                    /* torsion energy */
-                                    exp_tor1 = EXP(fbp->p_tor1 * SQR(2. - bo_jk->BO_pi - f11_DjDk));
-                                    exp_tor2_kl = EXP( -p_tor2 * BOA_kl );
-                                    exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl - 1.5) );
-                                    fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) *
-                                           (1.0 - exp_tor2_kl);
-
-                                    CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) +
-                                                 fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
-                                                 fbp->V3 * (1.0 + cos3omega) );
-                                    //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) +
-                                    //  fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) +
-                                    //  fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega);
-
-                                    data->E_Tor += e_tor = fn10 * sin_ijk * sin_jkl * CV;
-
-                                    dfn11 = (-p_tor3 * exp_tor3_DjDk +
-                                             (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
-                                             (2. + exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv;
-
-                                    CEtors1 = sin_ijk * sin_jkl * CV;
-
-                                    CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 *
-                                              (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) *
-                                              sin_ijk * sin_jkl;
-
-                                    CEtors3 = CEtors2 * dfn11;
-
-                                    CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij *
-                                              (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
-
-                                    CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk *
-                                              (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl);
-
-                                    CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl *
-                                              (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk);
-
-                                    cmn = -fn10 * CV;
-                                    CEtors7 = cmn * sin_jkl * tan_ijk_i;
-                                    CEtors8 = cmn * sin_ijk * tan_jkl_i;
-                                    CEtors9 = fn10 * sin_ijk * sin_jkl *
-                                              (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
-                                               1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega)));
-                                    //cmn = -fn10 * CV;
-                                    //CEtors7 = cmn * sin_jkl * cos_ijk;
-                                    //CEtors8 = cmn * sin_ijk * cos_jkl;
-                                    //CEtors9 = fn10 * sin_ijk * sin_jkl *
-                                    //  (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
-                                    //   fbp->V3 * (6*SQR(cos_omega) - 1.50));
-                                    /* end  of torsion energy */
-
-
-                                    /* 4-body conjugation energy */
-                                    fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
-                                    data->E_Con += e_con = fbp->p_cot1 * fn12 *
-                                                           (1. + (SQR(cos_omega) - 1.) * sin_ijk * sin_jkl);
-
-                                    Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 *
+                                    p_jkl = &( thb_intrs->select.three_body_list[pl] );
+                                    l = p_jkl->thb;
+                                    plk = p_jkl->pthb; //pointer to l on k's bond_list!
+                                    pbond_kl = &( bonds->select.bond_list[plk] );
+                                    bo_kl = &( pbond_kl->bo_data );
+                                    type_l = system->atoms[l].type;
+                                    fbh = &(system->reaxprm.fbp[type_i][type_j][type_k][type_l]);
+                                    fbp = &(system->reaxprm.fbp[type_i][type_j]
+                                            [type_k][type_l].prm[0]);
+
+                                    if ( i != l && fbh->cnt && bo_kl->BO > control->thb_cut &&
+                                            bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut )
+                                    {
+#ifdef _OPENMP
+                                        f_l = &(workspace->f_local[tid * system->N + l]);
+#else
+                                        f_l = &(system->atoms[l].f);
+#endif
+
+#ifdef TEST_FORCES
+                                        ++num_frb_intrs;
+#endif
+
+                                        r_kl = pbond_kl->d;
+                                        BOA_kl = bo_kl->BO - control->thb_cut;
+
+                                        theta_jkl = p_jkl->theta;
+                                        sin_jkl = SIN( theta_jkl );
+                                        cos_jkl = COS( theta_jkl );
+                                        //tan_jkl_i = 1. / TAN( theta_jkl );
+                                        if ( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
+                                        {
+                                            tan_jkl_i = cos_jkl / MIN_SINE;
+                                        }
+                                        else if ( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
+                                        {
+                                            tan_jkl_i = cos_jkl / -MIN_SINE;
+                                        }
+                                        else
+                                        {
+                                            tan_jkl_i = cos_jkl / sin_jkl;
+                                        }
+
+                                        Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x,
+                                                &(system->box), dvec_li );
+                                        r_li = rvec_Norm( dvec_li );
+
+                                        /* omega and its derivative */
+                                        //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec,
+                                        omega = Calculate_Omega( pbond_ij->dvec, r_ij, pbond_jk->dvec,
+                                                r_jk, pbond_kl->dvec, r_kl, dvec_li, r_li, p_ijk, p_jkl,
+                                                dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl,
+                                                out_control );
+                                        cos_omega = COS( omega );
+                                        cos2omega = COS( 2. * omega );
+                                        cos3omega = COS( 3. * omega );
+                                        /* end omega calculations */
+
+                                        /* torsion energy */
+                                        exp_tor1 = EXP( fbp->p_tor1 * SQR(2. - bo_jk->BO_pi - f11_DjDk) );
+                                        exp_tor2_kl = EXP( -p_tor2 * BOA_kl );
+                                        exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl - 1.5) );
+                                        fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) *
+                                               (1.0 - exp_tor2_kl);
+
+                                        CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) +
+                                                     fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
+                                                     fbp->V3 * (1.0 + cos3omega) );
+                                        //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) +
+                                        //  fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) +
+                                        //  fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega);
+
+                                        e_tor = fn10 * sin_ijk * sin_jkl * CV;
+                                        e_tor_total += e_tor;
+
+                                        dfn11 = (-p_tor3 * exp_tor3_DjDk +
+                                                (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
+                                                (2. + exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv;
+
+                                        CEtors1 = sin_ijk * sin_jkl * CV;
+
+                                        CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 *
+                                            (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) *
+                                            sin_ijk * sin_jkl;
+
+                                        CEtors3 = CEtors2 * dfn11;
+
+                                        CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij *
+                                            (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
+
+                                        CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk *
+                                            (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl);
+
+                                        CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl *
+                                            (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk);
+
+                                        cmn = -fn10 * CV;
+                                        CEtors7 = cmn * sin_jkl * tan_ijk_i;
+                                        CEtors8 = cmn * sin_ijk * tan_jkl_i;
+                                        CEtors9 = fn10 * sin_ijk * sin_jkl *
+                                            (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
+                                             1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega)));
+                                        //cmn = -fn10 * CV;
+                                        //CEtors7 = cmn * sin_jkl * cos_ijk;
+                                        //CEtors8 = cmn * sin_ijk * cos_jkl;
+                                        //CEtors9 = fn10 * sin_ijk * sin_jkl *
+                                        //  (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
+                                        //   fbp->V3 * (6*SQR(cos_omega) - 1.50));
+                                        /* end of torsion energy */
+
+                                        /* 4-body conjugation energy */
+                                        fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
+                                        e_con = fbp->p_cot1 * fn12 *
                                             (1. + (SQR(cos_omega) - 1.) * sin_ijk * sin_jkl);
+                                        e_con_total += e_con;
+
+                                        Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 *
+                                                (1. + (SQR(cos_omega) - 1.) * sin_ijk * sin_jkl);
+
+                                        CEconj1 = Cconj * (BOA_ij - 1.5e0);
+                                        CEconj2 = Cconj * (BOA_jk - 1.5e0);
+                                        CEconj3 = Cconj * (BOA_kl - 1.5e0);
+
+                                        CEconj4 = -fbp->p_cot1 * fn12 *
+                                                  (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
+                                        CEconj5 = -fbp->p_cot1 * fn12 *
+                                                  (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
+                                        //CEconj4 = -fbp->p_cot1 * fn12 *
+                                        //  (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk;
+                                        //CEconj5 = -fbp->p_cot1 * fn12 *
+                                        //  (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl;
+                                        CEconj6 = 2.0 * fbp->p_cot1 * fn12 *
+                                                  cos_omega * sin_ijk * sin_jkl;
+                                        /* end 4-body conjugation energy */
+
+                                        //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ",
+                                        //   workspace->orig_id[i], workspace->orig_id[j],
+                                        //       workspace->orig_id[k], workspace->orig_id[l],
+                                        //    omega, cos_omega, cos2omega, cos3omega );
+                                        //fprintf(stdout,
+                                        //    "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                        //    CEtors2, CEtors3, CEtors4, CEtors5,
+                                        //    CEtors6, CEtors7, CEtors8, CEtors9 );
+                                        //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                        //    theta_ijk, theta_jkl, sin_ijk,
+                                        //    sin_jkl, cos_jkl, tan_jkl_i );
+
+                                        /* forces */
+#ifdef _OPENMP
+                                        #pragma omp atomic
+#endif
+                                        bo_jk->Cdbopi += CEtors2;
+#ifdef _OPENMP
+                                        #pragma omp atomic
+#endif
+                                        workspace->CdDelta[j] += CEtors3;
+#ifdef _OPENMP
+                                        #pragma omp atomic
+#endif
+                                        workspace->CdDelta[k] += CEtors3;
+#ifdef _OPENMP
+                                        #pragma omp atomic
+#endif
+                                        bo_ij->Cdbo += (CEtors4 + CEconj1);
+#ifdef _OPENMP
+                                        #pragma omp atomic
+#endif
+                                        bo_jk->Cdbo += (CEtors5 + CEconj2);
+#ifdef _OPENMP
+                                        #pragma omp atomic
+#endif
+                                        bo_kl->Cdbo += (CEtors6 + CEconj3);
+
+                                        if ( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT )
+                                        {
+                                            /* dcos_theta_ijk */
+                                            rvec_ScaledAdd( *f_i,
+                                                    CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                            rvec_ScaledAdd( *f_j,
+                                                    CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                            rvec_ScaledAdd( *f_k,
+                                                    CEtors7 + CEconj4, p_ijk->dcos_di );
+
+                                            /* dcos_theta_jkl */
+                                            rvec_ScaledAdd( *f_j,
+                                                    CEtors8 + CEconj5, p_jkl->dcos_di );
+                                            rvec_ScaledAdd( *f_k,
+                                                    CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                            rvec_ScaledAdd( *f_l,
+                                                    CEtors8 + CEconj5, p_jkl->dcos_dk );
+
+                                            /* dcos_omega */
+                                            rvec_ScaledAdd( *f_i,
+                                                    CEtors9 + CEconj6, dcos_omega_di );
+                                            rvec_ScaledAdd( *f_j,
+                                                    CEtors9 + CEconj6, dcos_omega_dj );
+                                            rvec_ScaledAdd( *f_k,
+                                                    CEtors9 + CEconj6, dcos_omega_dk );
+                                            rvec_ScaledAdd( *f_l,
+                                                    CEtors9 + CEconj6, dcos_omega_dl );
+                                        }
+                                        else
+                                        {
+                                            ivec_Sum( rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box );
+
+                                            /* dcos_theta_ijk */
+                                            rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                            rvec_Add( *f_i, force );
+                                            rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+#ifdef _OPENMP
+                                            #pragma omp critical (Four_Body_Interactions_ext_press)
+#endif
+                                            {
+                                                rvec_Add( data->ext_press, ext_press );
+                                            }
 
-                                    CEconj1 = Cconj * (BOA_ij - 1.5e0);
-                                    CEconj2 = Cconj * (BOA_jk - 1.5e0);
-                                    CEconj3 = Cconj * (BOA_kl - 1.5e0);
-
-                                    CEconj4 = -fbp->p_cot1 * fn12 *
-                                              (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
-                                    CEconj5 = -fbp->p_cot1 * fn12 *
-                                              (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
-                                    //CEconj4 = -fbp->p_cot1 * fn12 *
-                                    //  (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk;
-                                    //CEconj5 = -fbp->p_cot1 * fn12 *
-                                    //  (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl;
-                                    CEconj6 = 2.0 * fbp->p_cot1 * fn12 *
-                                              cos_omega * sin_ijk * sin_jkl;
-                                    /* end 4-body conjugation energy */
-
-                                    //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ",
-                                    //   workspace->orig_id[i], workspace->orig_id[j],
-                                    //       workspace->orig_id[k], workspace->orig_id[l],
-                                    //    omega, cos_omega, cos2omega, cos3omega );
-                                    //fprintf(stdout,
-                                    //    "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                    //    CEtors2, CEtors3, CEtors4, CEtors5,
-                                    //    CEtors6, CEtors7, CEtors8, CEtors9 );
-                                    //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                    //    theta_ijk, theta_jkl, sin_ijk,
-                                    //    sin_jkl, cos_jkl, tan_jkl_i );
-
-                                    /* forces */
-                                    bo_jk->Cdbopi += CEtors2;
-                                    workspace->CdDelta[j] += CEtors3;
-                                    workspace->CdDelta[k] += CEtors3;
-                                    bo_ij->Cdbo += (CEtors4 + CEconj1);
-                                    bo_jk->Cdbo += (CEtors5 + CEconj2);
-                                    bo_kl->Cdbo += (CEtors6 + CEconj3);
-
-                                    if ( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT )
-                                    {
-                                        /* dcos_theta_ijk */
-                                        rvec_ScaledAdd( system->atoms[i].f,
-                                                        CEtors7 + CEconj4, p_ijk->dcos_dk );
-                                        rvec_ScaledAdd( system->atoms[j].f,
-                                                        CEtors7 + CEconj4, p_ijk->dcos_dj );
-                                        rvec_ScaledAdd( system->atoms[k].f,
-                                                        CEtors7 + CEconj4, p_ijk->dcos_di );
-
-                                        /* dcos_theta_jkl */
-                                        rvec_ScaledAdd( system->atoms[j].f,
-                                                        CEtors8 + CEconj5, p_jkl->dcos_di );
-                                        rvec_ScaledAdd( system->atoms[k].f,
-                                                        CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                        rvec_ScaledAdd( system->atoms[l].f,
-                                                        CEtors8 + CEconj5, p_jkl->dcos_dk );
-
-                                        /* dcos_omega */
-                                        rvec_ScaledAdd( system->atoms[i].f,
-                                                        CEtors9 + CEconj6, dcos_omega_di );
-                                        rvec_ScaledAdd( system->atoms[j].f,
-                                                        CEtors9 + CEconj6, dcos_omega_dj );
-                                        rvec_ScaledAdd( system->atoms[k].f,
-                                                        CEtors9 + CEconj6, dcos_omega_dk );
-                                        rvec_ScaledAdd( system->atoms[l].f,
-                                                        CEtors9 + CEconj6, dcos_omega_dl );
-                                    }
-                                    else
-                                    {
-                                        ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
-
-                                        /* dcos_theta_ijk */
-                                        rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
-                                        rvec_Add( system->atoms[i].f, force );
-                                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                        rvec_Add( data->ext_press, ext_press );
-
-                                        rvec_ScaledAdd( system->atoms[j].f,
-                                                        CEtors7 + CEconj4, p_ijk->dcos_dj );
-
-                                        rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
-                                        rvec_Add( system->atoms[k].f, force );
-                                        rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                        rvec_Add( data->ext_press, ext_press );
-
-
-                                        /* dcos_theta_jkl */
-                                        rvec_ScaledAdd( system->atoms[j].f,
-                                                        CEtors8 + CEconj5, p_jkl->dcos_di );
-
-                                        rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                        rvec_Add( system->atoms[k].f, force );
-                                        rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                        rvec_Add( data->ext_press, ext_press );
-
-                                        rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
-                                        rvec_Add( system->atoms[l].f, force );
-                                        rvec_iMultiply( ext_press, rel_box_jl, force );
-                                        rvec_Add( data->ext_press, ext_press );
-
-
-                                        /* dcos_omega */
-                                        rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
-                                        rvec_Add( system->atoms[i].f, force );
-                                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                        rvec_Add( data->ext_press, ext_press );
-
-                                        rvec_ScaledAdd( system->atoms[j].f,
-                                                        CEtors9 + CEconj6, dcos_omega_dj );
-
-                                        rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
-                                        rvec_Add( system->atoms[k].f, force );
-                                        rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                        rvec_Add( data->ext_press, ext_press );
-
-                                        rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
-                                        rvec_Add( system->atoms[l].f, force );
-                                        rvec_iMultiply( ext_press, rel_box_jl, force );
-                                        rvec_Add( data->ext_press, ext_press );
-
-
-                                        /* This part is intended for a fully-flexible box */
-                                        /* rvec_ScaledSum( temp_rvec,
-                                           CEtors7 + CEconj4, p_ijk->dcos_dk,      // i
-                                           CEtors9 + CEconj6, dcos_omega_di );
-                                           rvec_OuterProduct( temp_rtensor,
-                                           temp_rvec, system->atoms[i].x );
-                                           rtensor_Copy( total_rtensor, temp_rtensor );
-
-                                           rvec_ScaledSum( temp_rvec,
-                                           CEtors7 + CEconj4, p_ijk->dcos_dj,      // j
-                                           CEtors8 + CEconj5, p_jkl->dcos_di );
-                                           rvec_ScaledAdd( temp_rvec,
-                                           CEtors9 + CEconj6, dcos_omega_dj );
-                                           rvec_OuterProduct( temp_rtensor,
-                                           temp_rvec, system->atoms[j].x );
-                                           rtensor_Add( total_rtensor, temp_rtensor );
-
-                                           rvec_ScaledSum( temp_rvec,
-                                           CEtors7 + CEconj4, p_ijk->dcos_di,      // k
-                                           CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                           rvec_ScaledAdd( temp_rvec,
-                                           CEtors9 + CEconj6, dcos_omega_dk );
-                                           rvec_OuterProduct( temp_rtensor,
-                                           temp_rvec, system->atoms[k].x );
-                                           rtensor_Add( total_rtensor, temp_rtensor );
-
-                                           rvec_ScaledSum( temp_rvec,
-                                           CEtors8 + CEconj5, p_jkl->dcos_dk,      // l
-                                           CEtors9 + CEconj6, dcos_omega_dl );
-                                           rvec_OuterProduct( temp_rtensor,
-                                           temp_rvec, system->atoms[l].x );
-                                           rtensor_Copy( total_rtensor, temp_rtensor );
-
-                                           if( pbond_ij->imaginary || pbond_jk->imaginary ||
-                                           pbond_kl->imaginary )
-                                           rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor );
-                                           else
-                                           rtensor_Add( data->flex_bar.P, total_rtensor ); */
-                                    }
+                                            rvec_ScaledAdd( *f_j, CEtors7 + CEconj4, p_ijk->dcos_dj );
+
+                                            rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
+                                            rvec_Add( *f_k, force );
+                                            rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+#ifdef _OPENMP
+                                            #pragma omp critical (Four_Body_Interactions_ext_press)
+#endif
+                                            {
+                                                rvec_Add( data->ext_press, ext_press );
+                                            }
+
+                                            /* dcos_theta_jkl */
+                                            rvec_ScaledAdd( *f_j, CEtors8 + CEconj5, p_jkl->dcos_di );
+
+                                            rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                            rvec_Add( *f_k, force );
+                                            rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+#ifdef _OPENMP
+                                            #pragma omp critical (Four_Body_Interactions_ext_press)
+#endif
+                                            {
+                                                rvec_Add( data->ext_press, ext_press );
+                                            }
+
+                                            rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
+                                            rvec_Add( *f_l, force );
+                                            rvec_iMultiply( ext_press, rel_box_jl, force );
+#ifdef _OPENMP
+                                            #pragma omp critical (Four_Body_Interactions_ext_press)
+#endif
+                                            {
+                                                rvec_Add( data->ext_press, ext_press );
+                                            }
+
+                                            /* dcos_omega */
+                                            rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
+                                            rvec_Add( *f_i, force );
+                                            rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+#ifdef _OPENMP
+                                            #pragma omp critical (Four_Body_Interactions_ext_press)
+#endif
+                                            {
+                                                rvec_Add( data->ext_press, ext_press );
+                                            }
+
+                                            rvec_ScaledAdd( *f_j, CEtors9 + CEconj6, dcos_omega_dj );
+
+                                            rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
+                                            rvec_Add( *f_k, force );
+                                            rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+#ifdef _OPENMP
+                                            #pragma omp critical (Four_Body_Interactions_ext_press)
+#endif
+                                            {
+                                                rvec_Add( data->ext_press, ext_press );
+                                            }
+
+                                            rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
+                                            rvec_Add( *f_l, force );
+                                            rvec_iMultiply( ext_press, rel_box_jl, force );
+#ifdef _OPENMP
+                                            #pragma omp critical (Four_Body_Interactions_ext_press)
+#endif
+                                            {
+                                                rvec_Add( data->ext_press, ext_press );
+                                            }
+
+                                            /* This part is intended for a fully-flexible box */
+                                            /* rvec_ScaledSum( temp_rvec,
+                                               CEtors7 + CEconj4, p_ijk->dcos_dk,      // i
+                                               CEtors9 + CEconj6, dcos_omega_di );
+                                               rvec_OuterProduct( temp_rtensor,
+                                               temp_rvec, system->atoms[i].x );
+                                               rtensor_Copy( total_rtensor, temp_rtensor );
+
+                                               rvec_ScaledSum( temp_rvec,
+                                               CEtors7 + CEconj4, p_ijk->dcos_dj,      // j
+                                               CEtors8 + CEconj5, p_jkl->dcos_di );
+                                               rvec_ScaledAdd( temp_rvec,
+                                               CEtors9 + CEconj6, dcos_omega_dj );
+                                               rvec_OuterProduct( temp_rtensor,
+                                               temp_rvec, system->atoms[j].x );
+                                               rtensor_Add( total_rtensor, temp_rtensor );
+
+                                               rvec_ScaledSum( temp_rvec,
+                                               CEtors7 + CEconj4, p_ijk->dcos_di,      // k
+                                               CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                               rvec_ScaledAdd( temp_rvec,
+                                               CEtors9 + CEconj6, dcos_omega_dk );
+                                               rvec_OuterProduct( temp_rtensor,
+                                               temp_rvec, system->atoms[k].x );
+                                               rtensor_Add( total_rtensor, temp_rtensor );
+
+                                               rvec_ScaledSum( temp_rvec,
+                                               CEtors8 + CEconj5, p_jkl->dcos_dk,      // l
+                                               CEtors9 + CEconj6, dcos_omega_dl );
+                                               rvec_OuterProduct( temp_rtensor,
+                                               temp_rvec, system->atoms[l].x );
+                                               rtensor_Copy( total_rtensor, temp_rtensor );
+
+                                               if( pbond_ij->imaginary || pbond_jk->imaginary ||
+                                               pbond_kl->imaginary )
+                                               rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor );
+                                               else
+                                               rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                                        }
 
 #ifdef TEST_ENERGY
-                                    /*fprintf( out_control->etor,
-                                       //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                       //r_ij, r_jk, r_kl,
-                                       "%12.8f%12.8f%12.8f%12.8f\n",
-                                       cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/
-                                    // fprintf( out_control->etor, "%12.8f\n", dfn11 );
-                                    fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n",
-                                             fn10, cos_omega, CV );
-
-                                    fprintf( out_control->etor,
-                                             "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                             CEtors2, CEtors3, CEtors4, CEtors5,
-                                             CEtors6, CEtors7, CEtors8, CEtors9 );
-
-                                    /* fprintf( out_control->etor,
-                                       "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                       htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */
-
-                                    fprintf( out_control->etor,
-                                             "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                             CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 );
-                                    /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n",
-                                       fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/
-
-                                    fprintf( out_control->etor,
-                                             //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n",
-                                             "%6d%6d%6d%6d%12.8f%12.8f\n",
-                                             workspace->orig_id[i], workspace->orig_id[j],
-                                             workspace->orig_id[k], workspace->orig_id[l],
-                                             e_tor, e_con );
-                                    //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor );
-
-                                    fprintf( out_control->econ,
-                                             "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                             workspace->orig_id[i], workspace->orig_id[j],
-                                             workspace->orig_id[k], workspace->orig_id[l],
-                                             RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl,
-                                             e_con, data->E_Con );
-
-                                    /* fprintf( out_control->etor,
-                                       "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[0],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[1],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[2],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[0],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[1],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[2],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[0],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[1],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */
-
-
-                                    /* fprintf( out_control->etor,
-                                       "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[0],
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[1],
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[2],
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[0],
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[1],
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[2],
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[0],
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[1],
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */
-
-                                    fprintf( out_control->etor,
-                                             "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-                                             dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2],
-                                             dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2],
-                                             dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2],
-                                             dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] );
+                                        /*fprintf( out_control->etor,
+                                           //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                           //r_ij, r_jk, r_kl,
+                                           "%12.8f%12.8f%12.8f%12.8f\n",
+                                           cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/
+                                        // fprintf( out_control->etor, "%12.8f\n", dfn11 );
+                                        fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n",
+                                                 fn10, cos_omega, CV );
+
+                                        fprintf( out_control->etor,
+                                                 "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                                 CEtors2, CEtors3, CEtors4, CEtors5,
+                                                 CEtors6, CEtors7, CEtors8, CEtors9 );
+
+                                        /* fprintf( out_control->etor,
+                                           "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                           htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */
+
+                                        fprintf( out_control->etor,
+                                                 "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                                 CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 );
+                                        /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n",
+                                           fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/
+
+                                        fprintf( out_control->etor,
+                                                 //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n",
+                                                 "%6d%6d%6d%6d%12.8f%12.8f\n",
+                                                 workspace->orig_id[i], workspace->orig_id[j],
+                                                 workspace->orig_id[k], workspace->orig_id[l],
+                                                 e_tor, e_con );
+                                        //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor );
+
+                                        fprintf( out_control->econ,
+                                                 "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                                 workspace->orig_id[i], workspace->orig_id[j],
+                                                 workspace->orig_id[k], workspace->orig_id[l],
+                                                 RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl,
+                                                 e_con, data->E_Con );
+
+                                        /* fprintf( out_control->etor,
+                                           "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                                           (CEtors7 + CEconj4)*p_ijk->dcos_dk[0],
+                                           (CEtors7 + CEconj4)*p_ijk->dcos_dk[1],
+                                           (CEtors7 + CEconj4)*p_ijk->dcos_dk[2],
+                                           (CEtors7 + CEconj4)*p_ijk->dcos_dj[0],
+                                           (CEtors7 + CEconj4)*p_ijk->dcos_dj[1],
+                                           (CEtors7 + CEconj4)*p_ijk->dcos_dj[2],
+                                           (CEtors7 + CEconj4)*p_ijk->dcos_di[0],
+                                           (CEtors7 + CEconj4)*p_ijk->dcos_di[1],
+                                           (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */
+
+
+                                        /* fprintf( out_control->etor,
+                                           "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                                           (CEtors8 + CEconj5)*p_jkl->dcos_di[0],
+                                           (CEtors8 + CEconj5)*p_jkl->dcos_di[1],
+                                           (CEtors8 + CEconj5)*p_jkl->dcos_di[2],
+                                           (CEtors8 + CEconj5)*p_jkl->dcos_dj[0],
+                                           (CEtors8 + CEconj5)*p_jkl->dcos_dj[1],
+                                           (CEtors8 + CEconj5)*p_jkl->dcos_dj[2],
+                                           (CEtors8 + CEconj5)*p_jkl->dcos_dk[0],
+                                           (CEtors8 + CEconj5)*p_jkl->dcos_dk[1],
+                                           (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */
+
+                                        fprintf( out_control->etor,
+                                                 "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                                                 dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2],
+                                                 dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2],
+                                                 dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2],
+                                                 dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] );
 #endif
 
 #ifdef TEST_FORCES
-                                    /* Torsion Forces */
-                                    Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0.,
-                                                  workspace->f_tor, workspace->f_tor);
-                                    Add_dDelta( system, lists, j, CEtors3, workspace->f_tor );
-                                    Add_dDelta( system, lists, k, CEtors3, workspace->f_tor );
-                                    Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor );
-                                    Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor );
-                                    Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor );
-
-                                    rvec_ScaledAdd(workspace->f_tor[i], CEtors7, p_ijk->dcos_dk);
-                                    rvec_ScaledAdd(workspace->f_tor[j], CEtors7, p_ijk->dcos_dj);
-                                    rvec_ScaledAdd(workspace->f_tor[k], CEtors7, p_ijk->dcos_di);
-
-                                    rvec_ScaledAdd(workspace->f_tor[j], CEtors8, p_jkl->dcos_di);
-                                    rvec_ScaledAdd(workspace->f_tor[k], CEtors8, p_jkl->dcos_dj);
-                                    rvec_ScaledAdd(workspace->f_tor[l], CEtors8, p_jkl->dcos_dk);
-
-                                    rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di );
-                                    rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj );
-                                    rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk );
-                                    rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl );
-
-                                    /* Conjugation Forces */
-                                    Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con );
-                                    Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con );
-                                    Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con );
-
-                                    rvec_ScaledAdd(workspace->f_con[i], CEconj4, p_ijk->dcos_dk);
-                                    rvec_ScaledAdd(workspace->f_con[j], CEconj4, p_ijk->dcos_dj);
-                                    rvec_ScaledAdd(workspace->f_con[k], CEconj4, p_ijk->dcos_di);
-
-                                    rvec_ScaledAdd(workspace->f_con[j], CEconj5, p_jkl->dcos_di);
-                                    rvec_ScaledAdd(workspace->f_con[k], CEconj5, p_jkl->dcos_dj);
-                                    rvec_ScaledAdd(workspace->f_con[l], CEconj5, p_jkl->dcos_dk);
-
-                                    rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di );
-                                    rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj );
-                                    rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk );
-                                    rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl );
+                                        /* Torsion Forces */
+                                        Add_dBOpinpi2( system, lists, j, pk, CEtors2, 0.,
+                                                workspace->f_tor, workspace->f_tor );
+                                        Add_dDelta( system, lists, j, CEtors3, workspace->f_tor );
+                                        Add_dDelta( system, lists, k, CEtors3, workspace->f_tor );
+                                        Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor );
+                                        Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor );
+                                        Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor );
+
+                                        rvec_ScaledAdd( workspace->f_tor[i], CEtors7, p_ijk->dcos_dk );
+                                        rvec_ScaledAdd( workspace->f_tor[j], CEtors7, p_ijk->dcos_dj );
+                                        rvec_ScaledAdd( workspace->f_tor[k], CEtors7, p_ijk->dcos_di );
+
+                                        rvec_ScaledAdd( workspace->f_tor[j], CEtors8, p_jkl->dcos_di );
+                                        rvec_ScaledAdd( workspace->f_tor[k], CEtors8, p_jkl->dcos_dj );
+                                        rvec_ScaledAdd( workspace->f_tor[l], CEtors8, p_jkl->dcos_dk );
+
+                                        rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di );
+                                        rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj );
+                                        rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk );
+                                        rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl );
+
+                                        /* Conjugation Forces */
+                                        Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con );
+                                        Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con );
+                                        Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con );
+
+                                        rvec_ScaledAdd( workspace->f_con[i], CEconj4, p_ijk->dcos_dk );
+                                        rvec_ScaledAdd( workspace->f_con[j], CEconj4, p_ijk->dcos_dj );
+                                        rvec_ScaledAdd( workspace->f_con[k], CEconj4, p_ijk->dcos_di );
+
+                                        rvec_ScaledAdd( workspace->f_con[j], CEconj5, p_jkl->dcos_di );
+                                        rvec_ScaledAdd( workspace->f_con[k], CEconj5, p_jkl->dcos_dj );
+                                        rvec_ScaledAdd( workspace->f_con[l], CEconj5, p_jkl->dcos_dk );
+
+                                        rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di );
+                                        rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj );
+                                        rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk );
+                                        rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl );
 #endif
-                                } // pl check ends
-                            } // pl loop ends
-                        } // pi check ends
-                    } // pi loop ends
-                } // k-j neighbor check ends
-            } // j<k && j-k neighbor check ends
-        } // pk loop ends
-    } // j loop
+                                    } // pl check ends
+                                } // pl loop ends
+                            } // pi check ends
+                        } // pi loop ends
+                    } // k-j neighbor check ends
+                } // j<k && j-k neighbor check ends
+            } // pk loop ends
+        } // j loop
+    }
+
+     data->E_Tor += e_tor_total;
+     data->E_Con += e_con_total;
 
     /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n",
        data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
diff --git a/sPuReMD/src/geo_tools.c b/sPuReMD/src/geo_tools.c
index 01f404b4f6d51e73722b9f450f9ca8873dc3c9c9..442f1e0c7e09496d69840dba11fdc04f8274346e 100644
--- a/sPuReMD/src/geo_tools.c
+++ b/sPuReMD/src/geo_tools.c
@@ -214,21 +214,21 @@ void Count_PDB_Atoms( FILE *geo, reax_system *system )
 
 
 char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
-               simulation_data *data, static_storage *workspace )
+        simulation_data *data, static_storage *workspace )
 {
 
-    FILE  *pdb;
+    FILE *pdb;
     char **tmp;
-    char  *s, *s1;
-    char   descriptor[9], serial[9];
-    char   atom_name[9], res_name[9], res_seq[9];
-    char   s_x[9], s_y[9], s_z[9];
-    char   occupancy[9], temp_factor[9];
-    char   seg_id[9], element[9], charge[9];
-    char   alt_loc, chain_id, icode;
-    char  *endptr = NULL;
-    int    i, c, c1, pdb_serial, top;
-    rvec   x;
+    char *s, *s1;
+    char descriptor[9], serial[9];
+    char atom_name[9], res_name[9], res_seq[9];
+    char s_x[9], s_y[9], s_z[9];
+    char occupancy[9], temp_factor[9];
+    char seg_id[9], element[9], charge[9];
+    char alt_loc, chain_id, icode;
+    char *endptr = NULL;
+    int i, c, c1, pdb_serial, top;
+    rvec x;
     reax_atom *atom;
 
     /* open pdb file */
@@ -266,8 +266,9 @@ char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "starting to read the pdb file\n" );
 #endif
+
     fseek( pdb, 0, SEEK_SET );
-    c  = 0;
+    c = 0;
     c1 = 0;
     top = 0;
     s[0] = 0;
@@ -381,9 +382,9 @@ char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
             //       system->my_atoms[top].q, occupancy, temp_factor,
             //       seg_id, element );
 
-            //fprintf( stderr, "atom( %8.3f %8.3f %8.3f ) --> p%d\n",
-            // system->my_atoms[top].x[0], system->my_atoms[top].x[1],
-            // system->my_atoms[top].x[2], system->my_rank );
+//            fprintf( stderr, "atom( %8.3f %8.3f %8.3f )\n",
+//                    atom->x[0], atom->x[1],
+//                    atom->x[2] );
 
             c++;
         }
@@ -424,7 +425,9 @@ char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
         /* clear previous input line */
         s[0] = 0;
         for ( i = 0; i < c1; ++i )
+        {
             tmp[i][0] = 0;
+        }
     }
     if ( ferror( pdb ) )
     {
@@ -433,6 +436,8 @@ char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
 
     fclose( pdb );
 
+    Deallocate_Tokenizer_Space( &s, &s1, &tmp );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "finished reading the pdb file\n" );
 #endif
@@ -483,7 +488,7 @@ char Write_PDB( reax_system* system, list* bonds, simulation_data *data,
                   (system->box.box_norms[2] * system->box.box_norms[1]) );
 
     /*open pdb and write header*/
-    sprintf(fname, "%s-%d.pdb", control->sim_name, data->step);
+    sprintf( fname, "%s-%d.pdb", control->sim_name, data->step );
     pdb = fopen(fname, "w");
     fprintf( pdb, PDB_CRYST1_FORMAT_O,
              "CRYST1",
@@ -503,7 +508,11 @@ char Write_PDB( reax_system* system, list* bonds, simulation_data *data,
                  "ATOM  ", workspace->orig_id[i], p_atom->name, ' ', "REX", ' ', 1, ' ',
                  p_atom->x[0], p_atom->x[1], p_atom->x[2],
                  1.0, 0.0, "0", name, "  " );
+
+#if defined(DEBUG)
         fprintf( stderr, "PDB NAME <%s>\n", p_atom->name );
+#endif
+
         strncpy( buffer + i * PDB_ATOM_FORMAT_O_LENGTH, line,
                  PDB_ATOM_FORMAT_O_LENGTH );
     }
@@ -533,8 +542,8 @@ char Write_PDB( reax_system* system, list* bonds, simulation_data *data,
     }
     */
 
-    free(buffer);
-    free(line);
+    free( buffer );
+    free( line );
 
     return SUCCESS;
 }
diff --git a/sPuReMD/src/grid.c b/sPuReMD/src/grid.c
index c45c1a2214d7589b3e4c1a647ebde1b9d3c7e56d..d48644f6628dc29bbf4899d03e8ce994ff7c4c49 100644
--- a/sPuReMD/src/grid.c
+++ b/sPuReMD/src/grid.c
@@ -69,7 +69,8 @@ void Allocate_Space_for_Grid( reax_system *system )
     grid *g;
 
     g = &(system->g);
-    g->max_nbrs = (2 * g->spread[0] + 1) * (2 * g->spread[1] + 1) * (2 * g->spread[2] + 1) + 3;
+    g->max_nbrs = (2 * g->spread[0] + 1)
+        * (2 * g->spread[1] + 1) * (2 * g->spread[2] + 1) + 3;
 
     /* allocate space for the new grid */
     g->atoms = (int****) calloc( g->ncell[0], sizeof( int*** ));
@@ -156,6 +157,8 @@ void Deallocate_Grid_Space( grid *g )
 
             free( g->atoms[i][j] );
             free( g->top[i][j] );
+            free( g->start[i][j] );
+            free( g->end[i][j] );
             free( g->mark[i][j] );
             free( g->nbrs[i][j] );
             free( g->nbrs_cp[i][j] );
@@ -163,6 +166,8 @@ void Deallocate_Grid_Space( grid *g )
 
         free( g->atoms[i] );
         free( g->top[i] );
+        free( g->start[i] );
+        free( g->end[i] );
         free( g->mark[i] );
         free( g->nbrs[i] );
         free( g->nbrs_cp[i] );
@@ -170,6 +175,8 @@ void Deallocate_Grid_Space( grid *g )
 
     free( g->atoms );
     free( g->top );
+    free( g->start );
+    free( g->end );
     free( g->mark );
     free( g->nbrs );
     free( g->nbrs_cp );
@@ -484,6 +491,12 @@ void Bin_Atoms( reax_system* system, static_storage *workspace )
 }
 
 
+void Finalize_Grid( reax_system* system ) /* frees the grid arrays of system->g via Deallocate_Grid_Space */
+{
+    Deallocate_Grid_Space( &( system->g ) ); /* g is a grid member of reax_system, so its address is passed */
+}
+
+
 static inline void reax_atom_Copy( reax_atom *dest, reax_atom *src )
 {
     dest->type = src->type;
@@ -494,13 +507,12 @@ static inline void reax_atom_Copy( reax_atom *dest, reax_atom *src )
 
 
 void Copy_Storage( reax_system *system, static_storage *workspace,
-                   int top, int old_id, int old_type,
-                   int *num_H, real **v, real **s, real **t,
-                   int *orig_id, rvec *f_old )
+        control_params *control, int top, int old_id, int old_type, int *num_H,
+        real **v, real **s, real **t, int *orig_id, rvec *f_old )
 {
     int i;
 
-    for ( i = 0; i < RESTART + 1; ++i )
+    for ( i = 0; i < control->cm_solver_restart + 1; ++i )
     {
         v[i][top] = workspace->v[i][old_id];
     }
@@ -529,11 +541,11 @@ void Copy_Storage( reax_system *system, static_storage *workspace,
 }
 
 
-void Free_Storage( static_storage *workspace )
+void Free_Storage( static_storage *workspace, control_params * control )
 {
     int i;
 
-    for ( i = 0; i < RESTART + 1; ++i )
+    for ( i = 0; i < control->cm_solver_restart + 1; ++i )
     {
         free( workspace->v[i] );
     }
@@ -566,7 +578,8 @@ void Assign_New_Storage( static_storage *workspace,
 }
 
 
-void Cluster_Atoms( reax_system *system, static_storage *workspace )
+void Cluster_Atoms( reax_system *system, static_storage *workspace,
+        control_params *control )
 {
     int         i, j, k, l, top, old_id, num_H;
     reax_atom  *old_atom;
@@ -591,8 +604,8 @@ void Cluster_Atoms( reax_system *system, static_storage *workspace )
         t[i] = (real *) calloc( system->N, sizeof( real ) );
     }
 
-    v = (real**) calloc( RESTART + 1, sizeof( real* ) );
-    for ( i = 0; i < RESTART + 1; ++i )
+    v = (real**) calloc( control->cm_solver_restart + 1, sizeof( real* ) );
+    for ( i = 0; i < control->cm_solver_restart + 1; ++i )
     {
         v[i] = (real *) calloc( system->N, sizeof( real ) );
     }
@@ -614,8 +627,8 @@ void Cluster_Atoms( reax_system *system, static_storage *workspace )
                     // fprintf( stderr, "%d <-- %d\n", top, old_id );
 
                     reax_atom_Copy( &(new_atoms[top]), old_atom );
-                    Copy_Storage( system, workspace, top, old_id, old_atom->type,
-                                  &num_H, v, s, t, orig_id, f_old );
+                    Copy_Storage( system, workspace, control, top, old_id, old_atom->type,
+                            &num_H, v, s, t, orig_id, f_old );
                     ++top;
                 }
 
@@ -626,7 +639,7 @@ void Cluster_Atoms( reax_system *system, static_storage *workspace )
 
 
     free( system->atoms );
-    Free_Storage( workspace );
+    Free_Storage( workspace, control );
 
     system->atoms = new_atoms;
     Assign_New_Storage( workspace, v, s, t, orig_id, f_old );
diff --git a/sPuReMD/src/grid.h b/sPuReMD/src/grid.h
index 41d7b57edb2ba271bd41f71c6a77d5f179370910..6e83b740ed0a7f8035b45ad0423ef62e2ab0c850 100644
--- a/sPuReMD/src/grid.h
+++ b/sPuReMD/src/grid.h
@@ -24,16 +24,20 @@
 
 #include "mytypes.h"
 
+
 void Setup_Grid( reax_system* );
 
 void Update_Grid( reax_system* );
 
+void Finalize_Grid( reax_system* );
+
 int  Shift( int, int, int, grid* );
 
-void Cluster_Atoms( reax_system*, static_storage* );
+void Cluster_Atoms( reax_system *, static_storage *, control_params * );
 
 void Bin_Atoms( reax_system*, static_storage* );
 
 void Reset_Marks( grid*, ivec*, int );
 
+
 #endif
diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c
index af3e234b65c8cd763399646ba014a3bfcf0113f9..261214683414377ee829e33f31c1ad1e2f97eca0 100644
--- a/sPuReMD/src/init_md.c
+++ b/sPuReMD/src/init_md.c
@@ -41,11 +41,13 @@ void Generate_Initial_Velocities( reax_system *system, real T )
     int i;
     real scale, norm;
 
-
     if ( T <= 0.1 )
     {
-        for (i = 0; i < system->N; i++)
+        for ( i = 0; i < system->N; i++ )
+        {
             rvec_MakeZero( system->atoms[i].v );
+        }
+
 #if defined(DEBUG)
         fprintf( stderr, "no random velocities...\n" );
 #endif
@@ -73,13 +75,15 @@ void Generate_Initial_Velocities( reax_system *system, real T )
 
 
 void Init_System( reax_system *system, control_params *control,
-                  simulation_data *data )
+        simulation_data *data )
 {
     int i;
     rvec dx;
 
     if ( !control->restart )
+    {
         Reset_Atoms( system );
+    }
 
     Compute_Total_Mass( system, data );
     Compute_Center_of_Mass( system, data, stderr );
@@ -117,21 +121,26 @@ void Init_System( reax_system *system, control_params *control,
 
     /* Initialize velocities so that desired init T can be attained */
     if ( !control->restart || (control->restart && control->random_vel) )
+    {
         Generate_Initial_Velocities( system, control->T_init );
+    }
 
     Setup_Grid( system );
 }
 
 
 void Init_Simulation_Data( reax_system *system, control_params *control,
-                           simulation_data *data, output_controls *out_control,
-                           evolve_function *Evolve )
+        simulation_data *data, output_controls *out_control,
+        evolve_function *Evolve )
 {
 
     Reset_Simulation_Data( data );
 
     if ( !control->restart )
-        data->step = data->prev_steps = 0;
+    {
+        data->step = 0;
+        data->prev_steps = 0;
+    }
 
     switch ( control->ensemble )
     {
@@ -169,9 +178,9 @@ void Init_Simulation_Data( reax_system *system, control_params *control,
         if ( !control->restart )
         {
             data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin -
-                                                 data->N_f * K_B * control->T );
+                    data->N_f * K_B * control->T);
             data->therm.v_xi = data->therm.G_xi * control->dt;
-            data->iso_bar.eps = 0.33333 * log(system->box.volume);
+            data->iso_bar.eps = 1.0 / 3.0 * LOG( system->box.volume );
             //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P));
             //Compute_Pressure( system, data, workspace );
         }
@@ -209,15 +218,15 @@ void Init_Simulation_Data( reax_system *system, control_params *control,
     data->timing.init_forces = 0;
     data->timing.bonded = 0;
     data->timing.nonb = 0;
-    data->timing.QEq = ZERO;
-    data->timing.QEq_sort_mat_rows = ZERO;
-    data->timing.pre_comp = ZERO;
-    data->timing.pre_app = ZERO;
-    data->timing.solver_iters = 0;
-    data->timing.solver_spmv = ZERO;
-    data->timing.solver_vector_ops = ZERO;
-    data->timing.solver_orthog = ZERO;
-    data->timing.solver_tri_solve = ZERO;
+    data->timing.cm = ZERO;
+    data->timing.cm_sort_mat_rows = ZERO;
+    data->timing.cm_solver_pre_comp = ZERO;
+    data->timing.cm_solver_pre_app = ZERO;
+    data->timing.cm_solver_iters = 0;
+    data->timing.cm_solver_spmv = ZERO;
+    data->timing.cm_solver_vector_ops = ZERO;
+    data->timing.cm_solver_orthog = ZERO;
+    data->timing.cm_solver_tri_solve = ZERO;
 }
 
 
@@ -231,17 +240,20 @@ void Init_Taper( control_params *control )
     swa = control->r_low;
     swb = control->r_cut;
 
-    if ( fabs( swa ) > 0.01 )
+    if ( FABS( swa ) > 0.01 )
+    {
         fprintf( stderr, "Warning: non-zero value for lower Taper-radius cutoff\n" );
+    }
 
-    if ( swb < 0 )
+    if ( swb < 0.0 )
     {
         fprintf( stderr, "Negative value for upper Taper-radius cutoff\n" );
         exit( INVALID_INPUT );
     }
-    else if ( swb < 5 )
-        fprintf( stderr, "Warning: low value for upper Taper-radius cutoff:%f\n",
-                 swb );
+    else if ( swb < 5.0 )
+    {
+        fprintf( stderr, "Warning: low value for upper Taper-radius cutoff:%f\n", swb );
+    }
 
     d1 = swb - swa;
     d7 = POW( d1, 7.0 );
@@ -258,12 +270,12 @@ void Init_Taper( control_params *control )
     control->Tap2 = -210.0 * (swa3 * swb2 + swa2 * swb3) / d7;
     control->Tap1 = 140.0 * swa3 * swb3 / d7;
     control->Tap0 = (-35.0 * swa3 * swb2 * swb2 + 21.0 * swa2 * swb3 * swb2 +
-                     7.0 * swa * swb3 * swb3 + swb3 * swb3 * swb ) / d7;
+            7.0 * swa * swb3 * swb3 + swb3 * swb3 * swb ) / d7;
 }
 
 
 void Init_Workspace( reax_system *system, control_params *control,
-                     static_storage *workspace )
+        static_storage *workspace )
 {
     int i;
 
@@ -289,70 +301,159 @@ void Init_Workspace( reax_system *system, control_params *control,
     workspace->CdDelta          = (real *) malloc( system->N * sizeof( real ) );
     workspace->vlpex        = (real *) malloc( system->N * sizeof( real ) );
 
-    /* QEq storage */
-    workspace->H        = NULL;
-    workspace->H_sp     = NULL;
-    workspace->L        = NULL;
-    workspace->U        = NULL;
+    /* charge method storage */
+    switch ( control->charge_method )
+    {
+        case QEQ_CM:
+            system->N_cm = system->N;
+            break;
+        case EE_CM:
+            system->N_cm = system->N + 1;
+            break;
+        case ACKS2_CM:
+            system->N_cm = 2 * system->N + 2;
+            break;
+        default:
+            fprintf( stderr, "Unknown charge method type. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+    }
+
+    workspace->H = NULL;
+    workspace->H_sp = NULL;
+    workspace->L = NULL;
+    workspace->U = NULL;
     workspace->Hdia_inv = NULL;
-    workspace->droptol  = (real *) calloc( system->N, sizeof( real ) );
-    workspace->w        = (real *) calloc( system->N, sizeof( real ) );
-    workspace->b        = (real *) calloc( system->N * 2, sizeof( real ) );
-    workspace->b_s      = (real *) calloc( system->N, sizeof( real ) );
-    workspace->b_t      = (real *) calloc( system->N, sizeof( real ) );
-    workspace->b_prc    = (real *) calloc( system->N * 2, sizeof( real ) );
-    workspace->b_prm    = (real *) calloc( system->N * 2, sizeof( real ) );
-    workspace->s_t      = (real *) calloc( system->N * 2, sizeof( real ) );
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        workspace->droptol  = (real *) calloc( system->N_cm, sizeof( real ) );
+    }
+    //TODO: check if unused
+    //workspace->w        = (real *) calloc( cm_lin_sys_size, sizeof( real ) );
+    //TODO: check if unused
+    workspace->b        = (real *) calloc( system->N_cm * 2, sizeof( real ) );
+    workspace->b_s      = (real *) calloc( system->N_cm, sizeof( real ) );
+    workspace->b_t      = (real *) calloc( system->N_cm, sizeof( real ) );
+    workspace->b_prc    = (real *) calloc( system->N_cm * 2, sizeof( real ) );
+    workspace->b_prm    = (real *) calloc( system->N_cm * 2, sizeof( real ) );
     workspace->s        = (real**) calloc( 5, sizeof( real* ) );
     workspace->t        = (real**) calloc( 5, sizeof( real* ) );
     for ( i = 0; i < 5; ++i )
     {
-        workspace->s[i] = (real *) calloc( system->N, sizeof( real ) );
-        workspace->t[i] = (real *) calloc( system->N, sizeof( real ) );
+        workspace->s[i] = (real *) calloc( system->N_cm, sizeof( real ) );
+        workspace->t[i] = (real *) calloc( system->N_cm, sizeof( real ) );
     }
-    // workspace->s_old    = (real *) calloc( system->N, sizeof( real ) );
-    // workspace->t_old    = (real *) calloc( system->N, sizeof( real ) );
-    // workspace->s_oldest = (real *) calloc( system->N, sizeof( real ) );
-    // workspace->t_oldest = (real *) calloc( system->N, sizeof( real ) );
 
-    for ( i = 0; i < system->N; ++i )
+    switch ( control->charge_method )
     {
-        workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
-        workspace->b_t[i] = -1.0;
-
-        workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
-        workspace->b[i + system->N] = -1.0;
+        case QEQ_CM:
+            for ( i = 0; i < system->N; ++i )
+            {
+                workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
+                workspace->b_t[i] = -1.0;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
+                workspace->b[i + system->N] = -1.0;
+            }
+            break;
+
+        case EE_CM:
+            for ( i = 0; i < system->N; ++i )
+            {
+                workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
+            }
+
+            workspace->b_s[system->N] = control->cm_q_net;
+            workspace->b[system->N] = control->cm_q_net;
+            break;
+
+        case ACKS2_CM:
+            for ( i = 0; i < system->N; ++i )
+            {
+                workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
+            }
+
+            workspace->b_s[system->N] = control->cm_q_net;
+            workspace->b[system->N] = control->cm_q_net;
+
+            for ( i = system->N + 1; i < system->N_cm; ++i )
+            {
+                workspace->b_s[i] = 0.0;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i] = 0.0;
+            }
+            break;
+
+        default:
+            fprintf( stderr, "Unknown charge method type. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
     }
 
-    /* GMRES storage */
-    workspace->y  = (real *)  calloc( RESTART + 1, sizeof( real ) );
-    workspace->z  = (real *)  calloc( RESTART + 1, sizeof( real ) );
-    workspace->g  = (real *)  calloc( RESTART + 1, sizeof( real ) );
-    workspace->h  = (real **) calloc( RESTART + 1, sizeof( real*) );
-    workspace->hs = (real *)  calloc( RESTART + 1, sizeof( real ) );
-    workspace->hc = (real *)  calloc( RESTART + 1, sizeof( real ) );
-    workspace->rn = (real **) calloc( RESTART + 1, sizeof( real*) );
-    workspace->v  = (real **) calloc( RESTART + 1, sizeof( real*) );
-
-    for ( i = 0; i < RESTART + 1; ++i )
+    switch ( control->cm_solver_type )
     {
-        workspace->h[i]  = (real *) calloc( RESTART + 1, sizeof( real ) );
-        workspace->rn[i] = (real *) calloc( system->N * 2, sizeof( real ) );
-        workspace->v[i]  = (real *) calloc( system->N, sizeof( real ) );
+        /* GMRES storage */
+        case GMRES_S:
+        case GMRES_H_S:
+            workspace->y  = (real *)  calloc( control->cm_solver_restart + 1, sizeof( real ) );
+            workspace->z  = (real *)  calloc( control->cm_solver_restart + 1, sizeof( real ) );
+            workspace->g  = (real *)  calloc( control->cm_solver_restart + 1, sizeof( real ) );
+            workspace->h  = (real **) calloc( control->cm_solver_restart + 1, sizeof( real*) );
+            workspace->hs = (real *)  calloc( control->cm_solver_restart + 1, sizeof( real ) );
+            workspace->hc = (real *)  calloc( control->cm_solver_restart + 1, sizeof( real ) );
+            workspace->rn = (real **) calloc( control->cm_solver_restart + 1, sizeof( real*) );
+            workspace->v  = (real **) calloc( control->cm_solver_restart + 1, sizeof( real*) );
+
+            for ( i = 0; i < control->cm_solver_restart + 1; ++i )
+            {
+                workspace->h[i]  = (real *) calloc( control->cm_solver_restart + 1, sizeof( real ) );
+                workspace->rn[i] = (real *) calloc( system->N_cm * 2, sizeof( real ) );
+                workspace->v[i]  = (real *) calloc( system->N_cm, sizeof( real ) );
+            }
+
+            workspace->r = (real *) calloc( system->N_cm, sizeof( real ) );
+            workspace->d = (real *) calloc( system->N_cm, sizeof( real ) );
+            workspace->q = (real *) calloc( system->N_cm, sizeof( real ) );
+            workspace->p = (real *) calloc( system->N_cm, sizeof( real ) );
+            break;
+
+        /* CG storage */
+        case CG_S:
+            workspace->r = (real *) calloc( system->N_cm, sizeof( real ) );
+            workspace->d = (real *) calloc( system->N_cm, sizeof( real ) );
+            workspace->q = (real *) calloc( system->N_cm, sizeof( real ) );
+            workspace->p = (real *) calloc( system->N_cm, sizeof( real ) );
+            break;
+
+        case SDM_S:
+            workspace->r = (real *) calloc( system->N_cm, sizeof( real ) );
+            workspace->d = (real *) calloc( system->N_cm, sizeof( real ) );
+            workspace->q = (real *) calloc( system->N_cm, sizeof( real ) );
+            break;
+
+        default:
+            fprintf( stderr, "Unknown charge method linear solver type. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
     }
 
-    /* CG storage */
-    workspace->r = (real *) calloc( system->N, sizeof( real ) );
-    workspace->d = (real *) calloc( system->N, sizeof( real ) );
-    workspace->q = (real *) calloc( system->N, sizeof( real ) );
-    workspace->p = (real *) calloc( system->N, sizeof( real ) );
-
-
     /* integrator storage */
     workspace->a = (rvec *) malloc( system->N * sizeof( rvec ) );
     workspace->f_old = (rvec *) malloc( system->N * sizeof( rvec ) );
     workspace->v_const = (rvec *) malloc( system->N * sizeof( rvec ) );
 
+#ifdef _OPENMP
+    workspace->f_local = (rvec *) malloc( control->num_threads * system->N * sizeof( rvec ) );
+#endif
 
     /* storage for analysis */
     if ( control->molec_anal || control->diffusion_coef )
@@ -361,12 +462,18 @@ void Init_Workspace( reax_system *system, control_params *control,
         workspace->old_mark = (int *) calloc( system->N, sizeof(int) );
     }
     else
+    {
         workspace->mark = workspace->old_mark = NULL;
+    }
 
     if ( control->diffusion_coef )
+    {
         workspace->x_old = (rvec *) calloc( system->N, sizeof( rvec ) );
-    else workspace->x_old = NULL;
-
+    }
+    else
+    {
+        workspace->x_old = NULL;
+    }
 
 #ifdef TEST_FORCES
     workspace->dDelta = (rvec *) malloc( system->N * sizeof( rvec ) );
@@ -400,18 +507,23 @@ void Init_Workspace( reax_system *system, control_params *control,
 
 
 void Init_Lists( reax_system *system, control_params *control,
-                 simulation_data *data, static_storage *workspace,
-                 list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
-    int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop;
+    int i, num_nbrs, num_bonds, num_3body, Htop, max_nnz;
     int *hb_top, *bond_top;
+#if defined(DEBUG_FOCUS)
+    int num_hbonds;
+#endif
 
     num_nbrs = Estimate_NumNeighbors( system, control, workspace, lists );
+
     if ( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists) + FAR_NBRS) )
     {
         fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
         exit( CANNOT_INITIALIZE );
     }
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n",
              num_nbrs * sizeof(far_neighbor_data) / (1024 * 1024) );
@@ -422,10 +534,26 @@ void Init_Lists( reax_system *system, control_params *control,
     hb_top = (int*) calloc( system->N, sizeof(int) );
     bond_top = (int*) calloc( system->N, sizeof(int) );
     num_3body = 0;
-    Estimate_Storage_Sizes( system, control, lists,
-                            &Htop, hb_top, bond_top, &num_3body );
+    Estimate_Storage_Sizes( system, control, lists, &Htop,
+            hb_top, bond_top, &num_3body );
+
+    switch ( control->charge_method )
+    {
+        case QEQ_CM:
+            max_nnz = Htop;
+            break;
+        case EE_CM:
+            max_nnz = Htop + system->N_cm;
+            break;
+        case ACKS2_CM:
+            max_nnz = 2 * Htop + 3 * system->N + 2;
+            break;
+        default:
+            max_nnz = Htop;
+            break;
+    }
 
-    if ( Allocate_Matrix( &(workspace->H), system->N, Htop ) == FAILURE )
+    if ( Allocate_Matrix( &(workspace->H), system->N_cm, max_nnz ) == FAILURE )
     {
         fprintf( stderr, "Not enough space for init matrices. Terminating...\n" );
         exit( INSUFFICIENT_MEMORY );
@@ -434,15 +562,16 @@ void Init_Lists( reax_system *system, control_params *control,
      *   If so, need to refactor Estimate_Storage_Sizes
      *   to use various cut-off distances as parameters
      *   (non-bonded, hydrogen, 3body, etc.) */
-    if ( Allocate_Matrix( &(workspace->H_sp), system->N, Htop ) == FAILURE )
+    if ( Allocate_Matrix( &(workspace->H_sp), system->N_cm, max_nnz ) == FAILURE )
     {
         fprintf( stderr, "Not enough space for init matrices. Terminating...\n" );
         exit( INSUFFICIENT_MEMORY );
     }
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "estimated storage - Htop: %d\n", Htop );
     fprintf( stderr, "memory allocated: H = %ldMB\n",
-             Htop * sizeof(sparse_matrix_entry) / (1024 * 1024) );
+            Htop * sizeof(sparse_matrix_entry) / (1024 * 1024) );
 #endif
 
     workspace->num_H = 0;
@@ -450,14 +579,23 @@ void Init_Lists( reax_system *system, control_params *control,
     {
         /* init H indexes */
         for ( i = 0; i < system->N; ++i )
-            if ( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ) // H atom
+        {
+            // H atom
+            if ( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 )
+            {
                 workspace->hbond_index[i] = workspace->num_H++;
-            else workspace->hbond_index[i] = -1;
+            }
+            else
+            {
+                workspace->hbond_index[i] = -1;
+            }
+        }
 
         Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index,
-                             hb_top, (*lists) + HBONDS );
-        num_hbonds = hb_top[system->N - 1];
+                hb_top, (*lists) + HBONDS );
+
 #if defined(DEBUG_FOCUS)
+        num_hbonds = hb_top[system->N - 1];
         fprintf( stderr, "estimated storage - num_hbonds: %d\n", num_hbonds );
         fprintf( stderr, "memory allocated: hbonds = %ldMB\n",
                  num_hbonds * sizeof(hbond_data) / (1024 * 1024) );
@@ -467,27 +605,26 @@ void Init_Lists( reax_system *system, control_params *control,
     /* bonds list */
     Allocate_Bond_List( system->N, bond_top, (*lists) + BONDS );
     num_bonds = bond_top[system->N - 1];
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "estimated storage - num_bonds: %d\n", num_bonds );
     fprintf( stderr, "memory allocated: bonds = %ldMB\n",
              num_bonds * sizeof(bond_data) / (1024 * 1024) );
 #endif
 
-//fprintf (stderr, " **** sizeof 3 body : %d \n", sizeof (three_body_interaction_data));
-//fprintf (stderr, " **** num_3body : %d \n", num_3body);
-//fprintf (stderr, " **** num_bonds : %d \n", num_bonds);
-
     /* 3bodies list */
     if (!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists) + THREE_BODIES))
     {
         fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
         exit( CANNOT_INITIALIZE );
     }
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "estimated storage - num_3body: %d\n", num_3body );
     fprintf( stderr, "memory allocated: 3-body = %ldMB\n",
              num_3body * sizeof(three_body_interaction_data) / (1024 * 1024) );
 #endif
+
 #ifdef TEST_FORCES
     if (!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA ))
     {
@@ -507,8 +644,8 @@ void Init_Lists( reax_system *system, control_params *control,
 }
 
 
-void Init_Out_Controls(reax_system *system, control_params *control,
-                       static_storage *workspace, output_controls *out_control)
+void Init_Out_Controls( reax_system *system, control_params *control,
+        static_storage *workspace, output_controls *out_control )
 {
     char temp[1000];
 
@@ -548,7 +685,7 @@ void Init_Out_Controls(reax_system *system, control_params *control,
         out_control->log = fopen( temp, "w" );
         fprintf( out_control->log, "%-6s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
                  "step", "total", "neighbors", "init", "bonded",
-                 "nonbonded", "QEq", "QEq Sort", "S iters", "Pre Comp", "Pre App",
+                 "nonbonded", "CM", "CM Sort", "S iters", "Pre Comp", "Pre App",
                  "S spmv", "S vec ops", "S orthog", "S tsolve" );
     }
 
@@ -733,12 +870,23 @@ void Init_Out_Controls(reax_system *system, control_params *control,
 }
 
 
-void Initialize(reax_system *system, control_params *control,
-                simulation_data *data, static_storage *workspace, list **lists,
-                output_controls *out_control, evolve_function *Evolve)
+void Initialize( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control, evolve_function *Evolve )
 {
+#if defined(DEBUG)
     real start, end;
-    Randomize();
+#endif
+
+#ifdef _OPENMP
+    #pragma omp parallel default(shared)
+    {
+        #pragma omp single
+        control->num_threads = omp_get_num_threads( );
+    }
+#endif
+
+    Randomize( );
 
     Init_System( system, control, data );
 
@@ -752,20 +900,399 @@ void Initialize(reax_system *system, control_params *control,
 
     /* These are done in forces.c, only forces.c can see all those functions */
     Init_Bonded_Force_Functions( control );
+
 #ifdef TEST_FORCES
     Init_Force_Test_Functions( );
 #endif
 
     if ( control->tabulate )
     {
-        start = Get_Time ();
+#if defined(DEBUG)
+        start = Get_Time( );
+#endif
+
         Make_LR_Lookup_Table( system, control );
-        end = Get_Timing_Info (start);
 
-        //fprintf (stderr, "Time for LR Lookup Table calculation is %f \n", end );
+#if defined(DEBUG)
+        end = Get_Timing_Info( start );
+
+        fprintf( stderr, "Time for LR Lookup Table calculation is %f \n", end );
+#endif
     }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "data structures have been initialized...\n" );
 #endif
 }
+
+/* Frees the grid, the reaxprm tables (gp.l, sbp, tbp, thbp, hbp, fbp), and the atom array. */
+void Finalize_System( reax_system *system, control_params *control,
+        simulation_data *data )
+{
+    int t1, t2, t3;
+    reax_interaction * const ffield = &system->reaxprm;
+
+    Finalize_Grid( system );
+
+    /* gp.l array */
+    free( ffield->gp.l );
+
+    /* nested tables: release the innermost rows first, then their parents */
+    for ( t1 = 0; t1 < ffield->num_atom_types; ++t1 )
+    {
+        for ( t2 = 0; t2 < ffield->num_atom_types; ++t2 )
+        {
+            for ( t3 = 0; t3 < ffield->num_atom_types; ++t3 )
+            {
+                free( ffield->fbp[t1][t2][t3] );
+            }
+
+            free( ffield->thbp[t1][t2] );
+            free( ffield->hbp[t1][t2] );
+            free( ffield->fbp[t1][t2] );
+        }
+
+        free( ffield->tbp[t1] );
+        free( ffield->thbp[t1] );
+        free( ffield->hbp[t1] );
+        free( ffield->fbp[t1] );
+    }
+
+    /* top-level table pointers */
+    free( ffield->sbp );
+    free( ffield->tbp );
+    free( ffield->thbp );
+    free( ffield->hbp );
+    free( ffield->fbp );
+    free( system->atoms );
+}
+
+/* Step in the Finalize sequence for simulation data; the body is empty, so nothing is released here. */
+void Finalize_Simulation_Data( reax_system *system, control_params *control,
+        simulation_data *data, output_controls *out_control )
+{
+}
+
+/* Releases the heap storage referenced by the workspace.
+ *
+ * Frees, in order: the work arrays from hbond_index through vlpex, the
+ * matrices H and H_sp plus the L/U preconditioner factors, the charge-method
+ * vectors, the linear-solver storage selected by control->cm_solver_type,
+ * the integrator arrays, and the analysis and bond-restriction buffers.
+ *
+ * system: supplies N, the number of rows in workspace->restricted_list
+ * control: selects which optional buffers are released (preconditioner
+ *   type, solver type and restart length, bond restrictions)
+ * workspace: owner of the buffers; the struct itself is not freed and its
+ *   pointers are not reset, so it must not be finalized twice
+ *
+ * Terminates the program with INVALID_INPUT when control->cm_solver_type
+ * is not GMRES_S, GMRES_H_S, CG_S, or SDM_S. */
+void Finalize_Workspace( reax_system *system, control_params *control,
+        static_storage *workspace )
+{
+    int i;
+
+    free( workspace->hbond_index );
+    free( workspace->total_bond_order );
+    free( workspace->Deltap );
+    free( workspace->Deltap_boc );
+    free( workspace->dDeltap_self );
+    free( workspace->Delta );
+    free( workspace->Delta_lp );
+    free( workspace->Delta_lp_temp );
+    free( workspace->dDelta_lp );
+    free( workspace->dDelta_lp_temp );
+    free( workspace->Delta_e );
+    free( workspace->Delta_boc );
+    free( workspace->nlp );
+    free( workspace->nlp_temp );
+    free( workspace->Clp );
+    free( workspace->CdDelta );
+    free( workspace->vlpex );
+
+    Deallocate_Matrix( workspace->H );
+    Deallocate_Matrix( workspace->H_sp );
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILU_PAR_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        Deallocate_Matrix( workspace->L );
+        Deallocate_Matrix( workspace->U );
+    }
+
+    for ( i = 0; i < 5; ++i ) /* NOTE(review): 5 presumably matches the s/t row count allocated in Init_Workspace; confirm */
+    {
+        free( workspace->s[i] );
+        free( workspace->t[i] );
+    }
+
+    free( workspace->Hdia_inv );
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        free( workspace->droptol );
+    }
+    /* TODO: check if workspace->w is unused; it is not released here */
+    /* TODO: check if workspace->b is unused (Init_Workspace flags it as
+     * possibly redundant) */
+    free( workspace->b );
+    free( workspace->b_s );
+    free( workspace->b_t );
+    free( workspace->b_prc );
+    free( workspace->b_prm );
+    free( workspace->s );
+    free( workspace->t );
+
+    switch ( control->cm_solver_type )
+    {
+        /* GMRES storage */
+        case GMRES_S:
+        case GMRES_H_S:
+            for ( i = 0; i < control->cm_solver_restart + 1; ++i )
+            {
+                free( workspace->h[i] );
+                free( workspace->rn[i] );
+                free( workspace->v[i] );
+            }
+
+            free( workspace->y );
+            free( workspace->z );
+            free( workspace->g );
+            free( workspace->h );
+            free( workspace->hs );
+            free( workspace->hc );
+            free( workspace->rn );
+            free( workspace->v );
+
+            free( workspace->r );
+            free( workspace->d );
+            free( workspace->q );
+            free( workspace->p );
+            break;
+
+        /* CG storage */
+        case CG_S:
+            free( workspace->r );
+            free( workspace->d );
+            free( workspace->q );
+            free( workspace->p );
+            break;
+
+        case SDM_S:
+            free( workspace->r );
+            free( workspace->d );
+            free( workspace->q );
+            break;
+
+        default:
+            fprintf( stderr, "Unknown charge method linear solver type. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+    }
+
+    /* integrator storage */
+    free( workspace->a );
+    free( workspace->f_old );
+    free( workspace->v_const );
+
+#ifdef _OPENMP
+    free( workspace->f_local );
+#endif
+
+    /* storage for analysis: Init_Workspace sets these to NULL when the
+     * matching analysis is disabled, and free( NULL ) is a no-op */
+    free( workspace->mark );
+    free( workspace->old_mark );
+    free( workspace->x_old );
+
+    free( workspace->orig_id );
+
+    /* space for keeping restriction info, if any */
+    if ( control->restrict_bonds )
+    {
+        for ( i = 0; i < system->N; ++i )
+        {
+            free( workspace->restricted_list[i] );
+        }
+
+        free( workspace->restricted );
+        free( workspace->restricted_list );
+    }
+
+#ifdef TEST_FORCES
+    free( workspace->dDelta );
+    free( workspace->f_ele );
+    free( workspace->f_vdw );
+    free( workspace->f_bo );
+    free( workspace->f_be );
+    free( workspace->f_lp );
+    free( workspace->f_ov );
+    free( workspace->f_un );
+    free( workspace->f_ang );
+    free( workspace->f_coa );
+    free( workspace->f_pen );
+    free( workspace->f_hb );
+    free( workspace->f_tor );
+    free( workspace->f_con );
+#endif
+}
+
+/* Deletes the far-neighbor, hydrogen-bond, bond, and three-body lists (plus DDELTA and DBO under TEST_FORCES). */
+void Finalize_Lists( list **lists )
+{
+    list * const all = *lists;
+    Delete_List( TYP_FAR_NEIGHBOR, &all[FAR_NBRS] );
+    Delete_List( TYP_HBOND, &all[HBONDS] ); /* NOTE(review): deleted unconditionally; confirm Init_Lists always creates it */
+    Delete_List( TYP_BOND, &all[BONDS] );
+    Delete_List( TYP_THREE_BODY, &all[THREE_BODIES] );
+#ifdef TEST_FORCES
+    Delete_List( TYP_DDELTA, &all[DDELTA] );
+    Delete_List( TYP_DBO, &all[DBO] );
+#endif
+}
+
+/* Closes the out_control file streams selected by the run settings and build flags tested below. */
+void Finalize_Out_Controls( reax_system *system, control_params *control,
+        static_storage *workspace, output_controls *out_control )
+{
+    /* close trajectory file */
+    if ( out_control->write_steps > 0 )
+    {
+        fclose( out_control->trj );
+    }
+
+    if ( out_control->energy_update_freq > 0 )
+    {
+        /* close out file */
+        fclose( out_control->out );
+
+        /* close potentials file */
+        fclose( out_control->pot );
+
+        /* close log file */
+        fclose( out_control->log );
+    }
+
+    /* close pressure file */
+    if ( control->ensemble == NPT ||
+            control->ensemble == iNPT ||
+            control->ensemble == sNPT )
+    {
+        fclose( out_control->prs );
+    }
+
+    /* close molecular analysis file */
+    if ( control->molec_anal )
+    {
+        fclose( out_control->mol );
+    }
+
+    /* close electric dipole moment analysis file */
+    if ( control->dipole_anal )
+    {
+        fclose( out_control->dpl );
+    }
+
+    /* close diffusion coef analysis file */
+    if ( control->diffusion_coef )
+    {
+        fclose( out_control->drft );
+    }
+
+    /* NOTE(review): no stream in this function is NULL-checked before fclose; presumably Init_Out_Controls guarantees each was opened -- confirm */
+#ifdef TEST_ENERGY
+    /* close bond energy file */
+    fclose( out_control->ebond );
+
+    /* close lone-pair energy file */
+    fclose( out_control->elp );
+
+    /* close overcoordination energy file */
+    fclose( out_control->eov );
+
+    /* close undercoordination energy file */
+    fclose( out_control->eun );
+
+    /* close angle energy file */
+    fclose( out_control->eval );
+
+    /* close penalty energy file */
+    fclose( out_control->epen );
+
+    /* close coalition energy file */
+    fclose( out_control->ecoa );
+
+    /* close hydrogen bond energy file */
+    fclose( out_control->ehb );
+
+    /* close torsion energy file */
+    fclose( out_control->etor );
+
+    /* close conjugation energy file */
+    fclose( out_control->econ );
+
+    /* close vdWaals energy file */
+    fclose( out_control->evdw );
+
+    /* close coulomb energy file */
+    fclose( out_control->ecou );
+#endif
+
+    /* NOTE(review): the descriptions given for fatom and ftot2 below do not obviously match the stream names -- confirm against Init_Out_Controls */
+#ifdef TEST_FORCES
+    /* close bond orders file */
+    fclose( out_control->fbo );
+
+    /* close bond orders derivatives file */
+    fclose( out_control->fdbo );
+
+    /* close bond forces file */
+    fclose( out_control->fbond );
+
+    /* close lone-pair forces file */
+    fclose( out_control->flp );
+
+    /* close overcoordination forces file */
+    fclose( out_control->fatom );
+
+    /* close angle forces file */
+    fclose( out_control->f3body );
+
+    /* close hydrogen bond forces file */
+    fclose( out_control->fhb );
+
+    /* close torsion forces file */
+    fclose( out_control->f4body );
+
+    /* close nonbonded forces file */
+    fclose( out_control->fnonb );
+
+    /* close total force file */
+    fclose( out_control->ftot );
+
+    /* close coulomb forces file */
+    fclose( out_control->ftot2 );
+#endif
+}
+
+/* Releases what Initialize set up: closes the output files, then tears down
+ * the lists, workspace, simulation data, and system, in that order.
+ *
+ * TODO: the table built by Make_LR_Lookup_Table when control->tabulate is set
+ * is not released by this function; call Finalize_LR_Lookup_Table( system,
+ * control ) before the other steps once that routine is enabled. */
+void Finalize( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
+{
+    Finalize_Out_Controls( system, control, workspace, out_control );
+
+    Finalize_Lists( lists );
+
+    Finalize_Workspace( system, control, workspace );
+
+    Finalize_Simulation_Data( system, control, data, out_control );
+
+    Finalize_System( system, control, data );
+}
diff --git a/sPuReMD/src/init_md.h b/sPuReMD/src/init_md.h
index 3fc59053a9d569623d0004e7b5b9a86a6cbf3cb0..3d31189315ea7f69699067c79882685fbfffdbf1 100644
--- a/sPuReMD/src/init_md.h
+++ b/sPuReMD/src/init_md.h
@@ -24,7 +24,12 @@
 
 #include "mytypes.h"
 
+
 void Initialize( reax_system*, control_params*, simulation_data*,
-                 static_storage*, list**, output_controls*, evolve_function* );
+        static_storage*, list**, output_controls*, evolve_function* );
+
+void Finalize( reax_system*, control_params*, simulation_data*,
+        static_storage*, list**, output_controls* );
+
 
 #endif
diff --git a/sPuReMD/src/integrate.c b/sPuReMD/src/integrate.c
index 142863cad18291a04f4f56fc364c73cf8fdabd3b..7f42a36811bf1ad1eb8ff13adfadb6c3ef675f06 100644
--- a/sPuReMD/src/integrate.c
+++ b/sPuReMD/src/integrate.c
@@ -20,13 +20,14 @@
   ----------------------------------------------------------------------*/
 
 #include "integrate.h"
+
 #include "allocate.h"
 #include "box.h"
 #include "forces.h"
 #include "grid.h"
 #include "neighbors.h"
 #include "print_utils.h"
-#include "QEq.h"
+#include "charges.h"
 #include "reset_utils.h"
 #include "restart.h"
 #include "system_props.h"
@@ -34,10 +35,9 @@
 #include "list.h"
 
 
-
 void Velocity_Verlet_NVE(reax_system* system, control_params* control,
-                         simulation_data *data, static_storage *workspace,
-                         list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
     int i, steps, renbr;
     real inv_m, dt, dt_sqr;
@@ -47,6 +47,7 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control,
     dt_sqr = SQR(dt);
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "step%d: ", data->step );
 #endif
@@ -56,12 +57,13 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control,
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
         rvec_ScaledSum( dx, dt, system->atoms[i].v,
-                        0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
+                0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
         Inc_on_T3( system->atoms[i].x, dx, &( system->box ) );
 
         rvec_ScaledAdd( system->atoms[i].v,
-                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
     }
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet1 - ");
 #endif
@@ -69,16 +71,19 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control,
     Reallocate( system, workspace, lists, renbr );
     Reset( system, control, data, workspace, lists );
     if ( renbr )
+    {
         Generate_Neighbor_Lists( system, control, data, workspace,
-                                 lists, out_control );
+                lists, out_control );
+    }
     Compute_Forces( system, control, data, workspace, lists, out_control );
 
     for ( i = 0; i < system->N; i++ )
     {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
         rvec_ScaledAdd( system->atoms[i].v,
-                        0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
     }
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet2\n");
 #endif
@@ -86,11 +91,8 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control,
 
 
 
-void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
-        control_params* control,
-        simulation_data *data,
-        static_storage *workspace,
-        list **lists,
+void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, control_params* control,
+        simulation_data *data, static_storage *workspace, list **lists,
         output_controls *out_control )
 {
     int i, itr, steps, renbr;
@@ -104,6 +106,7 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     therm = &( data->therm );
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "step%d: ", data->step );
 #endif
@@ -114,7 +117,7 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
         rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, system->atoms[i].v,
-                        0.5 * dt_sqr * inv_m * -F_CONV, system->atoms[i].f );
+                0.5 * dt_sqr * inv_m * -F_CONV, system->atoms[i].f );
 
         Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
 
@@ -122,6 +125,7 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     }
     /* Compute xi(t + dt) */
     therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet1 - " );
 #endif
@@ -129,8 +133,10 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     Reallocate( system, workspace, lists, renbr );
     Reset( system, control, data, workspace, lists );
     if ( renbr )
+    {
         Generate_Neighbor_Lists( system, control, data, workspace,
-                                 lists, out_control );
+                lists, out_control );
+    }
     /* Calculate Forces at time (t + dt) */
     Compute_Forces( system, control, data, workspace, lists, out_control );
 
@@ -184,7 +190,7 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
                  itr, G_xi_new, v_xi_new, v_xi_old );
 #endif
     }
-    while ( fabs(v_xi_new - v_xi_old ) > 1e-5 );
+    while ( FABS(v_xi_new - v_xi_old ) > 1e-5 );
 
     therm->v_xi_old = therm->v_xi;
     therm->v_xi = v_xi_new;
@@ -483,7 +489,7 @@ void Velocity_Verlet_Nose_Hoover_NVT(reax_system* system,
     fprintf(out_control->log, "nbrs-");
     fflush( out_control->log );
 
-    /* QEq( system, control, workspace, lists[FAR_NBRS], out_control );
+    /* Compute_Charges( system, control, workspace, lists[FAR_NBRS], out_control );
        fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */
 
     Compute_Forces( system, control, data, workspace, lists, out_control );
@@ -589,7 +595,7 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system,
     fprintf(out_control->log, "nbrs-");
     fflush( out_control->log );
 
-    /* QEq( system, control, workspace, lists[FAR_NBRS], out_control );
+    /* Compute_Charges( system, control, workspace, lists[FAR_NBRS], out_control );
        fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */
 
     Compute_Forces( system, control, data, workspace, lists, out_control );
@@ -666,7 +672,7 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system,
                  "itr %d E_kin: %8.3f veps_n:%8.3f veps_o:%8.3f vxi_n:%8.3f vxi_o: %8.3f\n",
                  itr, E_kin, v_eps_new, v_eps_old, v_xi_new, v_xi_old );
     }
-    while ( fabs(v_eps_new - v_eps_old) + fabs(v_xi_new - v_xi_old) > 2e-3 );
+    while ( FABS(v_eps_new - v_eps_old) + FABS(v_xi_new - v_xi_old) > 2e-3 );
 
 
     therm->v_xi_old = therm->v_xi;
diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c
index 4ef1ce222a5a423b25bb78baba90ccbb59c839b6..208791b8c297d0083489ca9faf837a746bdc0411 100644
--- a/sPuReMD/src/lin_alg.c
+++ b/sPuReMD/src/lin_alg.c
@@ -28,13 +28,6 @@
 #include "vector.h"
 
 
-typedef enum
-{
-    LOWER = 0,
-    UPPER = 1,
-} TRIANGULARITY;
-
-
 /* global to make OpenMP shared (Sparse_MatVec) */
 #ifdef _OPENMP
 real *b_local = NULL;
@@ -76,7 +69,7 @@ real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL;
  *   x: vector
  *   b: vector (result) */
 static void Sparse_MatVec( const sparse_matrix * const A,
-                           const real * const x, real * const b )
+        const real * const x, real * const b )
 {
     int i, j, k, n, si, ei;
     real H;
@@ -154,7 +147,7 @@ static void Sparse_MatVec( const sparse_matrix * const A,
  * A: stored in CSR
  * A_t: stored in CSR
  */
-void Transpose( const sparse_matrix const *A, sparse_matrix const *A_t )
+void Transpose( const sparse_matrix * const A, sparse_matrix const *A_t )
 {
     unsigned int i, j, pj, *A_t_top;
 
@@ -226,10 +219,10 @@ void Transpose_I( sparse_matrix * const A )
  * Hdia_inv: diagonal inverse preconditioner (constructed using H)
  * y: current residual
  * x: preconditioned residual
- * N: length of preconditioner and vectors (# rows in H)
+ * N: dimensions of preconditioner and vectors (# rows in H)
  */
 static void diag_pre_app( const real * const Hdia_inv, const real * const y,
-                          real * const x, const int N )
+        real * const x, const int N )
 {
     unsigned int i;
 
@@ -246,13 +239,14 @@ static void diag_pre_app( const real * const Hdia_inv, const real * const y,
  * LU: lower/upper triangular, stored in CSR
  * y: constants in linear system (RHS)
  * x: solution
+ * N: dimensions of matrix and vectors
  * tri: triangularity of LU (lower/upper)
  *
  * Assumptions:
  *   LU has non-zero diagonals
  *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
-static void tri_solve( const sparse_matrix * const LU, const real * const y,
-                       real * const x, const TRIANGULARITY tri )
+void tri_solve( const sparse_matrix * const LU, const real * const y,
+        real * const x, const int N, const TRIANGULARITY tri )
 {
     int i, pj, j, si, ei;
     real val;
@@ -261,7 +255,7 @@ static void tri_solve( const sparse_matrix * const LU, const real * const y,
     {
         if ( tri == LOWER )
         {
-            for ( i = 0; i < LU->n; ++i )
+            for ( i = 0; i < N; ++i )
             {
                 x[i] = y[i];
                 si = LU->start[i];
@@ -277,7 +271,7 @@ static void tri_solve( const sparse_matrix * const LU, const real * const y,
         }
         else
         {
-            for ( i = LU->n - 1; i >= 0; --i )
+            for ( i = N - 1; i >= 0; --i )
             {
                 x[i] = y[i];
                 si = LU->start[i];
@@ -300,14 +294,16 @@ static void tri_solve( const sparse_matrix * const LU, const real * const y,
  * LU: lower/upper triangular, stored in CSR
  * y: constants in linear system (RHS)
  * x: solution
+ * N: dimensions of matrix and vectors
  * tri: triangularity of LU (lower/upper)
  * find_levels: perform level search if positive, otherwise reuse existing levels
  *
  * Assumptions:
  *   LU has non-zero diagonals
  *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
-static void tri_solve_level_sched( const sparse_matrix * const LU, const real * const y,
-                                   real * const x, const TRIANGULARITY tri, int find_levels )
+void tri_solve_level_sched( const sparse_matrix * const LU,
+        const real * const y, real * const x, const int N,
+        const TRIANGULARITY tri, int find_levels )
 {
     int i, j, pj, local_row, local_level;
 
@@ -330,9 +326,9 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
 
         if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL )
         {
-            if ( (row_levels = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL
-                    || (level_rows = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL
-                    || (level_rows_cnt = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL )
+            if ( (row_levels = (unsigned int*) malloc((size_t)N * sizeof(unsigned int))) == NULL
+                    || (level_rows = (unsigned int*) malloc((size_t)N * sizeof(unsigned int))) == NULL
+                    || (level_rows_cnt = (unsigned int*) malloc((size_t)(N + 1) * sizeof(unsigned int))) == NULL )
             {
                 fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
                 exit( INSUFFICIENT_MEMORY );
@@ -341,7 +337,7 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
 
         if ( top == NULL )
         {
-            if ( (top = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL )
+            if ( (top = (unsigned int*) malloc((size_t)(N + 1) * sizeof(unsigned int))) == NULL )
             {
                 fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
                 exit( INSUFFICIENT_MEMORY );
@@ -351,14 +347,14 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
         /* find levels (row dependencies in substitutions) */
         if ( find_levels == TRUE )
         {
-            memset( row_levels, 0, LU->n * sizeof(unsigned int) );
-            memset( level_rows_cnt, 0, LU->n * sizeof(unsigned int) );
-            memset( top, 0, LU->n * sizeof(unsigned int) );
+            memset( row_levels, 0, N * sizeof(unsigned int) );
+            memset( level_rows_cnt, 0, N * sizeof(unsigned int) );
+            memset( top, 0, N * sizeof(unsigned int) );
             levels = 1;
 
             if ( tri == LOWER )
             {
-                for ( i = 0; i < LU->n; ++i )
+                for ( i = 0; i < N; ++i )
                 {
                     local_level = 1;
                     for ( pj = LU->start[i]; pj < LU->start[i + 1] - 1; ++pj )
@@ -373,12 +369,12 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
 
 //#if defined(DEBUG)
                 fprintf(stderr, "levels(L): %d\n", levels);
-                fprintf(stderr, "NNZ(L): %d\n", LU->start[LU->n]);
+                fprintf(stderr, "NNZ(L): %d\n", LU->start[N]);
 //#endif
             }
             else
             {
-                for ( i = LU->n - 1; i >= 0; --i )
+                for ( i = N - 1; i >= 0; --i )
                 {
                     local_level = 1;
                     for ( pj = LU->start[i] + 1; pj < LU->start[i + 1]; ++pj )
@@ -393,7 +389,7 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
 
 //#if defined(DEBUG)
                 fprintf(stderr, "levels(U): %d\n", levels);
-                fprintf(stderr, "NNZ(U): %d\n", LU->start[LU->n]);
+                fprintf(stderr, "NNZ(U): %d\n", LU->start[N]);
 //#endif
             }
 
@@ -403,7 +399,7 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
                 top[i] = level_rows_cnt[i];
             }
 
-            for ( i = 0; i < LU->n; ++i )
+            for ( i = 0; i < N; ++i )
             {
                 level_rows[top[row_levels[i] - 1]] = i;
                 ++top[row_levels[i] - 1];
@@ -970,7 +966,7 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
  *
  * Note: Newmann series arises from series expansion of the inverse of
  * the coefficient matrix in the triangular system */
-static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
+void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
         const real * const b, real * const x, const TRIANGULARITY tri, const
         unsigned int maxiter )
 {
@@ -1075,164 +1071,167 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
  *   Matrices have non-zero diagonals
  *   Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */
 static void apply_preconditioner( const static_storage * const workspace, const control_params * const control,
-                                  const real * const y, real * const x, const int fresh_pre )
+        const real * const y, real * const x, const int fresh_pre )
 {
     int i, si;
 
-    switch ( control->pre_app_type )
+    /* no preconditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
     {
-    case NONE_PA:
-        break;
-    case TRI_SOLVE_PA:
-        switch ( control->pre_comp_type )
-        {
-        case DIAG_PC:
-            diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
-            break;
-        case ICHOLT_PC:
-        case ILU_PAR_PC:
-        case ILUT_PAR_PC:
-            tri_solve( workspace->L, y, x, LOWER );
-            tri_solve( workspace->U, x, x, UPPER );
-            break;
-        default:
-            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
-            exit( INVALID_INPUT );
-            break;
-        }
-        break;
-    case TRI_SOLVE_LEVEL_SCHED_PA:
-        switch ( control->pre_comp_type )
-        {
-        case DIAG_PC:
-            diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
-            break;
-        case ICHOLT_PC:
-        case ILU_PAR_PC:
-        case ILUT_PAR_PC:
-            tri_solve_level_sched( workspace->L, y, x, LOWER, fresh_pre );
-            tri_solve_level_sched( workspace->U, x, x, UPPER, fresh_pre );
-            break;
-        default:
-            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
-            exit( INVALID_INPUT );
-            break;
-        }
-        break;
-    case TRI_SOLVE_GC_PA:
-        switch ( control->pre_comp_type )
+        Vector_Copy( x, y, workspace->H->n );
+    }
+    else
+    {
+        switch ( control->cm_solver_pre_app_type )
         {
-        case DIAG_PC:
-            fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
-            exit( INVALID_INPUT );
+        case TRI_SOLVE_PA:
+            switch ( control->cm_solver_pre_comp_type )
+            {
+            case DIAG_PC:
+                diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
+                break;
+            case ICHOLT_PC:
+            case ILU_PAR_PC:
+            case ILUT_PAR_PC:
+                tri_solve( workspace->L, y, x, workspace->L->n, LOWER );
+                tri_solve( workspace->U, x, x, workspace->U->n, UPPER );
+                break;
+            default:
+                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                exit( INVALID_INPUT );
+                break;
+            }
             break;
-        case ICHOLT_PC:
-        case ILU_PAR_PC:
-        case ILUT_PAR_PC:
-            #pragma omp master
+        case TRI_SOLVE_LEVEL_SCHED_PA:
+            switch ( control->cm_solver_pre_comp_type )
             {
-                memcpy( y_p, y, sizeof(real) * workspace->H->n );
+            case DIAG_PC:
+                diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
+                break;
+            case ICHOLT_PC:
+            case ILU_PAR_PC:
+            case ILUT_PAR_PC:
+                tri_solve_level_sched( workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
+                tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre );
+                break;
+            default:
+                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                exit( INVALID_INPUT );
+                break;
             }
+            break;
+        case TRI_SOLVE_GC_PA:
+            switch ( control->cm_solver_pre_comp_type )
+            {
+            case DIAG_PC:
+                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                exit( INVALID_INPUT );
+                break;
+            case ICHOLT_PC:
+            case ILU_PAR_PC:
+            case ILUT_PAR_PC:
+                #pragma omp master
+                {
+                    memcpy( y_p, y, sizeof(real) * workspace->H->n );
+                }
 
-            #pragma omp barrier
+                #pragma omp barrier
 
-            permute_vector( y_p, workspace->H->n, FALSE, LOWER );
-            tri_solve_level_sched( workspace->L, y_p, x, LOWER, fresh_pre );
-            tri_solve_level_sched( workspace->U, x, x, UPPER, fresh_pre );
-            permute_vector( x, workspace->H->n, TRUE, UPPER );
-        break;
-        default:
-            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
-            exit( INVALID_INPUT );
+                permute_vector( y_p, workspace->H->n, FALSE, LOWER );
+                tri_solve_level_sched( workspace->L, y_p, x, workspace->L->n, LOWER, fresh_pre );
+                tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre );
+                permute_vector( x, workspace->H->n, TRUE, UPPER );
             break;
-        }
-        break;
-    case JACOBI_ITER_PA:
-        switch ( control->pre_comp_type )
-        {
-        case DIAG_PC:
-            fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
-            exit( INVALID_INPUT );
+            default:
+                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                exit( INVALID_INPUT );
+                break;
+            }
             break;
-        case ICHOLT_PC:
-        case ILU_PAR_PC:
-        case ILUT_PAR_PC:
-            #pragma omp master
+        case JACOBI_ITER_PA:
+            switch ( control->cm_solver_pre_comp_type )
             {
-                if ( Dinv_L == NULL )
+            case DIAG_PC:
+                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                exit( INVALID_INPUT );
+                break;
+            case ICHOLT_PC:
+            case ILU_PAR_PC:
+            case ILUT_PAR_PC:
+                #pragma omp master
                 {
-                    if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL )
+                    if ( Dinv_L == NULL )
                     {
-                        fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-                        exit( INSUFFICIENT_MEMORY );
+                        if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL )
+                        {
+                            fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                            exit( INSUFFICIENT_MEMORY );
+                        }
                     }
                 }
-            }
 
-            #pragma omp barrier
+                #pragma omp barrier
 
-            /* construct D^{-1}_L */
-            if ( fresh_pre == TRUE )
-            {
-                #pragma omp for schedule(static)
-                for ( i = 0; i < workspace->L->n; ++i )
+                /* construct D^{-1}_L */
+                if ( fresh_pre == TRUE )
                 {
-                    si = workspace->L->start[i + 1] - 1;
-                    Dinv_L[i] = 1. / workspace->L->val[si];
+                    #pragma omp for schedule(static)
+                    for ( i = 0; i < workspace->L->n; ++i )
+                    {
+                        si = workspace->L->start[i + 1] - 1;
+                        Dinv_L[i] = 1. / workspace->L->val[si];
+                    }
                 }
-            }
 
-            jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->pre_app_jacobi_iters );
+                jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
 
-            #pragma omp master
-            {
-                if ( Dinv_U == NULL )
+                #pragma omp master
                 {
-                    if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL )
+                    if ( Dinv_U == NULL )
                     {
-                        fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-                        exit( INSUFFICIENT_MEMORY );
+                        if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL )
+                        {
+                            fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                            exit( INSUFFICIENT_MEMORY );
+                        }
                     }
                 }
-            }
 
-            #pragma omp barrier
+                #pragma omp barrier
 
-            /* construct D^{-1}_U */
-            if ( fresh_pre == TRUE )
-            {
-                #pragma omp for schedule(static)
-                for ( i = 0; i < workspace->U->n; ++i )
+                /* construct D^{-1}_U */
+                if ( fresh_pre == TRUE )
                 {
-                    si = workspace->U->start[i];
-                    Dinv_U[i] = 1. / workspace->U->val[si];
+                    #pragma omp for schedule(static)
+                    for ( i = 0; i < workspace->U->n; ++i )
+                    {
+                        si = workspace->U->start[i];
+                        Dinv_U[i] = 1. / workspace->U->val[si];
+                    }
                 }
-            }
 
-            jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->pre_app_jacobi_iters );
+                jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
+                break;
+            default:
+                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                exit( INVALID_INPUT );
+                break;
+            }
             break;
         default:
             fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
             exit( INVALID_INPUT );
             break;
-        }
-        break;
-    default:
-        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
-        exit( INVALID_INPUT );
-        break;
 
+        }
     }
-
-    return;
 }
 
 
 /* generalized minimual residual iterative solver for sparse linear systems */
 int GMRES( const static_storage * const workspace, const control_params * const control,
-           simulation_data * const data, const sparse_matrix * const H,
-           const real * const b, const real tol, real * const x,
-           const FILE * const fout, const int fresh_pre )
+        simulation_data * const data, const sparse_matrix * const H, const real * const b,
+        const real tol, real * const x, const int fresh_pre )
 {
     int i, j, k, itr, N, g_j, g_itr;
     real cc, tmp1, tmp2, temp, ret_temp, bnorm, time_start;
@@ -1249,10 +1248,10 @@ int GMRES( const static_storage * const workspace, const control_params * const
         bnorm = Norm( b, N );
         #pragma omp master
         {
-            data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+            data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
         }
 
-        if ( control->pre_comp_type == DIAG_PC )
+        if ( control->cm_solver_pre_comp_type == DIAG_PC )
         {
             /* apply preconditioner to RHS */
             #pragma omp master
@@ -1262,12 +1261,12 @@ int GMRES( const static_storage * const workspace, const control_params * const
             apply_preconditioner( workspace, control, b, workspace->b_prc, fresh_pre );
             #pragma omp master
             {
-                data->timing.pre_app += Get_Timing_Info( time_start );
+                data->timing.cm_solver_pre_app += Get_Timing_Info( time_start );
             }
         }
 
         /* GMRES outer-loop */
-        for ( itr = 0; itr < MAX_ITR; ++itr )
+        for ( itr = 0; itr < control->cm_solver_max_iters; ++itr )
         {
             /* calculate r0 */
             #pragma omp master
@@ -1277,10 +1276,10 @@ int GMRES( const static_storage * const workspace, const control_params * const
             Sparse_MatVec( H, x, workspace->b_prm );
             #pragma omp master
             {
-                data->timing.solver_spmv += Get_Timing_Info( time_start );
+                data->timing.cm_solver_spmv += Get_Timing_Info( time_start );
             }
 
-            if ( control->pre_comp_type == DIAG_PC )
+            if ( control->cm_solver_pre_comp_type == DIAG_PC )
             {
                 #pragma omp master
                 {
@@ -1289,11 +1288,11 @@ int GMRES( const static_storage * const workspace, const control_params * const
                 apply_preconditioner( workspace, control, workspace->b_prm, workspace->b_prm, FALSE );
                 #pragma omp master
                 {
-                    data->timing.pre_app += Get_Timing_Info( time_start );
+                    data->timing.cm_solver_pre_app += Get_Timing_Info( time_start );
                 }
             }
 
-            if ( control->pre_comp_type == DIAG_PC )
+            if ( control->cm_solver_pre_comp_type == DIAG_PC )
             {
                 #pragma omp master
                 {
@@ -1302,7 +1301,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
                 Vector_Sum( workspace->v[0], 1., workspace->b_prc, -1., workspace->b_prm, N );
                 #pragma omp master
                 {
-                    data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                    data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
                 }
             }
             else
@@ -1314,11 +1313,11 @@ int GMRES( const static_storage * const workspace, const control_params * const
                 Vector_Sum( workspace->v[0], 1., b, -1., workspace->b_prm, N );
                 #pragma omp master
                 {
-                    data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                    data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
                 }
             }
 
-            if ( control->pre_comp_type != DIAG_PC )
+            if ( control->cm_solver_pre_comp_type != DIAG_PC )
             {
                 #pragma omp master
                 {
@@ -1328,7 +1327,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
                         itr == 0 ? fresh_pre : FALSE );
                 #pragma omp master
                 {
-                    data->timing.pre_app += Get_Timing_Info( time_start );
+                    data->timing.cm_solver_pre_app += Get_Timing_Info( time_start );
                 }
             }
 
@@ -1344,11 +1343,11 @@ int GMRES( const static_storage * const workspace, const control_params * const
             Vector_Scale( workspace->v[0], 1. / workspace->g[0], workspace->v[0], N );
             #pragma omp master
             {
-                data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
             }
 
             /* GMRES inner-loop */
-            for ( j = 0; j < RESTART && FABS(workspace->g[j]) / bnorm > tol; j++ )
+            for ( j = 0; j < control->cm_solver_restart && FABS(workspace->g[j]) / bnorm > tol; j++ )
             {
                 /* matvec */
                 #pragma omp master
@@ -1358,7 +1357,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
                 Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] );
                 #pragma omp master
                 {
-                    data->timing.solver_spmv += Get_Timing_Info( time_start );
+                    data->timing.cm_solver_spmv += Get_Timing_Info( time_start );
                 }
 
                 #pragma omp master
@@ -1368,10 +1367,10 @@ int GMRES( const static_storage * const workspace, const control_params * const
                 apply_preconditioner( workspace, control, workspace->v[j + 1], workspace->v[j + 1], FALSE );
                 #pragma omp master
                 {
-                    data->timing.pre_app += Get_Timing_Info( time_start );
+                    data->timing.cm_solver_pre_app += Get_Timing_Info( time_start );
                 }
 
-                if ( control->pre_comp_type == DIAG_PC )
+                if ( control->cm_solver_pre_comp_type == DIAG_PC )
                 {
                     /* apply modified Gram-Schmidt to orthogonalize the new residual */
                     #pragma omp master
@@ -1385,7 +1384,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
                     }
                     #pragma omp master
                     {
-                        data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                        data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
                     }
                 }
                 else
@@ -1412,7 +1411,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
                     }
                     #pragma omp master
                     {
-                        data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                        data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
                     }
                 }
 
@@ -1429,7 +1428,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
                               1. / workspace->h[j + 1][j], workspace->v[j + 1], N );
                 #pragma omp master
                 {
-                    data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                    data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
                 }
 #if defined(DEBUG)
                 fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
@@ -1438,7 +1437,8 @@ int GMRES( const static_storage * const workspace, const control_params * const
                 #pragma omp master
                 {
                     time_start = Get_Time( );
-                    if ( control->pre_comp_type == DIAG_PC )
+                    if ( control->cm_solver_pre_comp_type == NONE_PC ||
+                            control->cm_solver_pre_comp_type == DIAG_PC )
                     {
                         /* Givens rotations on the upper-Hessenberg matrix to make it U */
                         for ( i = 0; i <= j; i++ )
@@ -1487,7 +1487,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
                     tmp2 = -workspace->hs[j] * workspace->g[j];
                     workspace->g[j] = tmp1;
                     workspace->g[j + 1] = tmp2;
-                    data->timing.solver_orthog += Get_Timing_Info( time_start );
+                    data->timing.cm_solver_orthog += Get_Timing_Info( time_start );
                 }
 
                 #pragma omp barrier
@@ -1513,7 +1513,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
 
                     workspace->y[i] = temp / workspace->h[i][i];
                 }
-                data->timing.solver_tri_solve += Get_Timing_Info( time_start );
+                data->timing.cm_solver_tri_solve += Get_Timing_Info( time_start );
 
                 /* update x = x_0 + Vy */
                 time_start = Get_Time( );
@@ -1527,7 +1527,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
             Vector_Add( x, 1., workspace->p, N );
             #pragma omp master
             {
-                data->timing.solver_vector_ops += Get_Timing_Info( time_start );
+                data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
             }
 
             /* stopping condition */
@@ -1544,38 +1544,25 @@ int GMRES( const static_storage * const workspace, const control_params * const
         }
     }
 
-    // Sparse_MatVec( H, x, workspace->b_prm );
-    // for( i = 0; i < N; ++i )
-    // workspace->b_prm[i] *= workspace->Hdia_inv[i];
-    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-    // for( i = 0; i < N; ++i )
-    // fprintf( fout, "%10.5f%15.12f%15.12f\n",
-    // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
-
-    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n",
-    //          itr, j, fabs( workspace->g[j] ) / bnorm );
-    // data->timing.solver_iters += itr * RESTART + j;
-
-    if ( g_itr >= MAX_ITR )
+    if ( g_itr >= control->cm_solver_max_iters )
     {
         fprintf( stderr, "GMRES convergence failed\n" );
-        // return -1;
-        return g_itr * (RESTART + 1) + g_j + 1;
+        return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
     }
 
-    return g_itr * (RESTART + 1) + g_j + 1;
+    return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
 }
 
 
-int GMRES_HouseHolder( const static_storage * const workspace, const control_params * const control,
-                       simulation_data * const data, const sparse_matrix * const H,
-                       const real * const b, real tol, real * const x,
-                       const FILE * const fout, const int fresh_pre )
+int GMRES_HouseHolder( const static_storage * const workspace,
+        const control_params * const control, simulation_data * const data,
+        const sparse_matrix * const H, const real * const b, real tol,
+        real * const x, const int fresh_pre )
 {
     int  i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
-    real v[10000], z[RESTART + 2][10000], w[RESTART + 2];
-    real u[RESTART + 2][10000];
+    real v[10000], z[control->cm_solver_restart + 2][10000], w[control->cm_solver_restart + 2];
+    real u[control->cm_solver_restart + 2][10000];
 
     N = H->n;
     bnorm = Norm( b, N );
@@ -1589,7 +1576,7 @@ int GMRES_HouseHolder( const static_storage * const workspace, const control_par
     // memset( x, 0, sizeof(real) * N );
 
     /* GMRES outer-loop */
-    for ( itr = 0; itr < MAX_ITR; ++itr )
+    for ( itr = 0; itr < control->cm_solver_max_iters; ++itr )
     {
         /* compute z = r0 */
         Sparse_MatVec( H, x, workspace->b_prm );
@@ -1599,7 +1586,7 @@ int GMRES_HouseHolder( const static_storage * const workspace, const control_par
         }
         Vector_Sum( z[0], 1.,  workspace->b_prc, -1., workspace->b_prm, N );
 
-        Vector_MakeZero( w, RESTART + 1 );
+        Vector_MakeZero( w, control->cm_solver_restart + 1 );
         w[0] = Norm( z[0], N );
 
         Vector_Copy( u[0], z[0], N );
@@ -1610,7 +1597,7 @@ int GMRES_HouseHolder( const static_storage * const workspace, const control_par
         // fprintf( stderr, "\n\n%12.6f\n", w[0] );
 
         /* GMRES inner-loop */
-        for ( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ )
+        for ( j = 0; j < control->cm_solver_restart && fabs( w[j] ) / bnorm > tol; j++ )
         {
             /* compute v_j */
             Vector_Scale( z[j], -2 * u[j][j], u[j], N );
@@ -1714,7 +1701,7 @@ int GMRES_HouseHolder( const static_storage * const workspace, const control_par
         }
 
         // fprintf( stderr, "y: " );
-        // for( i = 0; i < RESTART+1; ++i )
+        // for( i = 0; i < control->cm_solver_restart+1; ++i )
         //   fprintf( stderr, "%8.3f ", workspace->y[i] );
 
 
@@ -1742,10 +1729,6 @@ int GMRES_HouseHolder( const static_storage * const workspace, const control_par
             Vector_Add( x, workspace->y[i], z[i], N );
         }
 
-        // fprintf( stderr, "\nx_aft: " );
-        // for( i = 0; i < N; ++i )
-        //   fprintf( stderr, "%6.2f ", x[i] );
-
         /* stopping condition */
         if ( fabs( w[j] ) / bnorm <= tol )
         {
@@ -1753,197 +1736,135 @@ int GMRES_HouseHolder( const static_storage * const workspace, const control_par
         }
     }
 
-    // Sparse_MatVec( H, x, workspace->b_prm );
-    // for( i = 0; i < N; ++i )
-    // workspace->b_prm[i] *= workspace->Hdia_inv[i];
-
-    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-    // for( i = 0; i < N; ++i )
-    // fprintf( fout, "%10.5f%15.12f%15.12f\n",
-    // workspace->b_prc[i], workspace->b_prm[i], x[i] );
-
-    //fprintf( fout,"GMRES outer:%d, inner:%d iters - residual norm: %15.10f\n",
-    //         itr, j, fabs( workspace->g[j] ) / bnorm );
-
-    if ( itr >= MAX_ITR )
+    if ( itr >= control->cm_solver_max_iters )
     {
         fprintf( stderr, "GMRES convergence failed\n" );
-        // return -1;
-        return itr * (RESTART + 1) + j + 1;
+        return itr * (control->cm_solver_restart + 1) + j + 1;
     }
 
-    return itr * (RESTART + 1) + j + 1;
+    return itr * (control->cm_solver_restart + 1) + j + 1;
 }
 
 
-/* Preconditioned Conjugate Gradient */
-int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
-         sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout )
+/* Conjugate Gradient */
+int CG( const static_storage * const workspace, const control_params * const control,
+        const sparse_matrix * const H, const real * const b, const real tol,
+        real * const x, const int fresh_pre )
 {
-    int  i, N;
+    int i, itr, N;
     real tmp, alpha, beta, b_norm, r_norm;
-    real sig0, sig_old, sig_new;
-
-    N = A->n;
-    b_norm = Norm( b, N );
-    //fprintf( stderr, "b_norm: %.15e\n", b_norm );
-
-    Sparse_MatVec( A, x, workspace->q );
-    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-    r_norm = Norm(workspace->r, N);
-    //Print_Soln( workspace, x, q, b, N );
-    //fprintf( stderr, "res: %.15e\n", r_norm );
+    real *d, *r, *p, *z;
+    real sig_old, sig_new;
 
-    tri_solve( L, workspace->r, workspace->d, LOWER );
-    tri_solve( U, workspace->d, workspace->p, UPPER );
-    sig_new = Dot( workspace->r, workspace->p, N );
-    sig0 = sig_new;
+    N = H->n;
+    d = workspace->d;
+    r = workspace->r;
+    p = workspace->q;
+    z = workspace->p;
 
-    for ( i = 0; i < 200 && r_norm / b_norm > tol; ++i )
+    #pragma omp parallel default(none) private(i, tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new) \
+        shared(itr, N, d, r, p, z) firstprivate(workspace, control, H, b, tol, x, fresh_pre) /* const-qualified args must be listed under default(none) for OpenMP >= 4.0 */
     {
-        //for( i = 0; i < 200 && sig_new > SQR(tol) * sig0; ++i ) {
-        Sparse_MatVec( A, workspace->p, workspace->q );
-        tmp = Dot( workspace->q, workspace->p, N );
-        alpha = sig_new / tmp;
-        Vector_Add( x, alpha, workspace->p, N );
-        //fprintf( stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n",
-        //     i+1, Norm(workspace->p,N), Norm(workspace->q,N), tmp );
-
-        Vector_Add( workspace->r, -alpha, workspace->q, N );
-        r_norm = Norm(workspace->r, N);
-        //fprintf( stderr, "res: %.15e\n", r_norm );
-
-        tri_solve( L, workspace->r, workspace->d, LOWER );
-        tri_solve( U, workspace->d, workspace->d, UPPER );
-        sig_old = sig_new;
-        sig_new = Dot( workspace->r, workspace->d, N );
-        beta = sig_new / sig_old;
-        Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, N );
-    }
+        b_norm = Norm( b, N );
 
-    //fprintf( fout, "CG took %d iterations\n", i );
-    if ( i >= 200 )
-    {
-        fprintf( stderr, "CG convergence failed!\n" );
-        return i;
-    }
+        Sparse_MatVec( H, x, d );
+        Vector_Sum( r, 1.0,  b, -1.0, d, N );
+        r_norm = Norm( r, N );
 
-    return i;
-}
+        apply_preconditioner( workspace, control, r, z, fresh_pre );
+        Vector_Copy( p, z, N );
 
+        sig_new = Dot( r, z, N );
 
-/* Conjugate Gradient */
-int CG( static_storage *workspace, sparse_matrix *H,
-        real *b, real tol, real *x, FILE *fout )
-{
-    int  i, j, N;
-    real tmp, alpha, beta, b_norm;
-    real sig_old, sig_new, sig0;
+        for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
+        {
+            Sparse_MatVec( H, p, d );
 
-    N = H->n;
-    b_norm = Norm( b, N );
-    //fprintf( stderr, "b_norm: %10.6f\n", b_norm );
+            tmp = Dot( d, p, N );
+            alpha = sig_new / tmp;
+            Vector_Add( x, alpha, p, N );
 
-    Sparse_MatVec( H, x, workspace->q );
-    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-    for ( j = 0; j < N; ++j )
-    {
-        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
-    }
+            Vector_Add( r, -alpha, d, N );
+            r_norm = Norm( r, N );
 
-    sig_new = Dot( workspace->r, workspace->d, N );
-    sig0 = sig_new;
-    //Print_Soln( workspace, x, q, b, N );
-    //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n",
-    // sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) );
-    //fprintf( stderr, "sig_new: %f\n", sig_new );
+            apply_preconditioner( workspace, control, r, z, FALSE );
 
-    for ( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
-    {
-        //for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) {
-        Sparse_MatVec( H, workspace->d, workspace->q );
-        tmp = Dot( workspace->d, workspace->q, N );
-        //fprintf( stderr, "tmp: %f\n", tmp );
-        alpha = sig_new / tmp;
-        Vector_Add( x, alpha, workspace->d, N );
-        //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
-        //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
-
-        Vector_Add( workspace->r, -alpha, workspace->q, N );
-        for ( j = 0; j < N; ++j )
-        {
-            workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+            sig_old = sig_new;
+            sig_new = Dot( r, z, N );
+
+            beta = sig_new / sig_old;
+            Vector_Sum( p, 1., z, beta, p, N );
         }
 
-        sig_old = sig_new;
-        sig_new = Dot( workspace->r, workspace->p, N );
-        beta = sig_new / sig_old;
-        Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, N );
-        //fprintf( stderr, "sig_new: %f\n", sig_new );
+        #pragma omp single
+        itr = i;
     }
 
-    fprintf( stderr, "CG took %d iterations\n", i );
-
-    if ( i >= 300 )
+    if ( itr >= control->cm_solver_max_iters )
     {
-        fprintf( stderr, "CG convergence failed!\n" );
-        return i;
+        fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", itr );
+        return itr;
     }
 
-    return i;
+    return itr;
 }
 
 
 /* Steepest Descent */
-int SDM( static_storage *workspace, sparse_matrix *H,
-         real *b, real tol, real *x, FILE *fout )
+int SDM( const static_storage * const workspace, const control_params * const control,
+        const sparse_matrix * const H, const real * const b, const real tol,
+        real * const x, const int fresh_pre )
 {
-    int  i, j, N;
-    real tmp, alpha, beta, b_norm;
-    real sig0, sig;
+    int i, itr, N;
+    real tmp, alpha, b_norm;
+    real sig;
 
     N = H->n;
-    b_norm = Norm( b, N );
-    //fprintf( stderr, "b_norm: %10.6f\n", b_norm );
 
-    Sparse_MatVec( H, x, workspace->q );
-    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-    for ( j = 0; j < N; ++j )
+    #pragma omp parallel default(none) private(i, tmp, alpha, b_norm, sig) \
+        shared(itr, N) firstprivate(workspace, control, H, b, tol, x, fresh_pre) /* const-qualified args must be listed under default(none) for OpenMP >= 4.0 */
     {
-        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
-    }
+        b_norm = Norm( b, N );
 
-    sig = Dot( workspace->r, workspace->d, N );
-    sig0 = sig;
+        Sparse_MatVec( H, x, workspace->q );
+        Vector_Sum( workspace->r , 1.0,  b, -1.0, workspace->q, N );
 
-    for ( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i )
-    {
-        Sparse_MatVec( H, workspace->d, workspace->q );
+        apply_preconditioner( workspace, control, workspace->r, workspace->d, fresh_pre );
 
         sig = Dot( workspace->r, workspace->d, N );
-        tmp = Dot( workspace->d, workspace->q, N );
-        alpha = sig / tmp;
 
-        Vector_Add( x, alpha, workspace->d, N );
-        Vector_Add( workspace->r, -alpha, workspace->q, N );
-        for ( j = 0; j < N; ++j )
+        for ( i = 0; i < control->cm_solver_max_iters && SQRT(sig) / b_norm > tol; ++i )
         {
-            workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+            Sparse_MatVec( H, workspace->d, workspace->q );
+
+            sig = Dot( workspace->r, workspace->d, N );
+
+            /* ensure each thread gets a local copy of
+             * the function return value
+             * (which is stored as global inside the function)
+             * before proceeding */
+            #pragma omp barrier
+
+            tmp = Dot( workspace->d, workspace->q, N );
+            alpha = sig / tmp;
+
+            Vector_Add( x, alpha, workspace->d, N );
+            Vector_Add( workspace->r, -alpha, workspace->q, N );
+
+            apply_preconditioner( workspace, control, workspace->r, workspace->d, FALSE );
         }
 
-        //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
-        //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
+        #pragma omp single
+        itr = i;
     }
 
-    fprintf( stderr, "SDM took %d iterations\n", i );
-
-    if ( i >= 300 )
+    if ( itr >= control->cm_solver_max_iters  )
     {
-        fprintf( stderr, "SDM convergence failed!\n" );
-        return i;
+        fprintf( stderr, "[WARNING] SDM convergence failed (%d iters)\n", itr );
+        return itr;
     }
 
-    return i;
+    return itr;
 }
 
 
@@ -1971,8 +1892,8 @@ real condest( const sparse_matrix * const L, const sparse_matrix * const U )
 
     memset( e, 1., N * sizeof(real) );
 
-    tri_solve( L, e, e, LOWER );
-    tri_solve( U, e, e, UPPER );
+    tri_solve( L, e, e, L->n, LOWER );
+    tri_solve( U, e, e, U->n, UPPER );
 
     /* compute 1-norm of vector e */
     c = FABS(e[0]);
diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h
index fe2d644cae630be6414944989b52adae8a6e1d61..a3148514449e17fa972c4fee57be8ea518c3c100 100644
--- a/sPuReMD/src/lin_alg.h
+++ b/sPuReMD/src/lin_alg.h
@@ -19,33 +19,54 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __GMRES_H_
-#define __GMRES_H_
+#ifndef __LIN_ALG_H_
+#define __LIN_ALG_H_
 
 #include "mytypes.h"
 
+typedef enum
+{
+    LOWER = 0,
+    UPPER = 1,
+} TRIANGULARITY;
+
+
+void Transpose( const sparse_matrix * const, sparse_matrix const * );
 
-void Transpose( const sparse_matrix const *, sparse_matrix const * );
 void Transpose_I( sparse_matrix * const );
 
+void tri_solve( const sparse_matrix * const, const real * const,
+        real * const, const int, const TRIANGULARITY );
+
+void tri_solve_level_sched( const sparse_matrix * const,
+        const real * const, real * const, const int,
+        const TRIANGULARITY, int );
+
+void jacobi_iter( const sparse_matrix * const, const real * const,
+        const real * const, real * const, const TRIANGULARITY,
+        const unsigned int );
+
 sparse_matrix * setup_graph_coloring( sparse_matrix * const );
 
 int GMRES( const static_storage * const, const control_params * const,
         simulation_data * const, const sparse_matrix * const,
         const real * const, const real, real * const,
-        const FILE * const, const int );
+        const int );
 
 int GMRES_HouseHolder( const static_storage * const, const control_params * const,
         simulation_data * const, const sparse_matrix * const,
         const real * const, const real, real * const,
-        const FILE * const, const int );
+        const int );
 
-int CG( static_storage*, sparse_matrix*,
-        real*, real, real*, FILE* );
+int CG( const static_storage * const, const control_params * const,
+        const sparse_matrix * const, const real * const, const real,
+        real * const, const int );
 
-int SDM( static_storage*, sparse_matrix*,
-         real*, real, real*, FILE* );
+int SDM( const static_storage * const, const control_params * const,
+        const sparse_matrix * const, const real * const, const real,
+        real * const, const int );
 
 real condest( const sparse_matrix * const, const sparse_matrix * const );
 
+
 #endif
diff --git a/sPuReMD/src/list.c b/sPuReMD/src/list.c
index f9044a6cc9e5ea7b625794c14c079582a4bc742b..e02b22fb1f7391d12ad79df2ec7d05ad4ae03d21 100644
--- a/sPuReMD/src/list.c
+++ b/sPuReMD/src/list.c
@@ -21,7 +21,8 @@
 
 #include "list.h"
 
-char Make_List(int n, int num_intrs, int type, list* l)
+
+char Make_List( int n, int num_intrs, int type, list* l )
 {
     char success = 1;
 
@@ -31,10 +32,16 @@ char Make_List(int n, int num_intrs, int type, list* l)
     l->index = (int*) malloc( n * sizeof(int) );
     l->end_index = (int*) malloc( n * sizeof(int) );
 
-    if (l->index == NULL) success = 0;
-    if (l->end_index == NULL) success = 0;
+    if ( l->index == NULL )
+    {
+        success = 0;
+    }
+    if ( l->end_index == NULL )
+    {
+        success = 0;
+    }
 
-    switch (type)
+    switch ( type )
     {
     case TYP_VOID:
         l->select.v = (void *) malloc(l->num_intrs * sizeof(void));
@@ -86,7 +93,6 @@ char Make_List(int n, int num_intrs, int type, list* l)
     default:
         l->select.v = (void *) malloc(l->num_intrs * sizeof(void));
         if (l->select.v == NULL) success = 0;
-        l->type = TYP_VOID;
         break;
     }
 
@@ -94,46 +100,66 @@ char Make_List(int n, int num_intrs, int type, list* l)
 }
 
 
-void Delete_List(list* l)
+void Delete_List( int type, list* l )
 {
     if ( l->index != NULL )
-        free(l->index);
+    {
+        free( l->index );
+    }
     if ( l->end_index != NULL )
-        free(l->end_index);
+    {
+        free( l->end_index );
+    }
 
-    switch (l->type)
+    switch ( type )
     {
     case TYP_VOID:
         if ( l->select.v != NULL )
-            free(l->select.v);
+        {
+            free( l->select.v );
+        }
         break;
     case TYP_THREE_BODY:
         if ( l->select.three_body_list != NULL )
-            free(l->select.three_body_list);
+        {
+            free( l->select.three_body_list );
+        }
         break;
     case TYP_BOND:
         if ( l->select.bond_list != NULL )
-            free(l->select.bond_list);
+        {
+            free( l->select.bond_list );
+        }
         break;
     case TYP_DBO:
         if ( l->select.dbo_list != NULL )
-            free(l->select.dbo_list);
+        {
+            free( l->select.dbo_list );
+        }
         break;
     case TYP_DDELTA:
         if ( l->select.dDelta_list != NULL )
-            free(l->select.dDelta_list);
+        {
+            free( l->select.dDelta_list );
+        }
         break;
     case TYP_FAR_NEIGHBOR:
         if ( l->select.far_nbr_list != NULL )
-            free(l->select.far_nbr_list);
+        {
+            free( l->select.far_nbr_list );
+        }
         break;
     case TYP_NEAR_NEIGHBOR:
         if ( l->select.near_nbr_list != NULL )
-            free(l->select.near_nbr_list);
+        {
+            free( l->select.near_nbr_list );
+        }
         break;
     case TYP_HBOND:
         if ( l->select.hbond_list != NULL )
-            free(l->select.hbond_list);
+        {
+            free( l->select.hbond_list );
+        }
         break;
 
     default:
@@ -143,27 +169,32 @@ void Delete_List(list* l)
 
 }
 
-inline int Num_Entries(int i, list* l)
+
+inline int Num_Entries( int i, list* l )
 {
     return l->end_index[i] - l->index[i];
 }
 
-inline int Start_Index(int i, list *l )
+
+inline int Start_Index( int i, list *l )
 {
     return l->index[i];
 }
 
+
 inline int End_Index( int i, list *l )
 {
     return l->end_index[i];
 }
 
-inline void Set_Start_Index(int i, int val, list *l)
+
+inline void Set_Start_Index( int i, int val, list *l )
 {
     l->index[i] = val;
 }
 
-inline void Set_End_Index(int i, int val, list *l)
+
+inline void Set_End_Index( int i, int val, list *l )
 {
     l->end_index[i] = val;
 }
diff --git a/sPuReMD/src/list.h b/sPuReMD/src/list.h
index e3ecc584e865c419747886c590e15dc0709feccc..35a4b1debf03a02e3c6812320814a61a7d6db3cc 100644
--- a/sPuReMD/src/list.h
+++ b/sPuReMD/src/list.h
@@ -24,14 +24,20 @@
 
 #include "mytypes.h"
 
+
 char Make_List( int, int, int, list* );
-void Delete_List( list* );
 
-int  Num_Entries(int, list*);
-int  Start_Index( int, list* );
-int  End_Index( int, list* );
+void Delete_List( int, list* );
+
+int Num_Entries( int, list* );
+
+int Start_Index( int, list* );
+
+int End_Index( int, list* );
+
+void Set_Start_Index( int, int, list* );
+
+void Set_End_Index( int, int, list* );
 
-void Set_Start_Index(int, int, list*);
-void Set_End_Index(int, int, list*);
 
 #endif
diff --git a/sPuReMD/src/lookup.c b/sPuReMD/src/lookup.c
index 2ea39e3cd37f045153623bcf2616b56a0a02c526..973ba5faa22c0bd8bc7937416b3bf19beade0a1d 100644
--- a/sPuReMD/src/lookup.c
+++ b/sPuReMD/src/lookup.c
@@ -20,10 +20,12 @@
   ----------------------------------------------------------------------*/
 
 #include "lookup.h"
+
 #include "two_body_interactions.h"
 
-void Make_Lookup_Table(real xmin, real xmax, int n,
-        lookup_function f, lookup_table* t)
+
+void Make_Lookup_Table( real xmin, real xmax, int n,
+        lookup_function f, lookup_table* t )
 {
     int i;
 
@@ -36,7 +38,9 @@ void Make_Lookup_Table(real xmin, real xmax, int n,
     t->y = (real*) malloc(n * sizeof(real));
 
     for (i = 0; i < n; i++)
+    {
         t->y[i] = f(i * t->dx + t->xmin);
+    }
 
     // fprintf(stdout,"dx = %lf\n",t->dx);
     // for(i=0; i < n; i++)
@@ -363,20 +367,20 @@ void Make_LR_Lookup_Table( reax_system *system, control_params *control )
      LR_vdW_Coulomb( system, control, i, j, rand_dist, &y );
      LR_Lookup( &(LR[i][j]), rand_dist, &y_spline );
 
-     evdw_abserr = fabs(y.e_vdW - y_spline.e_vdW);
-     evdw_relerr = fabs(evdw_abserr / y.e_vdW);
-     fvdw_abserr = fabs(y.CEvd - y_spline.CEvd);
-     fvdw_relerr = fabs(fvdw_abserr / y.CEvd);
-     eele_abserr = fabs(y.e_ele - y_spline.e_ele);
-     eele_relerr = fabs(eele_abserr / y.e_ele);
-     fele_abserr = fabs(y.CEclmb - y_spline.CEclmb);
-     fele_relerr = fabs(fele_abserr / y.CEclmb);
+     evdw_abserr = FABS(y.e_vdW - y_spline.e_vdW);
+     evdw_relerr = FABS(evdw_abserr / y.e_vdW);
+     fvdw_abserr = FABS(y.CEvd - y_spline.CEvd);
+     fvdw_relerr = FABS(fvdw_abserr / y.CEvd);
+     eele_abserr = FABS(y.e_ele - y_spline.e_ele);
+     eele_relerr = FABS(eele_abserr / y.e_ele);
+     fele_abserr = FABS(y.CEclmb - y_spline.CEclmb);
+     fele_relerr = FABS(fele_abserr / y.CEclmb);
 
      if( evdw_relerr > 1e-10 || eele_relerr > 1e-10 ){
      fprintf( stderr, "rand_dist = %24.15e\n", rand_dist );
      fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
      y.H, y_spline.H,
-     fabs(y.H-y_spline.H), fabs((y.H-y_spline.H)/y.H) );
+     FABS(y.H-y_spline.H), FABS((y.H-y_spline.H)/y.H) );
 
      fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
      y.e_vdW, y_spline.e_vdW, evdw_abserr, evdw_relerr );
diff --git a/sPuReMD/src/lookup.h b/sPuReMD/src/lookup.h
index a0b9e51620716aeae13ca7743f554f3704c01110..bb1ba46865d369f58839d5a801905a1b5406130f 100644
--- a/sPuReMD/src/lookup.h
+++ b/sPuReMD/src/lookup.h
@@ -24,10 +24,18 @@
 
 #include "mytypes.h"
 
+
+/* Function pointer definitions */
+typedef real (*lookup_function)(real);
+
+
 void Make_Lookup_Table( real, real, int, lookup_function, lookup_table* );
-int  Lookup_Index_Of( real, lookup_table* );
+
+int Lookup_Index_Of( real, lookup_table* );
+
 real Lookup( real, lookup_table* );
 
 void Make_LR_Lookup_Table( reax_system*, control_params* );
 
+
 #endif
diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h
index 69441bb72b7499b663a506dc8c95d40e56473a78..91f411e27e319ca969a064891d281d20e7e26ed9 100644
--- a/sPuReMD/src/mytypes.h
+++ b/sPuReMD/src/mytypes.h
@@ -23,8 +23,8 @@
 #define __MYTYPES_H_
 
 #if (defined(HAVE_CONFIG_H) && !defined(__CONFIG_H_))
-#define __CONFIG_H_
-#include "config.h"
+  #define __CONFIG_H_
+  #include "config.h"
 #endif
 
 #include "math.h"
@@ -37,7 +37,7 @@
 #include "zlib.h"
 
 #ifdef _OPENMP
-#include <omp.h>
+  #include <omp.h>
 #endif
 
 //#define DEBUG_FOCUS
@@ -51,6 +51,7 @@
 #define TRUE  1
 #define FALSE 0
 
+#define LOG    log
 #define EXP    exp
 #define SQRT   sqrt
 #define POW    pow
@@ -58,6 +59,8 @@
 #define COS    cos
 #define SIN    sin
 #define TAN    tan
+#define CEIL   ceil
+#define FLOOR  floor
 #define FABS   fabs
 #define FMOD   fmod
 
@@ -71,10 +74,10 @@
 /* NaN IEEE 754 representation for C99 in math.h
  * Note: function choice must match REAL typedef below */
 #ifdef NAN
-#define IS_NAN_REAL(a) (isnan(a))
+  #define IS_NAN_REAL(a) (isnan(a))
 #else
-#warn "No support for NaN"
-#define NAN_REAL(a) (0)
+  #warning "No support for NaN"
+  #define IS_NAN_REAL(a) (0)
 #endif
 
 #define PI            3.14159265
@@ -113,9 +116,6 @@
 #define MAX_dT              4.00
 #define MIN_dT              0.00
 
-#define MAX_ITR             10
-#define RESTART             50
-
 #define ZERO           0.000000000000000e+00
 #define ALMOST_ZERO    1e-10
 #define NEG_INF       -1e10
@@ -131,27 +131,42 @@
 #define LOOSE_ZONE  0.75
 
 
-typedef double real;
-typedef real rvec[3];
-typedef int  ivec[3];
-typedef real rtensor[3][3];
-
 /* config params */
 enum ensemble
 {
-    NVE = 0, NVT = 1, NPT = 2, sNPT = 3, iNPT = 4, ensNR = 5, bNVT = 6,
+    NVE = 0,
+    NVT = 1,
+    NPT = 2,
+    sNPT = 3,
+    iNPT = 4,
+    ensNR = 5,
+    bNVT = 6,
 };
 
 enum interaction_list_offets
 {
-    FAR_NBRS = 0, NEAR_NBRS = 1, THREE_BODIES = 2, BONDS = 3, OLD_BONDS = 4,
-    HBONDS = 5, DBO = 6, DDELTA = 7, LIST_N = 8,
+    FAR_NBRS = 0,
+    NEAR_NBRS = 1,
+    THREE_BODIES = 2,
+    BONDS = 3,
+    OLD_BONDS = 4,
+    HBONDS = 5,
+    DBO = 6,
+    DDELTA = 7,
+    LIST_N = 8,
 };
 
 enum interaction_type
 {
-    TYP_VOID = 0, TYP_THREE_BODY = 1, TYP_BOND = 2, TYP_HBOND = 3, TYP_DBO = 4,
-    TYP_DDELTA = 5, TYP_FAR_NEIGHBOR = 6, TYP_NEAR_NEIGHBOR = 7, TYP_N = 8,
+    TYP_VOID = 0,
+    TYP_THREE_BODY = 1,
+    TYP_BOND = 2,
+    TYP_HBOND = 3,
+    TYP_DBO = 4,
+    TYP_DDELTA = 5,
+    TYP_FAR_NEIGHBOR = 6,
+    TYP_NEAR_NEIGHBOR = 7,
+    TYP_N = 8,
 };
 
 enum errors
@@ -168,91 +183,112 @@ enum errors
     RUNTIME_ERROR = -19,
 };
 
-enum atoms
-{
-    C_ATOM = 0, H_ATOM = 1, O_ATOM = 2, N_ATOM = 3,
-    S_ATOM = 4, SI_ATOM = 5, GE_ATOM = 6, X_ATOM = 7,
-};
-
-enum molecule_type
-{
-    UNKNOWN = 0, WATER = 1,
-};
-
 enum molecular_analysis_type
 {
-    NO_ANALYSIS = 0, FRAGMENTS = 1, REACTIONS = 2, NUM_ANALYSIS = 3,
+    NO_ANALYSIS = 0,
+    FRAGMENTS = 1,
+    REACTIONS = 2,
+    NUM_ANALYSIS = 3,
 };
 
 enum restart_format
 {
-    WRITE_ASCII = 0, WRITE_BINARY = 1, RF_N = 2,
+    WRITE_ASCII = 0,
+    WRITE_BINARY = 1,
+    RF_N = 2,
 };
 
 enum geo_formats
 {
-    CUSTOM = 0, PDB = 1, BGF = 2, ASCII_RESTART = 3, BINARY_RESTART = 4, GF_N = 5,
+    CUSTOM = 0,
+    PDB = 1,
+    BGF = 2,
+    ASCII_RESTART = 3,
+    BINARY_RESTART = 4,
+    GF_N = 5,
+};
+
+enum charge_method
+{
+    QEQ_CM = 0,
+    EE_CM = 1,
+    ACKS2_CM = 2,
 };
 
 enum solver
 {
-    GMRES_S = 0, GMRES_H_S = 1, CG_S = 2, SDM_S = 3,
+    GMRES_S = 0,
+    GMRES_H_S = 1,
+    CG_S = 2,
+    SDM_S = 3,
 };
 
 enum pre_comp
 {
-    DIAG_PC = 0, ICHOLT_PC = 1, ILU_PAR_PC = 2, ILUT_PAR_PC = 3, ILU_SUPERLU_MT_PC = 4,
+    NONE_PC = 0,
+    DIAG_PC = 1,
+    ICHOLT_PC = 2,
+    ILU_PAR_PC = 3,
+    ILUT_PAR_PC = 4,
+    ILU_SUPERLU_MT_PC = 5,
 };
 
 enum pre_app
 {
-    NONE_PA = 0, TRI_SOLVE_PA = 1, TRI_SOLVE_LEVEL_SCHED_PA = 2, TRI_SOLVE_GC_PA = 3, JACOBI_ITER_PA = 4,
+    TRI_SOLVE_PA = 0,
+    TRI_SOLVE_LEVEL_SCHED_PA = 1,
+    TRI_SOLVE_GC_PA = 2,
+    JACOBI_ITER_PA = 3,
 };
 
 
-/* Global params mapping */
-/*
-l[0]  = p_boc1
-l[1]  = p_boc2
-l[2]  = p_coa2
-l[3]  = N/A
-l[4]  = N/A
-l[5]  = N/A
-l[6]  = p_ovun6
-l[7]  = N/A
-l[8]  = p_ovun7
-l[9]  = p_ovun8
-l[10] = N/A
-l[11] = N/A
-l[12] = N/A
-l[13] = N/A
-l[14] = p_val6
-l[15] = p_lp1
-l[16] = p_val9
-l[17] = p_val10
-l[18] = N/A
-l[19] = p_pen2
-l[20] = p_pen3
-l[21] = p_pen4
-l[22] = N/A
-l[23] = p_tor2
-l[24] = p_tor3
-l[25] = p_tor4
-l[26] = N/A
-l[27] = p_cot2
-l[28] = p_vdW1
-l[29] = v_par30
-l[30] = p_coa4
-l[31] = p_ovun4
-l[32] = p_ovun3
-l[33] = p_val8
-l[34] = N/A
-l[35] = N/A
-l[36] = N/A
-l[37] = version number
-l[38] = p_coa3
-*/
+typedef double real;
+typedef real rvec[3];
+typedef int ivec[3];
+typedef real rtensor[3][3];
 
+
+/* Force field global params mapping:
+ *
+ * l[0]  = p_boc1
+ * l[1]  = p_boc2
+ * l[2]  = p_coa2
+ * l[3]  = N/A
+ * l[4]  = N/A
+ * l[5]  = N/A
+ * l[6]  = p_ovun6
+ * l[7]  = N/A
+ * l[8]  = p_ovun7
+ * l[9]  = p_ovun8
+ * l[10] = N/A
+ * l[11] = N/A
+ * l[12] = N/A
+ * l[13] = N/A
+ * l[14] = p_val6
+ * l[15] = p_lp1
+ * l[16] = p_val9
+ * l[17] = p_val10
+ * l[18] = N/A
+ * l[19] = p_pen2
+ * l[20] = p_pen3
+ * l[21] = p_pen4
+ * l[22] = N/A
+ * l[23] = p_tor2
+ * l[24] = p_tor3
+ * l[25] = p_tor4
+ * l[26] = N/A
+ * l[27] = p_cot2
+ * l[28] = p_vdW1
+ * l[29] = v_par30
+ * l[30] = p_coa4
+ * l[31] = p_ovun4
+ * l[32] = p_ovun3
+ * l[33] = p_val8
+ * l[34] = ACKS2 bond softness
+ * l[35] = N/A
+ * l[36] = N/A
+ * l[37] = version number
+ * l[38] = p_coa3 */
 typedef struct
 {
     int n_global;
@@ -261,7 +297,6 @@ typedef struct
 } global_parameters;
 
 
-
 typedef struct
 {
     /* Line one in field file */
@@ -284,8 +319,9 @@ typedef struct
     real p_ovun5;
     real chi;
     real eta;
-    int  p_hbond; /* Determines whether this type of atom participates in H_bonds.
-           It is 1 for donor H, 2 for acceptors (O,S,N), 0 for others*/
+    /* Determines whether this type of atom participates in H_bonds.
+     * It is 1 for donor H, 2 for acceptors (O,S,N), 0 for others*/
+    int p_hbond;
 
     /* Line three in field file */
     real r_pi_pi;
@@ -293,6 +329,8 @@ typedef struct
     real b_o_131;
     real b_o_132;
     real b_o_133;
+    /* bond softness for ACKS2 */
+    real b_s_acks2;
 
     /* Line four in the field file */
     real p_ovun2;
@@ -305,18 +343,29 @@ typedef struct
 } single_body_parameters;
 
 
-
 /* Two Body Parameters */
 typedef struct
 {
     /* Bond Order parameters */
-    real p_bo1, p_bo2, p_bo3, p_bo4, p_bo5, p_bo6;
-    real r_s, r_p, r_pp;  /* r_o distances in BO formula */
-    real p_boc3, p_boc4, p_boc5;
+    real p_bo1;
+    real p_bo2;
+    real p_bo3;
+    real p_bo4;
+    real p_bo5;
+    real p_bo6;
+    real r_s;
+    real r_p;
+    real r_pp;  /* r_s, r_p, r_pp: r_o distances in BO formula */
+    real p_boc3;
+    real p_boc4;
+    real p_boc5;
 
     /* Bond Energy parameters */
-    real p_be1, p_be2;
-    real De_s, De_p, De_pp;
+    real p_be1;
+    real p_be2;
+    real De_s;
+    real De_p;
+    real De_pp;
 
     /* Over/Under coordination parameters */
     real p_ovun1;
@@ -326,7 +375,9 @@ typedef struct
     real alpha;
     real r_vdW;
     real gamma_w;
-    real rcore, ecore, acore;
+    real rcore;
+    real ecore;
+    real acore;
 
     /* electrostatic parameters */
     real gamma; // note: this parameter is gamma^-3 and not gamma.
@@ -335,13 +386,15 @@ typedef struct
 } two_body_parameters;
 
 
-
 /* 3-body parameters */
 typedef struct
 {
     /* valence angle */
     real theta_00;
-    real p_val1, p_val2, p_val4, p_val7;
+    real p_val1;
+    real p_val2;
+    real p_val4;
+    real p_val7;
 
     /* penalty */
     real p_pen1;
@@ -358,19 +411,22 @@ typedef struct
 } three_body_header;
 
 
-
 /* hydrogen-bond parameters */
 typedef struct
 {
-    real r0_hb, p_hb1, p_hb2, p_hb3;
+    real r0_hb;
+    real p_hb1;
+    real p_hb2;
+    real p_hb3;
 } hbond_parameters;
 
 
-
 /* 4-body parameters */
 typedef struct
 {
-    real V1, V2, V3;
+    real V1;
+    real V2;
+    real V3;
 
     /* torsion angle */
     real p_tor1;
@@ -401,14 +457,18 @@ typedef struct
 
 typedef struct
 {
-    int  type;           /* Type of this atom */
+    /* Type of this atom */
+    int type;
+    /* Name string for this atom */
     char name[8];
-
-    rvec x; // position
-    rvec v; // velocity
-    rvec f; // force
-
-    real q;              /* Charge on the atom */
+    /* position */
+    rvec x;
+    /* velocity */
+    rvec v;
+    /* force */
+    rvec f;
+    /* Charge on the atom */
+    real q;
 } reax_atom;
 
 
@@ -419,7 +479,6 @@ typedef struct
     rvec box_norms;
     rvec side_prop;
     rvec nbr_box_press[27];
-    // rvec lower_end;
 
     rtensor box, box_inv, old_box;
     rtensor trans, trans_inv;
@@ -429,9 +488,9 @@ typedef struct
 
 typedef struct
 {
-    int  max_atoms;
-    int  max_nbrs;
-    int  total;
+    int max_atoms;
+    int max_nbrs;
+    int total;
     real cell_size;
     ivec spread;
 
@@ -440,10 +499,10 @@ typedef struct
     rvec inv_len;
 
     int**** atoms;
-    int***  top;
-    int***  mark;
-    int***  start;
-    int***  end;
+    int*** top;
+    int*** mark;
+    int*** start;
+    int*** end;
     ivec**** nbrs;
     rvec**** nbrs_cp;
 } grid;
@@ -451,10 +510,17 @@ typedef struct
 
 typedef struct
 {
+    /* number of atoms */
     int N;
+    /* dimension of the N_cm x N_cm sparse charge method matrix H */
+    int N_cm;
+    /* atom info */
     reax_atom *atoms;
+    /* atomic interaction parameters */
     reax_interaction reaxprm;
+    /* simulation space (a.k.a. box) parameters */
     simulation_box box;
+    /* grid structure used for binning atoms and tracking neighboring bins */
     grid g;
 } reax_system;
 
@@ -464,10 +530,10 @@ typedef struct
 {
     char sim_name[MAX_STR];
     char restart_from[MAX_STR];
-    int  restart;
-    int  random_vel;
+    int restart;
+    int random_vel;
 
-    int  reposition_atoms;
+    int reposition_atoms;
 
     /* ensemble values:
        0 : NVE
@@ -475,32 +541,45 @@ typedef struct
        2 : NPT  (Parrinello-Rehman-Nose-Hoover) Anisotropic
        3 : sNPT (Parrinello-Rehman-Nose-Hoover) semiisotropic
        4 : iNPT (Parrinello-Rehman-Nose-Hoover) isotropic */
-    int  ensemble;
-    int  nsteps;
-    int  periodic_boundaries;
-    int  restrict_bonds;
-    int  tabulate;
+    int ensemble;
+    int nsteps;
+    int periodic_boundaries;
+    int restrict_bonds;
+    int tabulate;
     ivec periodic_images;
     real dt;
 
     int reneighbor;
     real vlist_cut;
     real nbr_cut;
-    real r_cut, r_sp_cut, r_low; // upper and lower taper
+    real r_cut;
+    real r_sp_cut;
+    real r_low; // r_cut, r_sp_cut, r_low: upper and lower taper
     real bo_cut;
     real thb_cut;
     real hb_cut;
-    real Tap7, Tap6, Tap5, Tap4, Tap3, Tap2, Tap1, Tap0;
-    int  max_far_nbrs;
-
-    real T_init, T_final, T;
+    real Tap7;
+    real Tap6;
+    real Tap5;
+    real Tap4;
+    real Tap3;
+    real Tap2;
+    real Tap1;
+    real Tap0;
+    int max_far_nbrs;
+
+    real T_init;
+    real T_final;
+    real T;
     real Tau_T;
-    int  T_mode;
-    real T_rate, T_freq;
+    int T_mode;
+    real T_rate;
+    real T_freq;
 
     real Tau_PT;
-    rvec P, Tau_P;
-    int  press_mode;
+    rvec P;
+    rvec Tau_P;
+    int press_mode;
     real compressibility;
 
     int remove_CoM_vel;
@@ -514,22 +593,30 @@ typedef struct
     int freq_diffusion_coef;
     int restrict_type;
 
-    unsigned int qeq_solver_type;
-    real qeq_solver_q_err;
-    real qeq_domain_sparsity;
-    unsigned int qeq_domain_sparsify_enabled;
-    unsigned int pre_comp_type;
-    unsigned int pre_comp_refactor;
-    real pre_comp_droptol;
-    unsigned int pre_comp_sweeps;
-    unsigned int pre_app_type;
-    unsigned int pre_app_jacobi_iters;
+    unsigned int charge_method;
+    unsigned int cm_solver_type;
+    real cm_q_net;
+    unsigned int cm_solver_max_iters;
+    unsigned int cm_solver_restart;
+    real cm_solver_q_err;
+    real cm_domain_sparsity;
+    unsigned int cm_domain_sparsify_enabled;
+    unsigned int cm_solver_pre_comp_type;
+    unsigned int cm_solver_pre_comp_refactor;
+    real cm_solver_pre_comp_droptol;
+    unsigned int cm_solver_pre_comp_sweeps;
+    unsigned int cm_solver_pre_app_type;
+    unsigned int cm_solver_pre_app_jacobi_iters;
 
     int molec_anal;
     int freq_molec_anal;
     real bg_cut;
     int num_ignored;
-    int  ignore[MAX_ATOM_TYPES];
+    int ignore[MAX_ATOM_TYPES];
+
+#ifdef _OPENMP
+    int num_threads;
+#endif
 } control_params;
 
 
@@ -583,15 +670,15 @@ typedef struct
     real init_forces;
     real bonded;
     real nonb;
-    real QEq;
-    real QEq_sort_mat_rows;
-    real pre_comp;
-    real pre_app;
-    int solver_iters;
-    real solver_spmv;
-    real solver_vector_ops;
-    real solver_orthog;
-    real solver_tri_solve;
+    real cm;
+    real cm_sort_mat_rows;
+    real cm_solver_pre_comp;
+    real cm_solver_pre_app;
+    int cm_solver_iters;
+    real cm_solver_spmv;
+    real cm_solver_vector_ops;
+    real cm_solver_orthog;
+    real cm_solver_tri_solve;
 } reax_timing;
 
 
@@ -653,9 +740,13 @@ typedef struct
 typedef struct
 {
     int thb;
-    int pthb; /* pointer to the third body on the central atom's nbrlist */
-    real theta, cos_theta;
-    rvec dcos_di, dcos_dj, dcos_dk;
+    /* pointer to the third body on the central atom's nbrlist */
+    int pthb;
+    real theta;
+    real cos_theta;
+    rvec dcos_di;
+    rvec dcos_dj;
+    rvec dcos_dk;
 } three_body_interaction_data;
 
 
@@ -663,7 +754,7 @@ typedef struct
 {
     int nbr;
     ivec rel_box;
-    //  rvec ext_factor;
+//    rvec ext_factor;
     real d;
     rvec dvec;
 } near_neighbor_data;
@@ -673,10 +764,9 @@ typedef struct
 {
     int nbr;
     ivec rel_box;
-    //  rvec ext_factor;
+//    rvec ext_factor;
     real d;
     rvec dvec;
-    // real H; //, Tap, inv_dr3gamij_1, inv_dr3gamij_3;
 } far_neighbor_data;
 
 
@@ -698,26 +788,46 @@ typedef struct
 typedef struct
 {
     int wrt;
-    rvec dBO, dBOpi, dBOpi2;
+    rvec dBO;
+    rvec dBOpi;
+    rvec dBOpi2;
 } dbond_data;
 
+
 typedef struct
 {
-    real BO, BO_s, BO_pi, BO_pi2;
-    real Cdbo, Cdbopi, Cdbopi2;
-    real C1dbo, C2dbo, C3dbo;
-    real C1dbopi, C2dbopi, C3dbopi, C4dbopi;
-    real C1dbopi2, C2dbopi2, C3dbopi2, C4dbopi2;
-    rvec dBOp, dln_BOp_s, dln_BOp_pi, dln_BOp_pi2;
+    real BO;
+    real BO_s;
+    real BO_pi;
+    real BO_pi2;
+    real Cdbo;
+    real Cdbopi;
+    real Cdbopi2;
+    real C1dbo;
+    real C2dbo;
+    real C3dbo;
+    real C1dbopi;
+    real C2dbopi;
+    real C3dbopi;
+    real C4dbopi;
+    real C1dbopi2;
+    real C2dbopi2;
+    real C3dbopi2;
+    real C4dbopi2;
+    rvec dBOp;
+    rvec dln_BOp_s;
+    rvec dln_BOp_pi;
+    rvec dln_BOp_pi2;
 } bond_order_data;
 
+
 typedef struct
 {
     int nbr;
     int sym_index;
     int dbond_index;
     ivec rel_box;
-    //  rvec ext_factor;
+//    rvec ext_factor;
     real d;
     rvec dvec;
     bond_order_data bo_data;
@@ -732,11 +842,11 @@ typedef struct
  *   n: number of rows
  *   start: row pointer (last element contains ACTUAL NNZ)
  *   j: column index for corresponding matrix entry
- *   val: matrix entry
- * */
+ *   val: matrix entry */
 typedef struct
 {
-    unsigned int n, m;
+    unsigned int n;
+    unsigned int m;
     unsigned int *start;
     unsigned int *j;
     real *val;
@@ -755,49 +865,81 @@ typedef struct
     int gcell_atoms;
 } reallocate_data;
 
+
 typedef struct
 {
     /* bond order related storage */
     real *total_bond_order;
-    real *Deltap, *Deltap_boc;
-    real *Delta, *Delta_lp, *Delta_lp_temp, *Delta_e, *Delta_boc;
-    real *dDelta_lp, *dDelta_lp_temp;
-    real *nlp, *nlp_temp, *Clp, *vlpex;
+    real *Deltap;
+    real *Deltap_boc;
+    real *Delta;
+    real *Delta_lp;
+    real *Delta_lp_temp;
+    real *Delta_e;
+    real *Delta_boc;
+    real *dDelta_lp;
+    real *dDelta_lp_temp;
+    real *nlp;
+    real *nlp_temp;
+    real *Clp;
+    real *vlpex;
     rvec *dDeltap_self;
 
-    /* QEq storage */
-    sparse_matrix *H, *H_sp, *L, *U;
+    /* charge method storage */
+    sparse_matrix *H;
+    sparse_matrix *H_sp;
+    sparse_matrix *L;
+    sparse_matrix *U;
     real *droptol;
     real *w;
     real *Hdia_inv;
-    real *b, *b_s, *b_t, *b_prc, *b_prm;
-    real **s, **t;
-    real *s_t; //, *s_old, *t_old, *s_oldest, *t_oldest;
+    real *b;
+    real *b_s;
+    real *b_t;
+    real *b_prc;
+    real *b_prm;
+    real **s;
+    real **t;
 
     /* GMRES related storage */
-    real *y, *z, *g;
-    real *hc, *hs;
-    real **h, **rn, **v;
+    real *y;
+    real *z;
+    real *g;
+    real *hc;
+    real *hs;
+    real **h;
+    real **rn;
+    real **v;
     /* CG related storage */
-    real *r, *d, *q, *p;
-    int   s_dims, t_dims;
+    real *r;
+    real *d;
+    real *q;
+    real *p;
 
     int num_H;
     int *hbond_index; // for hydrogen bonds
 
-    rvec *v_const, *f_old, *a; // used in integrators
+    rvec *v_const; // used in integrators
+    rvec *f_old; // used in integrators
+    rvec *a; // used in integrators
 
     real *CdDelta;  // coefficient of dDelta for force calculations
 
-    int *mark, *old_mark;  // storage for analysis
+    int *mark;  // storage for analysis
+    int *old_mark;  // storage for analysis
     rvec *x_old;
 
     /* storage space for bond restrictions */
-    int  *map_serials;
-    int  *orig_id;
-    int  *restricted;
+    int *map_serials;
+    int *orig_id;
+    int *restricted;
     int **restricted_list;
 
+#ifdef _OPENMP
+    /* local forces per thread */
+    rvec *f_local;
+#endif
+
     reallocate_data realloc;
 
 #ifdef TEST_FORCES
@@ -814,7 +956,8 @@ typedef struct
     rvec *f_hb;
     rvec *f_tor;
     rvec *f_con;
-    rvec *dDelta;       /* Calculated on the fly in bond_orders.c */
+    /* Calculated on the fly in bond_orders.c */
+    rvec *dDelta;
 #endif
 } static_storage;
 
@@ -826,7 +969,6 @@ typedef struct
     int num_intrs;
     int *index;
     int *end_index;
-    int type;
     union
     {
         void *v;
@@ -847,45 +989,54 @@ typedef struct
     FILE *out;
     FILE *pot;
     FILE *log;
-    FILE *mol, *ign;
+    FILE *mol;
+    FILE *ign;
     FILE *dpl;
     FILE *drft;
     FILE *pdb;
     FILE *prs;
 
-    int  write_steps;
-    int  traj_compress;
-    int  traj_format;
+    int write_steps;
+    int traj_compress;
+    int traj_format;
     char traj_title[81];
-    int  atom_format;
-    int  bond_info;
-    int  angle_info;
+    int atom_format;
+    int bond_info;
+    int angle_info;
 
-    int  restart_format;
-    int  restart_freq;
-    int  debug_level;
-    int  energy_update_freq;
+    int restart_format;
+    int restart_freq;
+    int debug_level;
+    int energy_update_freq;
 
-    // trajectory output functions
+    /* trajectory output function pointer definitions */
     int (* write_header)( reax_system*, control_params*, static_storage*, void* );
     int (* append_traj_frame)(reax_system*, control_params*,
-                              simulation_data*, static_storage*, list **, void* );
+            simulation_data*, static_storage*, list **, void* );
     int (* write)( FILE *, const char *, ... );
 
 #ifdef TEST_ENERGY
     FILE *ebond;
-    FILE *elp, *eov, *eun;
-    FILE *eval, *epen, *ecoa;
+    FILE *elp;
+    FILE *eov;
+    FILE *eun;
+    FILE *eval;
+    FILE *epen;
+    FILE *ecoa;
     FILE *ehb;
-    FILE *etor, *econ;
-    FILE *evdw, *ecou;
+    FILE *etor;
+    FILE *econ;
+    FILE *evdw;
+    FILE *ecou;
 #endif
 
     FILE *ftot;
 #ifdef TEST_FORCES
-    FILE *fbo, *fdbo;
+    FILE *fbo;
+    FILE *fdbo;
     FILE *fbond;
-    FILE *flp, *fatom;
+    FILE *flp;
+    FILE *fatom;
     FILE *f3body;
     FILE *fhb;
     FILE *f4body;
@@ -895,33 +1046,32 @@ typedef struct
 } output_controls;
 
 
-typedef struct
-{
-    int atom_count;
-    int atom_list[MAX_MOLECULE_SIZE];
-    int mtypes[MAX_ATOM_TYPES];
-} molecule;
-
-
 typedef struct
 {
     real H;
-    real e_vdW, CEvd;
-    real e_ele, CEclmb;
+    real e_vdW;
+    real CEvd;
+    real e_ele;
+    real CEclmb;
 } LR_data;
 
 
-
 typedef struct
 {
-    real a, b, c, d;
+    real a;
+    real b;
+    real c;
+    real d;
 } cubic_spline_coef;
 
+
 typedef struct
 {
-    real xmin, xmax;
+    real xmin;
+    real xmax;
     int n;
-    real dx, inv_dx;
+    real dx;
+    real inv_dx;
     real a;
 
     real m;
@@ -933,35 +1083,35 @@ typedef struct
 
 typedef struct
 {
-    real xmin, xmax;
+    real xmin;
+    real xmax;
     int n;
-    real dx, inv_dx;
+    real dx;
+    real inv_dx;
     real a;
     real m;
     real c;
 
     LR_data *y;
     cubic_spline_coef *H;
-    cubic_spline_coef *vdW, *CEvd;
-    cubic_spline_coef *ele, *CEclmb;
+    cubic_spline_coef *vdW;
+    cubic_spline_coef *CEvd;
+    cubic_spline_coef *ele;
+    cubic_spline_coef *CEclmb;
 } LR_lookup_table;
 
 
+/* Function pointer definitions */
 typedef void (*interaction_function)(reax_system*, control_params*,
-                                     simulation_data*, static_storage*,
-                                     list**, output_controls*);
-interaction_function Interaction_Functions[NO_OF_INTERACTIONS];
+        simulation_data*, static_storage*, list**, output_controls*);
 
 typedef void (*evolve_function)(reax_system*, control_params*,
-                                simulation_data*, static_storage*,
-                                list**, output_controls*);
+        simulation_data*, static_storage*,
+        list**, output_controls*);
+
 
-typedef real (*lookup_function)(real);
-lookup_table Exp, Sqrt, Cube_Root, Four_Third_Root, Cos, Sin, ACos;
+/* Global variables */
 LR_lookup_table **LR;
 
 
-typedef void (*get_far_neighbors_function)(rvec, rvec, simulation_box*,
-        control_params*, far_neighbor_data*,
-        int*);
 #endif
diff --git a/sPuReMD/src/neighbors.c b/sPuReMD/src/neighbors.c
index fc318ef2d7d7022fc97df29548017299cd1f5e63..1963bd14580fffb3cd4895dfcee0df4d50c55985 100644
--- a/sPuReMD/src/neighbors.c
+++ b/sPuReMD/src/neighbors.c
@@ -20,6 +20,7 @@
   ----------------------------------------------------------------------*/
 
 #include "neighbors.h"
+
 #include "box.h"
 #include "grid.h"
 #include "list.h"
@@ -28,6 +29,10 @@
 #include "vector.h"
 
 
+/* Function pointer definitions */
+typedef void (*get_far_neighbors_function)(rvec, rvec, simulation_box*,
+        control_params*, far_neighbor_data*, int*);
+
 
 static inline real DistSqr_to_CP( rvec cp, rvec x )
 {
@@ -47,14 +52,14 @@ static inline real DistSqr_to_CP( rvec cp, rvec x )
 
 
 void Generate_Neighbor_Lists( reax_system *system, control_params *control,
-                              simulation_data *data, static_storage *workspace,
-                              list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
-    int  i, j, k, l, m, itr;
-    int  x, y, z;
-    int  atom1, atom2, max;
-    int  num_far;
-    int  *nbr_atoms;
+    int i, j, k, l, m, itr;
+    int x, y, z;
+    int atom1, atom2, max;
+    int num_far;
+    int *nbr_atoms;
     ivec *nbrs;
     rvec *nbrs_cp;
     grid *g;
@@ -390,7 +395,7 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     // fprintf( stderr, "atoms sorted - " );
 
 #ifdef REORDER_ATOMS
-    Cluster_Atoms( system, workspace );
+    Cluster_Atoms( system, workspace, control );
     // fprintf( stderr, "atoms clustered - " );
 #endif
 
diff --git a/sPuReMD/src/print_utils.c b/sPuReMD/src/print_utils.c
index 579ba6290a5c9812cd8b1342f96f959f597d6709..9ff0f571be830e29591aaeee2695b8f441077fda 100644
--- a/sPuReMD/src/print_utils.c
+++ b/sPuReMD/src/print_utils.c
@@ -20,6 +20,7 @@
   ----------------------------------------------------------------------*/
 
 #include "print_utils.h"
+
 #include "list.h"
 #include "geo_tools.h"
 #include "system_props.h"
@@ -28,15 +29,15 @@
 
 #ifdef TEST_FORCES
 void Dummy_Printer( reax_system *system, control_params *control,
-                    simulation_data *data, static_storage *workspace,
-                    list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
 }
 
 
 void Print_Bond_Orders( reax_system *system, control_params *control,
-                        simulation_data *data, static_storage *workspace,
-                        list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
     int  i, pj, pk;
     bond_order_data *bo_ij;
@@ -44,12 +45,12 @@ void Print_Bond_Orders( reax_system *system, control_params *control,
     list *dBOs  = (*lists) + DBO;
     dbond_data *dbo_k;
 
-
     /* bond orders */
     fprintf( out_control->fbo, "%6s%6s%12s%12s%12s%12s%12s\n",
              "atom1", "atom2", "r_ij", "total_bo", "bo_s", "bo_p", "bo_pp" );
 
     for ( i = 0; i < system->N; ++i )
+    {
         for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
         {
             bo_ij = &(bonds->select.bond_list[pj].bo_data);
@@ -61,12 +62,14 @@ void Print_Bond_Orders( reax_system *system, control_params *control,
                      bonds->select.bond_list[pj].d,
                      bo_ij->BO, bo_ij->BO_s, bo_ij->BO_pi, bo_ij->BO_pi2 );
         }
+    }
 
     /* derivatives of bond orders */
     /* fprintf( out_control->fbo, "%6s%6s%10s%10s%10s%10s\n",
        "atom1", "atom2", "total_bo", "bo_s", "bo_p", "bo_pp"\n ); */
 
     for ( i = 0; i < system->N; ++i )
+    {
         for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
         {
             /*fprintf( out_control->fdbo, "%6d %6d\tstart: %6d\tend: %6d\n",
@@ -98,14 +101,15 @@ void Print_Bond_Orders( reax_system *system, control_params *control,
                          dbo_k->dBOpi2[0], dbo_k->dBOpi2[1], dbo_k->dBOpi2[2] );
             }
         }
+    }
 
-    fflush(out_control->fdbo);
+    fflush( out_control->fdbo );
 }
 
 
 void Print_Bond_Forces( reax_system *system, control_params *control,
-                        simulation_data *data, static_storage *workspace,
-                        list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
     int i;
 
@@ -113,15 +117,17 @@ void Print_Bond_Forces( reax_system *system, control_params *control,
     fprintf( out_control->fbond, "%6s\t%s\n", "atom", "fbond" );
 
     for ( i = 0; i < system->N; ++i )
+    {
         fprintf(out_control->fbond, "%6d %23.15e%23.15e%23.15e\n",
                 workspace->orig_id[i],
                 workspace->f_be[i][0], workspace->f_be[i][1], workspace->f_be[i][2]);
+    }
 }
 
 
 void Print_LonePair_Forces( reax_system *system, control_params *control,
-                            simulation_data *data, static_storage *workspace,
-                            list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
     int i;
 
@@ -129,18 +135,19 @@ void Print_LonePair_Forces( reax_system *system, control_params *control,
     fprintf( out_control->flp, "%6s\t%s\n", "atom", "f_lonepair" );
 
     for ( i = 0; i < system->N; ++i )
+    {
         fprintf(out_control->flp, "%6d %23.15e%23.15e%23.15e\n",
                 workspace->orig_id[i],
                 workspace->f_lp[i][0], workspace->f_lp[i][1], workspace->f_lp[i][2]);
+    }
 
-    fflush(out_control->flp);
+    fflush( out_control->flp );
 }
 
 
 void Print_OverUnderCoor_Forces( reax_system *system, control_params *control,
-                                 simulation_data *data,
-                                 static_storage *workspace, list **lists,
-                                 output_controls *out_control )
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
     int i;
 
@@ -151,11 +158,14 @@ void Print_OverUnderCoor_Forces( reax_system *system, control_params *control,
     for ( i = 0; i < system->N; ++i )
     {
         if ( rvec_isZero( workspace->f_un[i] ) )
+        {
             fprintf( out_control->fatom,
                      "%6d %23.15e%23.15e%23.15e 0 0 0\n",
                      workspace->orig_id[i], workspace->f_ov[i][0],
                      workspace->f_ov[i][1], workspace->f_ov[i][2] );
+        }
         else
+        {
             fprintf( out_control->fatom,
                      "%6d %23.15e%23.15e%23.15e %23.15e%23.15e%23.15e"\
                      "%23.15e%23.15e%23.15e\n",
@@ -167,15 +177,16 @@ void Print_OverUnderCoor_Forces( reax_system *system, control_params *control,
                      workspace->f_ov[i][2],
                      workspace->f_un[i][0], workspace->f_un[i][1],
                      workspace->f_un[i][2] );
+        }
     }
 
-    fflush(out_control->fatom);
+    fflush( out_control->fatom );
 }
 
 
 void Print_Three_Body_Forces( reax_system *system, control_params *control,
-                              simulation_data *data, static_storage *workspace,
-                              list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
     int j;
 
@@ -186,10 +197,13 @@ void Print_Three_Body_Forces( reax_system *system, control_params *control,
     for ( j = 0; j < system->N; ++j )
     {
         if ( rvec_isZero(workspace->f_pen[j]) && rvec_isZero(workspace->f_coa[j]) )
+        {
             fprintf( out_control->f3body, "%6d %23.15e%23.15e%23.15e  0 0 0  0 0 0\n",
                      workspace->orig_id[j], workspace->f_ang[j][0],
                      workspace->f_ang[j][1], workspace->f_ang[j][2] );
+        }
         else if ( rvec_isZero(workspace->f_coa[j]) )
+        {
             fprintf( out_control->f3body,
                      "%6d %23.15e%23.15e%23.15e %23.15e%23.15e%23.15e "\
                      "%23.15e%23.15e%23.15e\n",
@@ -201,6 +215,7 @@ void Print_Three_Body_Forces( reax_system *system, control_params *control,
                      workspace->f_ang[j][2],
                      workspace->f_pen[j][0], workspace->f_pen[j][1],
                      workspace->f_pen[j][2] );
+        }
         else
         {
             fprintf( out_control->f3body, "%6d %23.15e%23.15e%23.15e ",
@@ -224,14 +239,13 @@ void Print_Three_Body_Forces( reax_system *system, control_params *control,
         }
     }
 
-    fflush(out_control->f3body);
+    fflush( out_control->f3body );
 }
 
 
 void Print_Hydrogen_Bond_Forces( reax_system *system, control_params *control,
-                                 simulation_data *data,
-                                 static_storage *workspace, list **lists,
-                                 output_controls *out_control )
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
     int j;
 
@@ -239,17 +253,19 @@ void Print_Hydrogen_Bond_Forces( reax_system *system, control_params *control,
     fprintf( out_control->fhb, "%6s\t%-38s\n", "atom", "f_hb" );
 
     for ( j = 0; j < system->N; ++j )
+    {
         fprintf(out_control->fhb, "%6d\t[%23.15e%23.15e%23.15e]\n",
                 workspace->orig_id[j],
                 workspace->f_hb[j][0], workspace->f_hb[j][1], workspace->f_hb[j][2]);
+    }
 
     fflush(out_control->fhb);
 }
 
 
 void Print_Four_Body_Forces( reax_system *system, control_params *control,
-                             simulation_data *data, static_storage *workspace,
-                             list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
     int j;
 
@@ -259,6 +275,7 @@ void Print_Four_Body_Forces( reax_system *system, control_params *control,
     for ( j = 0; j < system->N; ++j )
     {
         if ( !rvec_isZero( workspace->f_con[j] ) )
+        {
             fprintf( out_control->f4body,
                      "%6d %23.15e%23.15e%23.15e %23.15e%23.15e%23.15e "\
                      "%23.15e%23.15e%23.15e\n",
@@ -270,20 +287,23 @@ void Print_Four_Body_Forces( reax_system *system, control_params *control,
                      workspace->f_tor[j][2],
                      workspace->f_con[j][0], workspace->f_con[j][1],
                      workspace->f_con[j][2] );
+        }
         else
+        {
             fprintf( out_control->f4body,
                      "%6d %23.15e%23.15e%23.15e  0 0 0\n",
                      workspace->orig_id[j], workspace->f_tor[j][0],
                      workspace->f_tor[j][1], workspace->f_tor[j][2] );
+        }
     }
 
-    fflush(out_control->f4body);
+    fflush( out_control->f4body );
 }
 
 
 void Print_vdW_Coulomb_Forces( reax_system *system, control_params *control,
-                               simulation_data *data, static_storage *workspace,
-                               list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
     int  i;
 
@@ -292,7 +312,9 @@ void Print_vdW_Coulomb_Forces( reax_system *system, control_params *control,
              "atom", "nonbonded total", "f_vdw", "f_ele" );
 
     for ( i = 0; i < system->N; ++i )
+    {
         if ( !rvec_isZero(workspace->f_ele[i]) )
+        {
             fprintf(out_control->fnonb,
                     "%6d %23.15e%23.15e%23.15e %23.15e%23.15e%23.15e "\
                     "%23.15e%23.15e%23.15e\n",
@@ -304,19 +326,23 @@ void Print_vdW_Coulomb_Forces( reax_system *system, control_params *control,
                     workspace->f_vdw[i][2],
                     workspace->f_ele[i][0], workspace->f_ele[i][1],
                     workspace->f_ele[i][2] );
+        }
         else
+        {
             fprintf(out_control->fnonb,
                     "%6d %23.15e%23.15e%23.15e  0 0 0\n",
                     workspace->orig_id[i], workspace->f_vdw[i][0],
                     workspace->f_vdw[i][1], workspace->f_vdw[i][2] );
+        }
+    }
 
-    fflush(out_control->fnonb);
+    fflush( out_control->fnonb );
 }
 
 
 void Compare_Total_Forces( reax_system *system, control_params *control,
-                           simulation_data *data, static_storage *workspace,
-                           list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
     int i;
 
@@ -325,6 +351,7 @@ void Compare_Total_Forces( reax_system *system, control_params *control,
              "atom", "f_total", "test_force total" );
 
     for ( i = 0; i < system->N; ++i )
+    {
         fprintf( out_control->ftot2,
                  "%6d %23.15e%23.15e%23.15e vs %23.15e%23.15e%23.15e\n",
                  workspace->orig_id[i],
@@ -347,8 +374,9 @@ void Compare_Total_Forces( reax_system *system, control_params *control,
                  workspace->f_coa[i][2] + workspace->f_hb[i][2] +
                  workspace->f_tor[i][2] + workspace->f_con[i][2] +
                  workspace->f_vdw[i][2] + workspace->f_ele[i][2] );
+    }
 
-    fflush(out_control->ftot2);
+    fflush( out_control->ftot2 );
 }
 
 
@@ -370,18 +398,16 @@ void Init_Force_Test_Functions( )
 
 /* near nbrs contain both i-j, j-i nbrhood info */
 void Print_Near_Neighbors( reax_system *system, control_params *control,
-                           static_storage *workspace, list **lists )
+        static_storage *workspace, list **lists )
 {
-    int   i, j, id_i, id_j;
-    char  fname[MAX_STR];
+    int i, j, id_i, id_j;
+    char fname[MAX_STR];
     FILE *fout;
     list *near_nbrs = &((*lists)[NEAR_NBRS]);
 
     sprintf( fname, "%s.near_nbrs", control->sim_name );
     fout = fopen( fname, "w" );
 
-    fprintf( fout, "hello:!\n" );
-
     for ( i = 0; i < system->N; ++i )
     {
         id_i = workspace->orig_id[i];
@@ -406,10 +432,10 @@ void Print_Near_Neighbors( reax_system *system, control_params *control,
 
 /* near nbrs contain both i-j, j-i nbrhood info */
 void Print_Near_Neighbors2( reax_system *system, control_params *control,
-                            static_storage *workspace, list **lists )
+        static_storage *workspace, list **lists )
 {
-    int   i, j, id_i, id_j;
-    char  fname[MAX_STR];
+    int i, j, id_i, id_j;
+    char fname[MAX_STR];
     FILE *fout;
     list *near_nbrs = &((*lists)[NEAR_NBRS]);
 
@@ -441,10 +467,10 @@ void Print_Near_Neighbors2( reax_system *system, control_params *control,
 
 /* far nbrs contain only i-j nbrhood info, no j-i. */
 void Print_Far_Neighbors( reax_system *system, control_params *control,
-                          static_storage *workspace, list **lists )
+        static_storage *workspace, list **lists )
 {
-    int   i, j, id_i, id_j;
-    char  fname[MAX_STR];
+    int i, j, id_i, id_j;
+    char fname[MAX_STR];
     FILE *fout;
     list *far_nbrs = &((*lists)[FAR_NBRS]);
 
@@ -486,10 +512,10 @@ int fn_qsort_intcmp( const void *a, const void *b )
 
 
 void Print_Far_Neighbors2( reax_system *system, control_params *control,
-                           static_storage *workspace, list **lists )
+        static_storage *workspace, list **lists )
 {
-    int   i, j, id_i, id_j;
-    char  fname[MAX_STR];
+    int i, j, id_i, id_j;
+    char fname[MAX_STR];
     FILE *fout;
     list *far_nbrs = &((*lists)[FAR_NBRS]);
 
@@ -514,13 +540,14 @@ void Print_Far_Neighbors2( reax_system *system, control_params *control,
             fprintf(fout, "%6d", temp[j]);
         fprintf( fout, "\n");
     }
+
     fclose( fout );
 }
 
 
 void Print_Total_Force( reax_system *system, control_params *control,
-                        simulation_data *data, static_storage *workspace,
-                        list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
     int i;
 #if !defined(TEST_FORCES)
@@ -530,51 +557,54 @@ void Print_Total_Force( reax_system *system, control_params *control,
 #endif
 
     for ( i = 0; i < system->N; ++i )
-        fprintf(out_control->ftot, "%6d %23.15e %23.15e %23.15e\n",
+    {
+        fprintf( out_control->ftot, "%6d %23.15e %23.15e %23.15e\n",
                 //fprintf(out_control->ftot, "%6d %19.9e %19.9e %19.9e\n",
                 //fprintf(out_control->ftot, "%3d %12.6f %12.6f %12.6f\n",
                 workspace->orig_id[i],
-                system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2]);
+                system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2] );
+    }
 
-    fflush(out_control->ftot);
+    fflush( out_control->ftot );
 #if !defined(TEST_FORCES)
-    fclose(out_control->ftot);
+    fclose( out_control->ftot );
 #endif
 }
 
 
 void Output_Results( reax_system *system, control_params *control,
-                     simulation_data *data, static_storage *workspace,
-                     list **lists, output_controls *out_control )
+    simulation_data *data, static_storage *workspace,
+    list **lists, output_controls *out_control )
 {
-    int i, type_i, f_update;
-    real q;
-    real t_elapsed = 0;;
-
+    int i, type_i;
+    real e_pol, q, f_update;
+    real t_elapsed = 0;
 
     /* Compute Polarization Energy */
-    data->E_Pol = 0.0;
+    e_pol = 0.0;
+
+#ifdef _OPENMP /* sum per-atom polarization energy into e_pol across threads */
+    #pragma omp parallel for default(none) private(q, type_i) shared(system) \
+        reduction(+: e_pol) schedule(static)
+#endif
     for ( i = 0; i < system->N; i++ )
     {
         q = system->atoms[i].q;
         type_i = system->atoms[i].type;
 
-        data->E_Pol += ( system->reaxprm.sbp[ type_i ].chi * q +
-                         (system->reaxprm.sbp[ type_i ].eta / 2.0) * SQR(q) ) *
-                       KCALpMOL_to_EV;
-        /* fprintf( stderr, "%6d%23.15e%23.15e%23.15e%23.15e\n",
-           i, q, system->reaxprm.sbp[ type_i ].chi,
-           system->reaxprm.sbp[ type_i ].eta, data->E_Pol ); */
+        e_pol += ( system->reaxprm.sbp[ type_i ].chi * q +
+                (system->reaxprm.sbp[ type_i ].eta / 2.0) * SQR( q ) ) *
+            KCALpMOL_to_EV;
     }
 
+    data->E_Pol = e_pol;
+
     data->E_Pot = data->E_BE + data->E_Ov + data->E_Un  + data->E_Lp +
-                  data->E_Ang + data->E_Pen + data->E_Coa + data->E_HB +
-                  data->E_Tor + data->E_Con +
-                  data->E_vdW + data->E_Ele + data->E_Pol;
+        data->E_Ang + data->E_Pen + data->E_Coa + data->E_HB +
+        data->E_Tor + data->E_Con + data->E_vdW + data->E_Ele + data->E_Pol;
 
     data->E_Tot = data->E_Pot + E_CONV * data->E_Kin;
 
-
     /* output energies if it is the time */
     if ( out_control->energy_update_freq > 0 &&
             data->step % out_control->energy_update_freq == 0 )
@@ -584,7 +614,7 @@ void Output_Results( reax_system *system, control_params *control,
                  "%-6d%24.15e%24.15e%24.15e%13.5f%13.5f%16.5f%13.5f%13.5f\n",
                  data->step, data->E_Tot, data->E_Pot, E_CONV * data->E_Kin,
                  data->therm.T, control->T, system->box.volume, data->iso_bar.P,
-                 (control->P[0] + control->P[1] + control->P[2]) / 3 );
+                 (control->P[0] + control->P[1] + control->P[2]) / 3.0 );
 
         fprintf( out_control->pot,
                  "%-6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
@@ -599,7 +629,7 @@ void Output_Results( reax_system *system, control_params *control,
                  "%-6d%16.2f%16.2f%16.2f%11.2f%11.2f%13.2f%13.5f%13.5f\n",
                  data->step, data->E_Tot, data->E_Pot, E_CONV * data->E_Kin,
                  data->therm.T, control->T, system->box.volume, data->iso_bar.P,
-                 (control->P[0] + control->P[1] + control->P[2]) / 3 );
+                 (control->P[0] + control->P[1] + control->P[2]) / 3.0 );
 
         fprintf( out_control->pot,
                  "%-6d%13.2f%13.2f%13.2f%13.2f%13.2f%13.2f%13.2f%13.2f%13.2f%13.2f%13.2f\n",
@@ -613,39 +643,44 @@ void Output_Results( reax_system *system, control_params *control,
 
         t_elapsed = Get_Timing_Info( data->timing.total );
         if ( data->step == data->prev_steps )
-            f_update = 1;
-        else f_update = out_control->energy_update_freq;
+        {
+            f_update = 1.0;
+        }
+        else
+        {
+            f_update = 1.0 / out_control->energy_update_freq;
+        }
 
         fprintf( out_control->log, "%6d %10.2f %10.2f %10.2f %10.2f %10.2f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n",
-                 data->step, t_elapsed / f_update,
-                 data->timing.nbrs / f_update,
-                 data->timing.init_forces / f_update,
-                 data->timing.bonded / f_update,
-                 data->timing.nonb / f_update,
-                 data->timing.QEq / f_update,
-                 data->timing.QEq_sort_mat_rows / f_update,
-                 (double)data->timing.solver_iters / f_update,
-                 data->timing.pre_comp / f_update,
-                 data->timing.pre_app / f_update,
-                 data->timing.solver_spmv / f_update,
-                 data->timing.solver_vector_ops / f_update,
-                 data->timing.solver_orthog / f_update,
-                 data->timing.solver_tri_solve / f_update );
+                 data->step, t_elapsed * f_update,
+                 data->timing.nbrs * f_update,
+                 data->timing.init_forces * f_update,
+                 data->timing.bonded * f_update,
+                 data->timing.nonb * f_update,
+                 data->timing.cm * f_update,
+                 data->timing.cm_sort_mat_rows * f_update,
+                 (double)data->timing.cm_solver_iters * f_update,
+                 data->timing.cm_solver_pre_comp * f_update,
+                 data->timing.cm_solver_pre_app * f_update,
+                 data->timing.cm_solver_spmv * f_update,
+                 data->timing.cm_solver_vector_ops * f_update,
+                 data->timing.cm_solver_orthog * f_update,
+                 data->timing.cm_solver_tri_solve * f_update );
 
         data->timing.total = Get_Time( );
         data->timing.nbrs = 0;
         data->timing.init_forces = 0;
         data->timing.bonded = 0;
         data->timing.nonb = 0;
-        data->timing.QEq = ZERO;
-        data->timing.QEq_sort_mat_rows = ZERO;
-        data->timing.pre_comp = ZERO;
-        data->timing.pre_app = ZERO;
-        data->timing.solver_iters = 0;
-        data->timing.solver_spmv = ZERO;
-        data->timing.solver_vector_ops = ZERO;
-        data->timing.solver_orthog = ZERO;
-        data->timing.solver_tri_solve = ZERO;
+        data->timing.cm = ZERO;
+        data->timing.cm_sort_mat_rows = ZERO;
+        data->timing.cm_solver_pre_comp = ZERO;
+        data->timing.cm_solver_pre_app = ZERO;
+        data->timing.cm_solver_iters = 0;
+        data->timing.cm_solver_spmv = ZERO;
+        data->timing.cm_solver_vector_ops = ZERO;
+        data->timing.cm_solver_orthog = ZERO;
+        data->timing.cm_solver_tri_solve = ZERO;
 
         fflush( out_control->out );
         fflush( out_control->pot );
@@ -672,39 +707,37 @@ void Output_Results( reax_system *system, control_params *control,
                      system->box.box_norms[2],
                      data->tot_press[0], data->tot_press[1], data->tot_press[2],
                      control->P[0], control->P[1], control->P[2], system->box.volume );
-            fflush( out_control->prs);
+            fflush( out_control->prs );
         }
     }
 
     if ( out_control->write_steps > 0 &&
             data->step % out_control->write_steps == 0 )
     {
-        // t_start = Get_Time( );
+        //t_start = Get_Time( );
         out_control->append_traj_frame( system, control, data,
-                                        workspace, lists, out_control );
+                workspace, lists, out_control );
 
         //Write_PDB( system, *lists+BONDS, data, control, workspace, out_control );
-        // t_elapsed = Get_Timing_Info( t_start );
-        // fprintf(stdout, "append_frame took %.6f seconds\n", t_elapsed );
+        //t_elapsed = Get_Timing_Info( t_start );
+        //fprintf(stdout, "append_frame took %.6f seconds\n", t_elapsed );
     }
-
-    // fprintf( stderr, "output_results... done\n" );
 }
 
 
 
 void Print_Linear_System( reax_system *system, control_params *control,
-                          static_storage *workspace, int step )
+        static_storage *workspace, int step )
 {
-    int   i, j;
-    char  fname[100];
+    int i, j;
+    char fname[100];
     sparse_matrix *H;
     FILE *out;
 
     sprintf( fname, "%s.state%d.out", control->sim_name, step );
     out = fopen( fname, "w" );
 
-    for ( i = 0; i < system->N; i++ )
+    for ( i = 0; i < system->N_cm; i++ )
         fprintf( out, "%6d%2d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
                  workspace->orig_id[i], system->atoms[i].type,
                  system->atoms[i].x[0], system->atoms[i].x[1],
@@ -719,12 +752,11 @@ void Print_Linear_System( reax_system *system, control_params *control,
     // fprintf( out, "%g\n", workspace->s_t[i+system->N] );
     // fclose( out );
 
-
     sprintf( fname, "%s.H%d.out", control->sim_name, step );
     out = fopen( fname, "w" );
     H = workspace->H;
 
-    for ( i = 0; i < system->N; ++i )
+    for ( i = 0; i < system->N_cm; ++i )
     {
         for ( j = H->start[i]; j < H->start[i + 1] - 1; ++j )
         {
@@ -747,7 +779,7 @@ void Print_Linear_System( reax_system *system, control_params *control,
     out = fopen( fname, "w" );
     H = workspace->H_sp;
 
-    for ( i = 0; i < system->N; ++i )
+    for ( i = 0; i < system->N_cm; ++i )
     {
         for ( j = H->start[i]; j < H->start[i + 1] - 1; ++j )
         {
@@ -781,9 +813,9 @@ void Print_Linear_System( reax_system *system, control_params *control,
 
 
 void Print_Charges( reax_system *system, control_params *control,
-                    static_storage *workspace, int step )
+        static_storage *workspace, int step )
 {
-    int   i;
+    int i;
     char fname[100];
     FILE *fout;
 
@@ -791,24 +823,28 @@ void Print_Charges( reax_system *system, control_params *control,
     fout = fopen( fname, "w" );
 
     for ( i = 0; i < system->N; ++i )
+    {
         fprintf( fout, "%6d%12.7f%12.7f%12.7f\n",
                  workspace->orig_id[i],
                  workspace->s[0][i], workspace->t[0][i], system->atoms[i].q );
+    }
 
     fclose( fout );
 }
 
 
 void Print_Soln( static_storage *workspace,
-                 real *x, real *b_prm, real *b, int N )
+        real *x, real *b_prm, real *b, int N )
 {
     int i;
 
     fprintf( stdout, "%6s%10s%10s%10s\n", "id", "x", "b_prm", "b" );
 
     for ( i = 0; i < N; ++i )
+    {
         fprintf( stdout, "%6d%10.4f%10.4f%10.4f\n",
                  workspace->orig_id[i], x[i], b_prm[i], b[i] );
+    }
 
     fflush( stdout );
 }
@@ -822,24 +858,77 @@ void Print_Sparse_Matrix( sparse_matrix *A )
     {
         fprintf( stderr, "i:%d  j(val):", i );
         for ( j = A->start[i]; j < A->start[i + 1]; ++j )
+        {
             fprintf( stderr, "%d(%.4f) ", A->j[j], A->val[j] );
+        }
         fprintf( stderr, "\n" );
     }
 }
 
 
-void Print_Sparse_Matrix2( sparse_matrix *A, char *fname )
+void Print_Sparse_Matrix2( sparse_matrix *A, char *fname, char *mode )
 {
     int i, j;
-    FILE *f = fopen( fname, "w" );
+    FILE *f;
+
+    /* mode is handed to fopen as-is; NULL selects "w" (truncate) */
+    f = fopen( fname, (mode == NULL) ? "w" : mode );
+    if ( f == NULL )
+    {
+        /* bail out instead of passing a NULL stream to fprintf/fclose */
+        fprintf( stderr, "[ERROR] failed to open file %s\n", fname );
+        return;
+    }
 
     for ( i = 0; i < A->n; ++i )
+    {
+        /* off-diagonals: every entry of row i except the last one stored */
+        for ( j = A->start[i]; j < A->start[i + 1] - 1; ++j )
+        {
+            //Convert 0-based to 1-based (for Matlab)
+            fprintf( f, "%6d %6d %24.15e\n", i + 1, A->j[j] + 1, A->val[j] );
+            /* print symmetric entry */
+//            fprintf( f, "%6d %6d %24.15e\n", A->j[j] + 1, i + 1, A->val[j] );
+        }
+
+        /* diagonal: assumed to be the last entry stored in row i */
+        fprintf( f, "%6d %6d %24.15e\n", i + 1, A->j[A->start[i + 1] - 1] + 1, A->val[A->start[i + 1] - 1] );
+    }
+
+    fclose( f );
+}
+
+
+/* Writes A to fname in binary: header (n, start[n]), 1-based row pointers, then (column, value) pairs.
+ * Note: raw integer / IEEE 754 bytes are written, so the file is endianness-dependent */
+void Print_Sparse_Matrix_Binary( sparse_matrix *A, char *fname )
+{
+    int i, j, temp;
+    FILE *f;
+
+    f = fopen( fname, "wb" );
+
+    /* header: # rows, # nonzeros */
+    fwrite( &(A->n), sizeof(unsigned int), 1, f );
+    fwrite( &(A->start[A->n]), sizeof(unsigned int), 1, f );
+
+    /* row pointers: n + 1 entries, start[n] marks the end of the last row */
+    for ( i = 0; i <= A->n; ++i )
+    {
+        //Convert 0-based to 1-based (for Matlab)
+        temp = A->start[i] + 1;
+        fwrite( &temp, sizeof(unsigned int), 1, f );
+    }
+
+    /* column indices and non-zeros: rows 0..n-1 only, since start[i + 1] is read below */
+    for ( i = 0; i < A->n; ++i )
     {
         for ( j = A->start[i]; j < A->start[i + 1]; ++j )
         {
-            //fprintf( f, "%d%d %.15e\n", A->entries[j].j, i, A->entries[j].val );
             //Convert 0-based to 1-based (for Matlab)
-            fprintf( f, "%6d%6d %24.15e\n", i+1, A->j[j]+1, A->val[j] );
+            temp = A->j[j] + 1;
+            fwrite( &temp, sizeof(unsigned int), 1, f );
+            fwrite( &(A->val[j]), sizeof(real), 1, f );
         }
     }
 
@@ -855,6 +944,7 @@ void Print_Bonds( reax_system *system, list *bonds, char *fname )
     FILE *f = fopen( fname, "w" );
 
     for ( i = 0; i < system->N; ++i )
+    {
         for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
         {
             pbond = &(bonds->select.bond_list[pj]);
@@ -865,7 +955,9 @@ void Print_Bonds( reax_system *system, list *bonds, char *fname )
             fprintf( f, "%6d%6d %9.5f %9.5f\n",
                      i + 1, pbond->nbr + 1, pbond->d, bo_ij->BO );
         }
-    fclose(f);
+    }
+
+    fclose( f );
 }
 
 
@@ -886,35 +978,41 @@ void Print_Bond_List2( reax_system *system, list *bonds, char *fname )
             nbr = bonds->select.bond_list[pj].nbr;
             id_j = nbr + 1; //system->my_atoms[nbr].orig_id;
             if ( id_i < id_j )
+            {
                 temp[num++] = id_j;
+            }
         }
 
         qsort(&temp, num, sizeof(int), fn_qsort_intcmp);
-        for (j = 0; j < num; j++)
-            fprintf(f, "%6d", temp[j] );
-        fprintf(f, "\n");
+        for ( j = 0; j < num; j++ )
+        {
+            fprintf( f, "%6d", temp[j] );
+        }
+        fprintf( f, "\n" );
     }
 }
 
 
 #ifdef LGJ
-Print_XYZ_Serial(reax_system* system, static_storage *workspace)
+Print_XYZ_Serial( reax_system* system, static_storage *workspace )
 {
     rvec p;
-
-    char  fname[100];
+    char fname[100];
     FILE *fout;
-    sprintf( fname, "READ_PDB.0" );
-    fout      = fopen( fname, "w" );
     int i;
-    for (i = 0; i < system->N; i++)
+
+    sprintf( fname, "READ_PDB.0" );
+    fout = fopen( fname, "w" );
+
+    for ( i = 0; i < system->N; i++ )
+    {
         fprintf( fout, "%6d%24.15e%24.15e%24.15e\n",
                  workspace->orig_id[i],
                  p[0] = system->atoms[i].x[0],
                  p[1] = system->atoms[i].x[1],
-                 p[2] = system->atoms[i].x[2]);
-
+                 p[2] = system->atoms[i].x[2] );
+    }
 
-    fclose(fout);
+    fclose( fout );
 }
 #endif
diff --git a/sPuReMD/src/print_utils.h b/sPuReMD/src/print_utils.h
index 8b3b363571ca5b7c50195b4c91bca8951c1caa7b..8c15ebbe81487eabf2a5db13627eef60ee8432d3 100644
--- a/sPuReMD/src/print_utils.h
+++ b/sPuReMD/src/print_utils.h
@@ -55,7 +55,9 @@ void Print_Soln( static_storage*, real*, real*, real*, int );
 
 void Print_Sparse_Matrix( sparse_matrix* );
 
-void Print_Sparse_Matrix2( sparse_matrix*, char* );
+void Print_Sparse_Matrix2( sparse_matrix*, char*, char* );
+
+void Print_Sparse_Matrix_Binary( sparse_matrix*, char* );
 
 void Print_Bonds( reax_system*, list*, char* );
 void Print_Bond_List2( reax_system*, list*, char* );
diff --git a/sPuReMD/src/random.c b/sPuReMD/src/random.c
index f3a5096c65485111fb5cba7321976518a2e42972..9b09e7526b7a8418470cbf8c1b45bd1940dcbfa9 100644
--- a/sPuReMD/src/random.c
+++ b/sPuReMD/src/random.c
@@ -19,7 +19,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include <random.h>
+#include "random.h"
+
 
 /* System random number generator used linear congruance method with
    large periodicity for generation of pseudo random number. function
@@ -53,5 +54,5 @@ double GRandom(double mean, double sigma)
         rsq = v1 * v1 + v2 * v2;
     }
 
-    return mean + v1 * sigma * sqrt(-2.0 * log(rsq) / rsq);
+    return mean + v1 * sigma * SQRT(-2.0 * LOG(rsq) / rsq);
 }
diff --git a/sPuReMD/src/reset_utils.c b/sPuReMD/src/reset_utils.c
index 36d6b414063c4c37a7399c07b039cdc0b80d8a85..0a364e69c003f8c4c1b637f390103e44d35ee6da 100644
--- a/sPuReMD/src/reset_utils.c
+++ b/sPuReMD/src/reset_utils.c
@@ -20,6 +20,7 @@
   ----------------------------------------------------------------------*/
 
 #include "reset_utils.h"
+
 #include "list.h"
 #include "vector.h"
 
@@ -29,36 +30,36 @@ void Reset_Atoms( reax_system* system )
     int i;
 
     for ( i = 0; i < system->N; ++i )
+    {
         memset( system->atoms[i].f, 0.0, sizeof(rvec) );
+    }
 }
 
 
 void Reset_Pressures( simulation_data *data )
 {
     rtensor_MakeZero( data->flex_bar.P );
-    data->iso_bar.P = 0;
+    data->iso_bar.P = 0.0;
     rvec_MakeZero( data->int_press );
     rvec_MakeZero( data->ext_press );
-    /* fprintf( stderr, "reset: ext_press (%12.6f %12.6f %12.6f)\n",
-       data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
 }
 
 
 void Reset_Simulation_Data( simulation_data* data )
 {
-    data->E_BE = 0;
-    data->E_Ov = 0;
-    data->E_Un = 0;
-    data->E_Lp = 0;
-    data->E_Ang = 0;
-    data->E_Pen = 0;
-    data->E_Coa = 0;
-    data->E_HB = 0;
-    data->E_Tor = 0;
-    data->E_Con = 0;
-    data->E_vdW = 0;
-    data->E_Ele = 0;
-    data->E_Kin = 0;
+    data->E_BE = 0.0;
+    data->E_Ov = 0.0;
+    data->E_Un = 0.0;
+    data->E_Lp = 0.0;
+    data->E_Ang = 0.0;
+    data->E_Pen = 0.0;
+    data->E_Coa = 0.0;
+    data->E_HB = 0.0;
+    data->E_Tor = 0.0;
+    data->E_Con = 0.0;
+    data->E_vdW = 0.0;
+    data->E_Ele = 0.0;
+    data->E_Kin = 0.0;
 }
 
 
@@ -84,12 +85,28 @@ void Reset_Test_Forces( reax_system *system, static_storage *workspace )
 
 void Reset_Workspace( reax_system *system, static_storage *workspace )
 {
+#ifdef _OPENMP
+    int i, tid;
+#endif
+
     memset( workspace->total_bond_order, 0, system->N * sizeof( real ) );
     memset( workspace->dDeltap_self, 0, system->N * sizeof( rvec ) );
 
     memset( workspace->CdDelta, 0, system->N * sizeof( real ) );
     //memset( workspace->virial_forces, 0, system->N * sizeof( rvec ) );
 
+#ifdef _OPENMP
+    #pragma omp parallel private(i, tid)
+    {
+        tid = omp_get_thread_num( );
+
+        for ( i = 0; i < system->N; ++i )
+        {
+            rvec_MakeZero( workspace->f_local[tid * system->N + i] );
+        }
+    }
+#endif
+
 #ifdef TEST_FORCES
     memset( workspace->dDelta, 0, sizeof(rvec) * system->N );
     Reset_Test_Forces( system, workspace );
@@ -111,7 +128,9 @@ void Reset_Neighbor_Lists( reax_system *system, control_params *control,
     }
 
     if ( control->hb_cut > 0 )
+    {
         for ( i = 0; i < system->N; ++i )
+        {
             if ( system->reaxprm.sbp[system->atoms[i].type].p_hbond == 1)
             {
                 tmp = Start_Index( workspace->hbond_index[i], hbonds );
@@ -120,19 +139,19 @@ void Reset_Neighbor_Lists( reax_system *system, control_params *control,
                    i, Start_Index( workspace->hbond_index[i], hbonds ),
                    End_Index( workspace->hbond_index[i], hbonds ) );*/
             }
+        }
+    }
 }
 
 
 void Reset( reax_system *system, control_params *control,
-            simulation_data *data, static_storage *workspace, list **lists  )
+        simulation_data *data, static_storage *workspace, list **lists  )
 {
     Reset_Atoms( system );
 
     Reset_Simulation_Data( data );
 
-    if ( control->ensemble == NPT || control->ensemble == sNPT ||
-            control->ensemble == iNPT )
-        Reset_Pressures( data );
+    Reset_Pressures( data );
 
     Reset_Workspace( system, workspace );
 
diff --git a/sPuReMD/src/single_body_interactions.c b/sPuReMD/src/single_body_interactions.c
index 484104a8d8dff533ab12c7033e35969d69f0a60d..9da3e21aace9d38063b9343f486ae6495b6e8f78 100644
--- a/sPuReMD/src/single_body_interactions.c
+++ b/sPuReMD/src/single_body_interactions.c
@@ -26,11 +26,8 @@
 #include "vector.h"
 
 
-void LonePair_OverUnder_Coordination_Energy( reax_system *system,
-        control_params *control,
-        simulation_data *data,
-        static_storage *workspace,
-        list **lists,
+void LonePair_OverUnder_Coordination_Energy( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
         output_controls *out_control )
 {
     int i, j, pj, type_i, type_j;
@@ -44,8 +41,7 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
     real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
     real p_lp1, p_lp2, p_lp3;
     real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
-
-    single_body_parameters *sbp_i, *sbp_j;
+    single_body_parameters *sbp_i;
     two_body_parameters *twbp;
     bond_data *pbond;
     bond_order_data *bo_ij;
@@ -152,18 +148,11 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
             j = bonds->select.bond_list[pj].nbr;
             type_j = system->atoms[j].type;
             bo_ij = &(bonds->select.bond_list[pj].bo_data);
-            sbp_j = &(system->reaxprm.sbp[ type_j ]);
             twbp = &(system->reaxprm.tbp[ type_i ][ type_j ]);
 
             sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
             sum_ovun2 += (workspace->Delta[j] - dfvl * workspace->Delta_lp_temp[j]) *
                          ( bo_ij->BO_pi + bo_ij->BO_pi2 );
-
-            /*fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n",
-            i+1, j+1,
-            dfvl * workspace->Delta_lp_temp[j],
-            sbp_j->nlp_opt,
-            workspace->nlp_temp[j] );*/
         }
 
         exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
diff --git a/sPuReMD/src/system_props.c b/sPuReMD/src/system_props.c
index fc93a474cf378f1a382d0ae017cf15a9b23eb17a..fad302b4c08b46a8b9d92d18f9a2a24a73e66e2f 100644
--- a/sPuReMD/src/system_props.c
+++ b/sPuReMD/src/system_props.c
@@ -25,7 +25,7 @@
 
 
 void Temperature_Control( control_params *control, simulation_data *data,
-                          output_controls *out_control )
+        output_controls *out_control )
 {
     real tmp;
 
@@ -34,17 +34,24 @@ void Temperature_Control( control_params *control, simulation_data *data,
         if ( (data->step - data->prev_steps) %
                 ((int)(control->T_freq / control->dt)) == 0 )
         {
-            if ( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) )
+            if ( FABS( control->T - control->T_final ) >= FABS( control->T_rate ) )
+            {
                 control->T += control->T_rate;
-            else control->T = control->T_final;
+            }
+            else
+            {
+                control->T = control->T_final;
+            }
         }
     }
     else if ( control->T_mode == 2 )  // constant slope control
     {
         tmp = control->T_rate * control->dt / control->T_freq;
 
-        if ( fabs( control->T - control->T_final ) >= fabs( tmp ) )
+        if ( FABS( control->T - control->T_final ) >= FABS( tmp ) )
+        {
             control->T += tmp;
+        }
     }
 }
 
@@ -53,18 +60,19 @@ void Compute_Total_Mass( reax_system *system, simulation_data *data )
 {
     int i;
 
-    data->M = 0;
+    data->M = 0.0;
 
     for ( i = 0; i < system->N; i++ )
+    {
         data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass;
+    }
 
-    //fprintf ( stderr, "Compute_total_Mass -->%f<-- \n", data->M );
-    data->inv_M = 1. / data->M;
+    data->inv_M = 1.0 / data->M;
 }
 
 
 void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
-                             FILE *fout )
+        FILE *fout )
 {
     int i;
     real m, xx, xy, xz, yy, yz, zz, det;
@@ -76,7 +84,6 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
     rvec_MakeZero( data->amcm ); // angular momentum of CoM
     rvec_MakeZero( data->avcm ); // angular velocity of CoM
 
-
     /* Compute the position, velocity and angular momentum about the CoM */
     for ( i = 0; i < system->N; ++i )
     {
@@ -87,15 +94,6 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
 
         rvec_Cross( tvec, system->atoms[i].x, system->atoms[i].v );
         rvec_ScaledAdd( data->amcm, m, tvec );
-
-        /*fprintf( fout,"%3d  %g %g %g\n",
-          i+1,
-          system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2]  );
-          fprintf( fout, "vcm:  %g %g %g\n",
-          data->vcm[0], data->vcm[1], data->vcm[2] );
-        */
-        /* fprintf( stderr, "amcm: %12.6f %12.6f %12.6f\n",
-           data->amcm[0], data->amcm[1], data->amcm[2] ); */
     }
 
     rvec_Scale( data->xcm, data->inv_M, data->xcm );
@@ -147,10 +145,14 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
     inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
     inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
 
-    if ( fabs(det) > ALMOST_ZERO )
+    if ( FABS(det) > ALMOST_ZERO )
+    {
         rtensor_Scale( inv, 1. / det, inv );
+    }
     else
+    {
         rtensor_MakeZero( inv );
+    }
 
     /* Compute the angular velocity about the centre of mass */
     rtensor_MatVec( data->avcm, inv, data->amcm );
@@ -186,7 +188,7 @@ void Compute_Kinetic_Energy( reax_system* system, simulation_data* data )
 
     data->E_Kin = 0.0;
 
-    for (i = 0; i < system->N; i++)
+    for ( i = 0; i < system->N; i++ )
     {
         m = system->reaxprm.sbp[system->atoms[i].type].mass;
 
@@ -200,8 +202,10 @@ void Compute_Kinetic_Energy( reax_system* system, simulation_data* data )
 
     data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B);
 
-    if ( fabs(data->therm.T) < ALMOST_ZERO ) /* avoid T being an absolute zero! */
+    if ( FABS(data->therm.T) < ALMOST_ZERO ) /* avoid T being an absolute zero! */
+    {
         data->therm.T = ALMOST_ZERO;
+    }
 }
 
 
@@ -214,8 +218,7 @@ void Compute_Kinetic_Energy( reax_system* system, simulation_data* data )
  *  We may want to add that for more accuracy.
  */
 void Compute_Pressure_Isotropic( reax_system* system, control_params *control,
-                                 simulation_data* data,
-                                 output_controls *out_control )
+        simulation_data* data, output_controls *out_control )
 {
     int i;
     reax_atom *p_atom;
@@ -273,8 +276,7 @@ void Compute_Pressure_Isotropic( reax_system* system, control_params *control,
 }
 
 
-void Compute_Pressure_Isotropic_Klein( reax_system* system,
-                                       simulation_data* data )
+void Compute_Pressure_Isotropic_Klein( reax_system* system, simulation_data* data )
 {
     int i;
     reax_atom *p_atom;
@@ -301,7 +303,7 @@ void Compute_Pressure_Isotropic_Klein( reax_system* system,
 
 
 void Compute_Pressure( reax_system* system, simulation_data* data,
-                       static_storage *workspace )
+        static_storage *workspace )
 {
     int i;
     reax_atom *p_atom;
diff --git a/sPuReMD/src/testmd.c b/sPuReMD/src/testmd.c
index c4c6645375434287bc670b0317ed116254502d46..d1159ef12df83940476f795498d04b35159b997c 100644
--- a/sPuReMD/src/testmd.c
+++ b/sPuReMD/src/testmd.c
@@ -47,7 +47,7 @@ static void Post_Evolve( reax_system * const system,
     /* if velocity dependent force then
        {
        Generate_Neighbor_Lists( &system, &control, &lists );
-       QEq(system, control, workspace, lists[FAR_NBRS]);
+       Compute_Charges(system, control, workspace, lists[FAR_NBRS]);
        Introduce compute_force here if we are using velocity dependent forces
        Compute_Forces(system,control,data,workspace,lists);
        } */
@@ -135,6 +135,9 @@ void static Read_System( char * const geo_file,
         exit( INVALID_GEO );
     }
 
+    fclose( ffield );
+    fclose( ctrl );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "input files have been read...\n" );
     Print_Box( &(system->box), stderr );
@@ -168,16 +171,16 @@ int main(int argc, char* argv[])
     lists = (list*) malloc( sizeof(list) * LIST_N );
 
     Read_System( argv[1], argv[2], argv[3], &system, &control,
-                 &data, &workspace, &out_control );
+            &data, &workspace, &out_control );
 
     Initialize( &system, &control, &data, &workspace, &lists,
-                &out_control, &Evolve );
+            &out_control, &Evolve );
 
     /* compute f_0 */
     //if( control.restart == 0 ) {
     Reset( &system, &control, &data, &workspace, &lists );
-    Generate_Neighbor_Lists( &system, &control, &data, &workspace,
-                             &lists, &out_control );
+    Generate_Neighbor_Lists( &system, &control, &data, &workspace, 
+            &lists, &out_control );
 
     //fprintf( stderr, "total: %.2f secs\n", data.timing.nbrs);
     Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control);
@@ -186,8 +189,7 @@ int main(int argc, char* argv[])
     ++data.step;
     //}
     //
-
-
+    
     for ( ; data.step <= control.nsteps; data.step++ )
     {
         if ( control.T_mode )
@@ -196,13 +198,15 @@ int main(int argc, char* argv[])
         }
         Evolve( &system, &control, &data, &workspace, &lists, &out_control );
         Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
-        Output_Results(&system, &control, &data, &workspace, &lists, &out_control);
+        Output_Results( &system, &control, &data, &workspace, &lists, &out_control );
         Analysis( &system, &control, &data, &workspace, &lists, &out_control );
 
         steps = data.step - data.prev_steps;
         if ( steps && out_control.restart_freq &&
                 steps % out_control.restart_freq == 0 )
+        {
             Write_Restart( &system, &control, &data, &workspace, &out_control );
+        }
     }
 
     if ( out_control.write_steps > 0 )
@@ -215,5 +219,10 @@ int main(int argc, char* argv[])
     data.timing.elapsed = Get_Timing_Info( data.timing.start );
     fprintf( out_control.log, "total: %.2f secs\n", data.timing.elapsed );
 
+    Finalize( &system, &control, &data, &workspace, &lists,
+            &out_control );
+
+    free( lists );
+
     return SUCCESS;
 }
diff --git a/sPuReMD/src/three_body_interactions.c b/sPuReMD/src/three_body_interactions.c
index 078d951a159f7c11c1d13a8345dc25ce7b055620..c54def418822fc7c2fb325ecf571b7c7940bd8b6 100644
--- a/sPuReMD/src/three_body_interactions.c
+++ b/sPuReMD/src/three_body_interactions.c
@@ -28,11 +28,17 @@
 
 /* calculates the theta angle between i-j-k */
 void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
-                      real *theta, real *cos_theta )
+        real *theta, real *cos_theta )
 {
     (*cos_theta) = rvec_Dot( dvec_ji, dvec_jk ) / ( d_ji * d_jk );
-    if ( *cos_theta > 1. ) *cos_theta  = 1.0;
-    if ( *cos_theta < -1. ) *cos_theta  = -1.0;
+    if ( *cos_theta > 1.0 )
+    {
+        *cos_theta  = 1.0;
+    }
+    if ( *cos_theta < -1.0 )
+    {
+        *cos_theta  = -1.0;
+    }
 
     (*theta) = ACOS( *cos_theta );
 }
@@ -40,27 +46,28 @@ void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
 
 /* calculates the derivative of the cosine of the angle between i-j-k */
 void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
-                           rvec* dcos_theta_di, rvec* dcos_theta_dj,
-                           rvec* dcos_theta_dk )
+        rvec* dcos_theta_di, rvec* dcos_theta_dj, rvec* dcos_theta_dk )
 {
-    int  t;
-    real sqr_d_ji   = SQR(d_ji);
-    real sqr_d_jk   = SQR(d_jk);
-    real inv_dists  = 1.0 / (d_ji * d_jk);
-    real inv_dists3 = POW( inv_dists, 3 );
-    real dot_dvecs  = rvec_Dot( dvec_ji, dvec_jk );
-    real Cdot_inv3  = dot_dvecs * inv_dists3;
+    int t;
+    real sqr_d_ji, sqr_d_jk, inv_dists, inv_dists3, dot_dvecs, Cdot_inv3;
+
+    sqr_d_ji = SQR( d_ji );
+    sqr_d_jk = SQR( d_jk );
+    inv_dists = 1.0 / (d_ji * d_jk);
+    inv_dists3 = POW( inv_dists, 3 );
+    dot_dvecs = rvec_Dot( dvec_ji, dvec_jk );
+    Cdot_inv3 = dot_dvecs * inv_dists3;
 
     for ( t = 0; t < 3; ++t )
     {
         (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists -
-                              Cdot_inv3 * sqr_d_jk * dvec_ji[t];
+            Cdot_inv3 * sqr_d_jk * dvec_ji[t];
 
         (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists +
-                              Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] );
+            Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] );
 
         (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists -
-                              Cdot_inv3 * sqr_d_ji * dvec_jk[t];
+            Cdot_inv3 * sqr_d_ji * dvec_jk[t];
     }
 
     /*fprintf( stderr,
@@ -72,497 +79,566 @@ void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
 /* this is a 3-body interaction in which the main role is
    played by j which sits in the middle of the other two. */
 void Three_Body_Interactions( reax_system *system, control_params *control,
-                              simulation_data *data, static_storage *workspace,
-                              list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
-    int  i, j, pi, k, pk, t;
-    int  type_i, type_j, type_k;
-    int  start_j, end_j, start_pk, end_pk;
-    int  flag, cnt, num_thb_intrs;
-
-    real temp, temp_bo_jt, pBOjt7;
-    real p_val1, p_val2, p_val3, p_val4, p_val5;
-    real p_val6, p_val7, p_val8, p_val9, p_val10;
-    real p_pen1, p_pen2, p_pen3, p_pen4;
-    real p_coa1, p_coa2, p_coa3, p_coa4;
-    real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
-    real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
-    real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO;
-    real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
-    real CEpen1, CEpen2, CEpen3;
-    real e_ang, e_coa, e_pen;
-    real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
-    real Cf7ij, Cf7jk, Cf8j, Cf9j;
-    real f7_ij, f7_jk, f8_Dj, f9_Dj;
-    real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
-    real r_ij, r_jk;
-    real BOA_ij, BOA_jk;
-    real vlpadj;
-    rvec force, ext_press;
-    // rtensor temp_rtensor, total_rtensor;
     real *total_bo;
-    three_body_header *thbh;
-    three_body_parameters *thbp;
-    three_body_interaction_data *p_ijk, *p_kji;
-    bond_data *pbond_ij, *pbond_jk, *pbond_jt;
-    bond_order_data *bo_ij, *bo_jk, *bo_jt;
     list *bonds, *thb_intrs;
     bond_data *bond_list;
     three_body_interaction_data *thb_list;
+    real p_pen2, p_pen3, p_pen4;
+    real p_coa2, p_coa3, p_coa4;
+    real p_val6, p_val8, p_val9, p_val10;
+    int num_thb_intrs;
+    real e_ang_total, e_pen_total, e_coa_total;
 
     total_bo = workspace->total_bond_order;
     bonds = (*lists) + BONDS;
     bond_list = bonds->select.bond_list;
     thb_intrs = (*lists) + THREE_BODIES;
     thb_list = thb_intrs->select.three_body_list;
-
     /* global parameters used in these calculations */
+    p_pen2 = system->reaxprm.gp.l[19];
+    p_pen3 = system->reaxprm.gp.l[20];
+    p_pen4 = system->reaxprm.gp.l[21];
+    p_coa2 = system->reaxprm.gp.l[2];
+    p_coa3 = system->reaxprm.gp.l[38];
+    p_coa4 = system->reaxprm.gp.l[30];
     p_val6 = system->reaxprm.gp.l[14];
     p_val8 = system->reaxprm.gp.l[33];
     p_val9 = system->reaxprm.gp.l[16];
     p_val10 = system->reaxprm.gp.l[17];
     num_thb_intrs = 0;
+    e_ang_total = 0.0;
+    e_pen_total = 0.0;
+    e_coa_total = 0.0;
 
-    for ( j = 0; j < system->N; ++j )
+    //TODO: change interaction lists for parallelization
+#ifdef _OPENMP
+//    #pragma omp parallel default(shared) reduction(+:e_ang_total, e_pen_total, e_coa_total, num_thb_intrs)
+#endif
     {
-        // fprintf( out_control->eval, "j: %d\n", j );
-        type_j = system->atoms[j].type;
-        start_j = Start_Index(j, bonds);
-        end_j = End_Index(j, bonds);
-
-        p_val3 = system->reaxprm.sbp[ type_j ].p_val3;
-        p_val5 = system->reaxprm.sbp[ type_j ].p_val5;
-
-        SBOp = 0, prod_SBO = 1;
-        for ( t = start_j; t < end_j; ++t )
-        {
-            bo_jt = &(bond_list[t].bo_data);
-            SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
-            temp = SQR( bo_jt->BO );
-            temp *= temp;
-            temp *= temp;
-            prod_SBO *= EXP( -temp );
-        }
+        int i, j, pi, k, pk, t;
+        int type_i, type_j, type_k;
+        int start_j, end_j, start_pk, end_pk;
+        int cnt;
+        real temp, temp_bo_jt, pBOjt7;
+        real p_val1, p_val2, p_val3, p_val4, p_val5, p_val7;
+        real p_pen1;
+        real p_coa1;
+        real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
+        real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
+        real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO;
+        real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
+        real CEpen1, CEpen2, CEpen3;
+        real e_ang, e_coa, e_pen;
+        real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
+        real Cf7ij, Cf7jk, Cf8j, Cf9j;
+        real f7_ij, f7_jk, f8_Dj, f9_Dj;
+        real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
+        real BOA_ij, BOA_jk;
+        real vlpadj;
+        rvec force, ext_press;
+        //rtensor temp_rtensor, total_rtensor;
+        three_body_header *thbh;
+        three_body_parameters *thbp;
+        three_body_interaction_data *p_ijk, *p_kji;
+        bond_data *pbond_ij, *pbond_jk, *pbond_jt;
+        bond_order_data *bo_ij, *bo_jk, *bo_jt;
+        rvec *f_i, *f_j, *f_k;
+#ifdef _OPENMP
+//        int tid = omp_get_thread_num( );
+#endif
 
-        /* modifications to match Adri's code - 09/01/09 */
-        if ( workspace->vlpex[j] >= 0 )
+        for ( j = 0; j < system->N; ++j )
         {
-            vlpadj = 0;
-            dSBO2 = prod_SBO - 1;
-        }
-        else
-        {
-            vlpadj = workspace->nlp[j];
-            dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
-        }
-
-        SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
-        dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
+            // fprintf( out_control->eval, "j: %d\n", j );
+            type_j = system->atoms[j].type;
+            start_j = Start_Index(j, bonds);
+            end_j = End_Index(j, bonds);
+//#ifdef _OPENMP
+//            f_j = &(workspace->f_local[tid * system->N + j]);
+//#else
+            f_j = &(system->atoms[j].f);
+//#endif
+
+            p_val3 = system->reaxprm.sbp[ type_j ].p_val3;
+            p_val5 = system->reaxprm.sbp[ type_j ].p_val5;
+
+            SBOp = 0.0;
+            prod_SBO = 1.0;
+            for ( t = start_j; t < end_j; ++t )
+            {
+                bo_jt = &(bond_list[t].bo_data);
+                SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
+                temp = SQR( bo_jt->BO );
+                temp *= temp;
+                temp *= temp;
+                prod_SBO *= EXP( -temp );
+            }
 
-        if ( SBO <= 0 )
-            SBO2 = 0, CSBO2 = 0;
-        else if ( SBO > 0 && SBO <= 1 )
-        {
-            SBO2 = POW( SBO, p_val9 );
-            CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
-        }
-        else if ( SBO > 1 && SBO < 2 )
-        {
-            SBO2 = 2 - POW( 2 - SBO, p_val9 );
-            CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
-        }
-        else
-            SBO2 = 2, CSBO2 = 0;
+            /* modifications to match Adri's code - 09/01/09 */
+            if ( workspace->vlpex[j] >= 0.0 )
+            {
+                vlpadj = 0.0;
+                dSBO2 = prod_SBO - 1.0;
+            }
+            else
+            {
+                vlpadj = workspace->nlp[j];
+                dSBO2 = (prod_SBO - 1.0) * (1.0 - p_val8 * workspace->dDelta_lp[j]);
+            }
 
-        expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
+            SBO = SBOp + (1.0 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
+            dSBO1 = -8.0 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
 
-        /* unlike 2-body intrs where we enforce i<j, we cannot put any such
-           restrictions here. such a restriction would prevent us from producing
-           all 4-body intrs correctly */
-        for ( pi = start_j; pi < end_j; ++pi )
-        {
-            Set_Start_Index( pi, num_thb_intrs, thb_intrs );
-            pbond_ij = &(bond_list[pi]);
-            bo_ij = &(pbond_ij->bo_data);
-            BOA_ij = bo_ij->BO - control->thb_cut;
+            if ( SBO <= 0.0 )
+            {
+                SBO2 = 0.0;
+                CSBO2 = 0.0;
+            }
+            else if ( SBO > 0.0 && SBO <= 1.0 )
+            {
+                SBO2 = POW( SBO, p_val9 );
+                CSBO2 = p_val9 * POW( SBO, p_val9 - 1.0 );
+            }
+            else if ( SBO > 1.0 && SBO < 2.0 )
+            {
+                SBO2 = 2.0 - POW( 2.0 - SBO, p_val9 );
+                CSBO2 = p_val9 * POW( 2.0 - SBO, p_val9 - 1.0 );
+            }
+            else
+            {
+                SBO2 = 2.0;
+                CSBO2 = 0.0;
+            }
 
+            expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
 
-            if ( BOA_ij/*bo_ij->BO*/ > 0.0 )
+            /* Unlike 2-body interactions, where we enforce i < j, we cannot impose
+               any such restriction here: doing so would prevent us from producing
+               all 4-body interactions correctly. */
+            for ( pi = start_j; pi < end_j; ++pi )
             {
-                i = pbond_ij->nbr;
-                r_ij = pbond_ij->d;
-                type_i = system->atoms[i].type;
-                // fprintf( out_control->eval, "i: %d\n", i );
-
-
-                /* first copy 3-body intrs from previously computed ones where i>k.
-                   IMPORTANT: if it is less costly to compute theta and its
-                   derivative, we should definitely re-compute them,
-                   instead of copying!
-                   in the second for-loop below, we compute only new 3-body intrs
-                   where i < k */
-                for ( pk = start_j; pk < pi; ++pk )
+                Set_Start_Index( pi, num_thb_intrs, thb_intrs );
+                pbond_ij = &(bond_list[pi]);
+                bo_ij = &(pbond_ij->bo_data);
+                BOA_ij = bo_ij->BO - control->thb_cut;
+
+                if ( BOA_ij > 0.0 )
                 {
-                    // fprintf( out_control->eval, "pk: %d\n", pk );
-                    start_pk = Start_Index( pk, thb_intrs );
-                    end_pk = End_Index( pk, thb_intrs );
+                    i = pbond_ij->nbr;
+                    type_i = system->atoms[i].type;
+//#ifdef _OPENMP
+//                    f_i = &(workspace->f_local[tid * system->N + i]);
+//#else
+                    f_i = &(system->atoms[i].f);
+//#endif
+
+                    /* First, copy the 3-body interactions previously computed for
+                       bonds preceding pi in j's bond list (pk < pi).
+                       IMPORTANT: if it is less costly to compute theta and its
+                       derivative, we should definitely re-compute them instead of
+                       copying! The second for-loop below computes only the new
+                       3-body interactions, i.e., those with pk > pi. */
+                    for ( pk = start_j; pk < pi; ++pk )
+                    {
+                        // fprintf( out_control->eval, "pk: %d\n", pk );
+                        start_pk = Start_Index( pk, thb_intrs );
+                        end_pk = End_Index( pk, thb_intrs );
 
-                    for ( t = start_pk; t < end_pk; ++t )
-                        if ( thb_list[t].thb == i )
+                        for ( t = start_pk; t < end_pk; ++t )
                         {
-                            p_ijk = &(thb_list[num_thb_intrs]);
-                            p_kji = &(thb_list[t]);
-
-                            p_ijk->thb = bond_list[pk].nbr;
-                            p_ijk->pthb  = pk;
-                            p_ijk->theta = p_kji->theta;
-                            rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
-                            rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
-                            rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
-
-                            ++num_thb_intrs;
-                            break;
+                            if ( thb_list[t].thb == i )
+                            {
+                                p_ijk = &(thb_list[num_thb_intrs]);
+                                p_kji = &(thb_list[t]);
+
+                                p_ijk->thb = bond_list[pk].nbr;
+                                p_ijk->pthb  = pk;
+                                p_ijk->theta = p_kji->theta;
+                                rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
+                                rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
+                                rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
+
+                                ++num_thb_intrs;
+                                break;
+                            }
                         }
-                }
-
-
-                /* and this is the second for loop mentioned above */
-                for ( pk = pi + 1; pk < end_j; ++pk )
-                {
-                    pbond_jk = &(bond_list[pk]);
-                    bo_jk    = &(pbond_jk->bo_data);
-                    BOA_jk   = bo_jk->BO - control->thb_cut;
-                    k        = pbond_jk->nbr;
-                    type_k   = system->atoms[k].type;
-                    p_ijk    = &( thb_list[num_thb_intrs] );
-
-                    //CHANGE ORIGINAL
-                    if (BOA_jk <= 0) continue;
-                    //CHANGE ORIGINAL
-
-
-                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d,
-                                     pbond_jk->dvec, pbond_jk->d,
-                                     &theta, &cos_theta );
-
-                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d,
-                                          pbond_jk->dvec, pbond_jk->d,
-                                          &(p_ijk->dcos_di), &(p_ijk->dcos_dj),
-                                          &(p_ijk->dcos_dk) );
+                    }
 
-                    p_ijk->thb = k;
-                    p_ijk->pthb = pk;
-                    p_ijk->theta = theta;
 
-                    sin_theta = SIN( theta );
-                    if ( sin_theta < 1.0e-5 )
-                        sin_theta = 1.0e-5;
+                    /* and this is the second for loop mentioned above */
+                    for ( pk = pi + 1; pk < end_j; ++pk )
+                    {
+                        pbond_jk = &(bond_list[pk]);
+                        bo_jk = &(pbond_jk->bo_data);
+                        BOA_jk = bo_jk->BO - control->thb_cut;
+                        k = pbond_jk->nbr;
+                        type_k = system->atoms[k].type;
+                        p_ijk = &( thb_list[num_thb_intrs] );
+//#ifdef _OPENMP
+//                        f_k = &(workspace->f_local[tid * system->N + k]);
+//#else
+                        f_k = &(system->atoms[k].f);
+//#endif
+
+                        //CHANGE ORIGINAL
+                        if ( BOA_jk <= 0 )
+                        {
+                            continue;
+                        }
+                        //CHANGE ORIGINAL
 
-                    ++num_thb_intrs;
+                        Calculate_Theta( pbond_ij->dvec, pbond_ij->d,
+                                pbond_jk->dvec, pbond_jk->d,
+                                &theta, &cos_theta );
 
+                        Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d,
+                                pbond_jk->dvec, pbond_jk->d,
+                                &(p_ijk->dcos_di), &(p_ijk->dcos_dj),
+                                &(p_ijk->dcos_dk) );
 
-                    if ( BOA_jk > 0.0 &&
-                            (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/)
-                    {
-                        r_jk = pbond_jk->d;
-                        thbh = &( system->reaxprm.thbp[type_i][type_j][type_k] );
-                        flag = 0;
+                        p_ijk->thb = k;
+                        p_ijk->pthb = pk;
+                        p_ijk->theta = theta;
 
-                        /* if( workspace->orig_id[i] < workspace->orig_id[k] )
-                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n",
-                           workspace->orig_id[i], workspace->orig_id[j],
-                           workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta );
-                           else
-                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n",
-                           workspace->orig_id[k], workspace->orig_id[j],
-                           workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
+                        sin_theta = SIN( theta );
+                        if ( sin_theta < 1.0e-5 )
+                        {
+                            sin_theta = 1.0e-5;
+                        }
 
+                        ++num_thb_intrs;
 
-                        for ( cnt = 0; cnt < thbh->cnt; ++cnt )
+                        if ( BOA_jk > 0.0 &&
+                                (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/)
                         {
-                            // fprintf( out_control->eval,
-                            // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 );
+                            thbh = &( system->reaxprm.thbp[type_i][type_j][type_k] );
 
-                            if ( fabs(thbh->prm[cnt].p_val1) > 0.001 )
+                            /* if( workspace->orig_id[i] < workspace->orig_id[k] )
+                               fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n",
+                               workspace->orig_id[i], workspace->orig_id[j],
+                               workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta );
+                               else
+                               fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n",
+                               workspace->orig_id[k], workspace->orig_id[j],
+                               workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
+
+                            for ( cnt = 0; cnt < thbh->cnt; ++cnt )
                             {
-                                thbp = &( thbh->prm[cnt] );
-
-                                /* ANGLE ENERGY */
-                                p_val1 = thbp->p_val1;
-                                p_val2 = thbp->p_val2;
-                                p_val4 = thbp->p_val4;
-                                p_val7 = thbp->p_val7;
-                                theta_00 = thbp->theta_00;
-
-                                exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
-                                f7_ij = 1.0 - exp3ij;
-                                Cf7ij = p_val3 * p_val4 *
+                                if ( FABS(thbh->prm[cnt].p_val1) > 0.001 )
+                                {
+                                    thbp = &( thbh->prm[cnt] );
+
+                                    /* ANGLE ENERGY */
+                                    p_val1 = thbp->p_val1;
+                                    p_val2 = thbp->p_val2;
+                                    p_val4 = thbp->p_val4;
+                                    p_val7 = thbp->p_val7;
+                                    theta_00 = thbp->theta_00;
+
+                                    exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
+                                    f7_ij = 1.0 - exp3ij;
+                                    Cf7ij = p_val3 * p_val4 *
                                         POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
 
-                                exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
-                                f7_jk = 1.0 - exp3jk;
-                                Cf7jk = p_val3 * p_val4 *
+                                    exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
+                                    f7_jk = 1.0 - exp3jk;
+                                    Cf7jk = p_val3 * p_val4 *
                                         POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
 
-                                expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
-                                trm8 = 1.0 + expval6 + expval7;
-                                f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
-                                Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
-                                       (p_val6 * expval6 * trm8 -
-                                        (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
-
-                                theta_0 = 180.0 -
-                                          theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
-                                theta_0 = DEG2RAD( theta_0 );
-
-                                expval2theta  = EXP(-p_val2 * SQR(theta_0 - theta));
-                                if ( p_val1 >= 0 )
-                                    expval12theta = p_val1 * (1.0 - expval2theta);
-                                else // To avoid linear Me-H-Me angles (6/6/06)
-                                    expval12theta = p_val1 * -expval2theta;
-
-                                CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
-                                CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
-                                CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
-                                CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj *
-                                         expval2theta * (theta_0 - theta);
-
-                                Ctheta_0 = p_val10 * DEG2RAD(theta_00) *
-                                           exp( -p_val10 * (2.0 - SBO2) );
-
-                                CEval5 = -CEval4 * Ctheta_0 * CSBO2;
-                                CEval6 = CEval5 * dSBO1;
-                                CEval7 = CEval5 * dSBO2;
-                                CEval8 = -CEval4 / sin_theta;
-
-                                data->E_Ang += e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
-                                /* END ANGLE ENERGY*/
-
-
-                                /* PENALTY ENERGY */
-                                p_pen1 = thbp->p_pen1;
-                                p_pen2 = system->reaxprm.gp.l[19];
-                                p_pen3 = system->reaxprm.gp.l[20];
-                                p_pen4 = system->reaxprm.gp.l[21];
-
-                                exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
-                                exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
-                                exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
-                                exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
-                                trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
-                                f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
-                                Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 -
-                                        (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 +
-                                                             p_pen4 * exp_pen4 )) /
-                                       SQR( trm_pen34 );
-
-                                data->E_Pen += e_pen =
-                                                   p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
-
-                                CEpen1 = e_pen * Cf9j / f9_Dj;
-                                temp   = -2.0 * p_pen2 * e_pen;
-                                CEpen2 = temp * (BOA_ij - 2.0);
-                                CEpen3 = temp * (BOA_jk - 2.0);
-                                /* END PENALTY ENERGY */
-
-
-                                /* COALITION ENERGY */
-                                p_coa1 = thbp->p_coa1;
-                                p_coa2 = system->reaxprm.gp.l[2];
-                                p_coa3 = system->reaxprm.gp.l[38];
-                                p_coa4 = system->reaxprm.gp.l[30];
-
-                                exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
-                                data->E_Coa += e_coa =
-                                                   p_coa1 / (1. + exp_coa2) *
-                                                   EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) *
-                                                   EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) *
-                                                   EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) *
-                                                   EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
-
-                                CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
-                                CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
-                                CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2);
-                                CEcoa4 = -2 * p_coa3 * (total_bo[i] - BOA_ij) * e_coa;
-                                CEcoa5 = -2 * p_coa3 * (total_bo[k] - BOA_jk) * e_coa;
-                                /* END COALITION ENERGY */
-
-                                /* FORCES */
-                                bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
-                                bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
-                                workspace->CdDelta[j] += ((CEval3 + CEval7) +
-                                                          CEpen1 + CEcoa3);
-                                workspace->CdDelta[i] += CEcoa4;
-                                workspace->CdDelta[k] += CEcoa5;
-
-                                for ( t = start_j; t < end_j; ++t )
-                                {
-                                    pbond_jt = &( bond_list[t] );
-                                    bo_jt = &(pbond_jt->bo_data);
-                                    temp_bo_jt = bo_jt->BO;
-                                    temp = CUBE( temp_bo_jt );
-                                    pBOjt7 = temp * temp * temp_bo_jt;
-
-                                    // fprintf( out_control->eval, "%6d%12.8f\n",
-                                    // workspace->orig_id[ bond_list[t].nbr ],
-                                    //    (CEval6 * pBOjt7) );
-
-                                    bo_jt->Cdbo += (CEval6 * pBOjt7);
-                                    bo_jt->Cdbopi += CEval5;
-                                    bo_jt->Cdbopi2 += CEval5;
-                                }
+                                    expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
+                                    trm8 = 1.0 + expval6 + expval7;
+                                    f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
+                                    Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
+                                        (p_val6 * expval6 * trm8 -
+                                         (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
+
+                                    theta_0 = 180.0 - theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
+                                    theta_0 = DEG2RAD( theta_0 );
+
+                                    expval2theta  = EXP(-p_val2 * SQR(theta_0 - theta));
+                                    if ( p_val1 >= 0 )
+                                    {
+                                        expval12theta = p_val1 * (1.0 - expval2theta);
+                                    }
+                                    /* To avoid linear Me-H-Me angles (6/6/06) */
+                                    else
+                                    {
+                                        expval12theta = p_val1 * -expval2theta;
+                                    }
+
+                                    CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
+                                    CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
+                                    CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
+                                    CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj *
+                                        expval2theta * (theta_0 - theta);
+
+                                    Ctheta_0 = p_val10 * DEG2RAD(theta_00) *
+                                        EXP( -p_val10 * (2.0 - SBO2) );
+
+                                    CEval5 = -CEval4 * Ctheta_0 * CSBO2;
+                                    CEval6 = CEval5 * dSBO1;
+                                    CEval7 = CEval5 * dSBO2;
+                                    CEval8 = -CEval4 / sin_theta;
+
+                                    e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
+                                    e_ang_total += e_ang;
+                                    /* END ANGLE ENERGY*/
+
+                                    /* PENALTY ENERGY */
+                                    p_pen1 = thbp->p_pen1;
+
+                                    exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
+                                    exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
+                                    exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
+                                    exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
+                                    trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
+                                    f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
+                                    Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 -
+                                            (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 +
+                                                p_pen4 * exp_pen4 )) / SQR( trm_pen34 );
+
+                                    e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
+                                    e_pen_total += e_pen;
+
+                                    CEpen1 = e_pen * Cf9j / f9_Dj;
+                                    temp = -2.0 * p_pen2 * e_pen;
+                                    CEpen2 = temp * (BOA_ij - 2.0);
+                                    CEpen3 = temp * (BOA_jk - 2.0);
+                                    /* END PENALTY ENERGY */
+
+                                    /* COALITION ENERGY */
+                                    p_coa1 = thbp->p_coa1;
+
+                                    exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
+                                    e_coa = p_coa1 / (1. + exp_coa2) *
+                                        EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) *
+                                        EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) *
+                                        EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) *
+                                        EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
+                                    e_coa_total += e_coa;
+
+                                    CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
+                                    CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
+                                    CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2);
+                                    CEcoa4 = -2 * p_coa3 * (total_bo[i] - BOA_ij) * e_coa;
+                                    CEcoa5 = -2 * p_coa3 * (total_bo[k] - BOA_jk) * e_coa;
+                                    /* END COALITION ENERGY */
+
+                                    /* FORCES */
+#ifdef _OPENMP
+//                                    #pragma omp atomic
+#endif
+                                    bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
+#ifdef _OPENMP
+//                                    #pragma omp atomic
+#endif
+                                    bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
+#ifdef _OPENMP
+//                                    #pragma omp atomic
+#endif
+                                    workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3);
+#ifdef _OPENMP
+//                                    #pragma omp atomic
+#endif
+                                    workspace->CdDelta[i] += CEcoa4;
+#ifdef _OPENMP
+//                                    #pragma omp atomic
+#endif
+                                    workspace->CdDelta[k] += CEcoa5;
+
+                                    for ( t = start_j; t < end_j; ++t )
+                                    {
+                                        pbond_jt = &( bond_list[t] );
+                                        bo_jt = &(pbond_jt->bo_data);
+                                        temp_bo_jt = bo_jt->BO;
+                                        temp = CUBE( temp_bo_jt );
+                                        pBOjt7 = temp * temp * temp_bo_jt;
+
+                                        // fprintf( out_control->eval, "%6d%12.8f\n",
+                                        // workspace->orig_id[ bond_list[t].nbr ],
+                                        //    (CEval6 * pBOjt7) );
+
+#ifdef _OPENMP
+//                                        #pragma omp atomic
+#endif
+                                        bo_jt->Cdbo += (CEval6 * pBOjt7);
+#ifdef _OPENMP
+//                                        #pragma omp atomic
+#endif
+                                        bo_jt->Cdbopi += CEval5;
+#ifdef _OPENMP
+//                                        #pragma omp atomic
+#endif
+                                        bo_jt->Cdbopi2 += CEval5;
+                                    }
+
+
+                                    if ( control->ensemble == NVE || control->ensemble == NVT  || control->ensemble == bNVT)
+                                    {
+                                        rvec_ScaledAdd( *f_i, CEval8, p_ijk->dcos_di );
+                                        rvec_ScaledAdd( *f_j, CEval8, p_ijk->dcos_dj );
+                                        rvec_ScaledAdd( *f_k, CEval8, p_ijk->dcos_dk );
+                                    }
+                                    else
+                                    {
+                                        /* terms not related to bond order derivatives
+                                           are added directly into
+                                           forces and pressure vector/tensor */
+                                        rvec_Scale( force, CEval8, p_ijk->dcos_di );
+                                        rvec_Add( *f_i, force );
+                                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+#ifdef _OPENMP
+//                                        #pragma omp critical (Three_Body_Interactions_ext_press)
+#endif
+                                        {
+                                            rvec_Add( data->ext_press, ext_press );
+                                        }
 
+                                        rvec_ScaledAdd( *f_j, CEval8, p_ijk->dcos_dj );
 
-                                if ( control->ensemble == NVE || control->ensemble == NVT  || control->ensemble == bNVT)
-                                {
-                                    rvec_ScaledAdd( system->atoms[i].f, CEval8, p_ijk->dcos_di );
-                                    rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj );
-                                    rvec_ScaledAdd( system->atoms[k].f, CEval8, p_ijk->dcos_dk );
-                                }
-                                else
-                                {
-                                    /* terms not related to bond order derivatives
-                                       are added directly into
-                                       forces and pressure vector/tensor */
-                                    rvec_Scale( force, CEval8, p_ijk->dcos_di );
-                                    rvec_Add( system->atoms[i].f, force );
-                                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                    rvec_Add( data->ext_press, ext_press );
-
-                                    rvec_ScaledAdd( system->atoms[j].f,
-                                                    CEval8, p_ijk->dcos_dj );
-
-                                    rvec_Scale( force, CEval8, p_ijk->dcos_dk );
-                                    rvec_Add( system->atoms[k].f, force );
-                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                    rvec_Add( data->ext_press, ext_press );
-
-
-                                    /* This part is for a fully-flexible box */
-                                    /* rvec_OuterProduct( temp_rtensor,
-                                       p_ijk->dcos_di, system->atoms[i].x );
-                                       rtensor_Scale( total_rtensor, +CEval8, temp_rtensor );
-
-                                       rvec_OuterProduct( temp_rtensor,
-                                       p_ijk->dcos_dj, system->atoms[j].x );
-                                       rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
-
-                                       rvec_OuterProduct( temp_rtensor,
-                                       p_ijk->dcos_dk, system->atoms[k].x );
-                                       rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
-
-                                       if( pbond_ij->imaginary || pbond_jk->imaginary )
-                                       rtensor_ScaledAdd( data->flex_bar.P,
-                                       -1.0, total_rtensor );
-                                       else
-                                       rtensor_Add( data->flex_bar.P, total_rtensor ); */
-                                }
+                                        rvec_Scale( force, CEval8, p_ijk->dcos_dk );
+                                        rvec_Add( *f_k, force );
+                                        rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+#ifdef _OPENMP
+//                                        #pragma omp critical (Three_Body_Interactions_ext_press)
+#endif
+                                        {
+                                            rvec_Add( data->ext_press, ext_press );
+                                        }
+
+                                        /* This part is for a fully-flexible box */
+                                        /* rvec_OuterProduct( temp_rtensor,
+                                           p_ijk->dcos_di, system->atoms[i].x );
+                                           rtensor_Scale( total_rtensor, +CEval8, temp_rtensor );
+
+                                           rvec_OuterProduct( temp_rtensor,
+                                           p_ijk->dcos_dj, system->atoms[j].x );
+                                           rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
+
+                                           rvec_OuterProduct( temp_rtensor,
+                                           p_ijk->dcos_dk, system->atoms[k].x );
+                                           rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
+
+                                           if( pbond_ij->imaginary || pbond_jk->imaginary )
+                                           rtensor_ScaledAdd( data->flex_bar.P,
+                                           -1.0, total_rtensor );
+                                           else
+                                           rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                                    }
 
 #ifdef TEST_ENERGY
-                                fprintf( out_control->eval,
-                                         //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
-                                         "%6d%6d%6d%23.15e%23.15e%23.15e\n",
-                                         i + 1, j + 1, k + 1,
-                                         //workspace->orig_id[i]+1,
-                                         //workspace->orig_id[j]+1,
-                                         //workspace->orig_id[k]+1,
-                                         //workspace->Delta_boc[j],
-                                         RAD2DEG(theta), /*BOA_ij, BOA_jk, */
-                                         e_ang, data->E_Ang );
-
-                                /*fprintf( out_control->eval,
-                                  "%23.15e%23.15e%23.15e%23.15e",
-                                  p_val3, p_val4, BOA_ij, BOA_jk );
-                                  fprintf( out_control->eval,
-                                  "%23.15e%23.15e%23.15e%23.15e",
-                                  f7_ij, f7_jk, f8_Dj, expval12theta );
-                                  fprintf( out_control->eval,
-                                  "%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                  CEval1, CEval2, CEval3, CEval4, CEval5
-                                  //CEval6, CEval7, CEval8  );*/
-
-                                /*fprintf( out_control->eval,
-                                  "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                  -p_ijk->dcos_di[0]/sin_theta,
-                                  -p_ijk->dcos_di[1]/sin_theta,
-                                  -p_ijk->dcos_di[2]/sin_theta,
-                                  -p_ijk->dcos_dj[0]/sin_theta,
-                                  -p_ijk->dcos_dj[1]/sin_theta,
-                                  -p_ijk->dcos_dj[2]/sin_theta,
-                                  -p_ijk->dcos_dk[0]/sin_theta,
-                                  -p_ijk->dcos_dk[1]/sin_theta,
-                                  -p_ijk->dcos_dk[2]/sin_theta );*/
-
-                                /* fprintf( out_control->epen,
-                                   "%23.15e%23.15e%23.15e\n",
-                                   CEpen1, CEpen2, CEpen3 );
-                                   fprintf( out_control->epen,
-                                   "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                   workspace->orig_id[i],  workspace->orig_id[j],
-                                   workspace->orig_id[k], RAD2DEG(theta),
-                                   BOA_ij, BOA_jk, e_pen, data->E_Pen ); */
-
-                                fprintf( out_control->ecoa,
-                                         "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                         workspace->orig_id[i],
-                                         workspace->orig_id[j],
-                                         workspace->orig_id[k],
-                                         RAD2DEG(theta), BOA_ij, BOA_jk,
-                                         e_coa, data->E_Coa );
+                                    fprintf( out_control->eval,
+                                             //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
+                                             "%6d%6d%6d%23.15e%23.15e%23.15e\n",
+                                             i + 1, j + 1, k + 1,
+                                             //workspace->orig_id[i]+1,
+                                             //workspace->orig_id[j]+1,
+                                             //workspace->orig_id[k]+1,
+                                             //workspace->Delta_boc[j],
+                                             RAD2DEG(theta), /*BOA_ij, BOA_jk, */
+                                             e_ang, data->E_Ang );
+
+                                    /*fprintf( out_control->eval,
+                                      "%23.15e%23.15e%23.15e%23.15e",
+                                      p_val3, p_val4, BOA_ij, BOA_jk );
+                                      fprintf( out_control->eval,
+                                      "%23.15e%23.15e%23.15e%23.15e",
+                                      f7_ij, f7_jk, f8_Dj, expval12theta );
+                                      fprintf( out_control->eval,
+                                      "%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                      CEval1, CEval2, CEval3, CEval4, CEval5
+                                      //CEval6, CEval7, CEval8  );*/
+
+                                    /*fprintf( out_control->eval,
+                                      "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                      -p_ijk->dcos_di[0]/sin_theta,
+                                      -p_ijk->dcos_di[1]/sin_theta,
+                                      -p_ijk->dcos_di[2]/sin_theta,
+                                      -p_ijk->dcos_dj[0]/sin_theta,
+                                      -p_ijk->dcos_dj[1]/sin_theta,
+                                      -p_ijk->dcos_dj[2]/sin_theta,
+                                      -p_ijk->dcos_dk[0]/sin_theta,
+                                      -p_ijk->dcos_dk[1]/sin_theta,
+                                      -p_ijk->dcos_dk[2]/sin_theta );*/
+
+                                    /* fprintf( out_control->epen,
+                                       "%23.15e%23.15e%23.15e\n",
+                                       CEpen1, CEpen2, CEpen3 );
+                                       fprintf( out_control->epen,
+                                       "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                       workspace->orig_id[i],  workspace->orig_id[j],
+                                       workspace->orig_id[k], RAD2DEG(theta),
+                                       BOA_ij, BOA_jk, e_pen, data->E_Pen ); */
+
+                                    fprintf( out_control->ecoa,
+                                             "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                             workspace->orig_id[i],
+                                             workspace->orig_id[j],
+                                             workspace->orig_id[k],
+                                             RAD2DEG(theta), BOA_ij, BOA_jk,
+                                             e_coa, data->E_Coa );
 #endif
 
-#ifdef TEST_FORCES            /* angle forces */
-                                Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
-                                Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
-                                Add_dDelta( system, lists,
-                                            j, CEval3 + CEval7, workspace->f_ang );
-
-                                for ( t = start_j; t < end_j; ++t )
-                                {
-                                    pbond_jt = &( bond_list[t] );
-                                    bo_jt = &(pbond_jt->bo_data);
-                                    temp_bo_jt = bo_jt->BO;
-                                    temp = CUBE( temp_bo_jt );
-                                    pBOjt7 = temp * temp * temp_bo_jt;
-
-                                    Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
-                                             workspace->f_ang );
-                                    Add_dBOpinpi2( system, lists, j, t,
-                                                   CEval5, CEval5,
-                                                   workspace->f_ang, workspace->f_ang );
-                                }
-
-                                rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
-                                rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
-                                rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
-                                /* end angle forces */
-
-                                /* penalty forces */
-                                Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
-                                Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
-                                Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
-                                /* end penalty forces */
-
-                                /* coalition forces */
-                                Add_dBO( system, lists,
-                                         j, pi, CEcoa1 - CEcoa4, workspace->f_coa );
-                                Add_dBO( system, lists,
-                                         j, pk, CEcoa2 - CEcoa5, workspace->f_coa );
-                                Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
-                                Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
-                                Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
-                                /* end coalition forces */
+#ifdef TEST_FORCES
+                                    /* angle forces */
+                                    Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
+                                    Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
+                                    Add_dDelta( system, lists,
+                                                j, CEval3 + CEval7, workspace->f_ang );
+
+                                    for ( t = start_j; t < end_j; ++t )
+                                    {
+                                        pbond_jt = &( bond_list[t] );
+                                        bo_jt = &(pbond_jt->bo_data);
+                                        temp_bo_jt = bo_jt->BO;
+                                        temp = CUBE( temp_bo_jt );
+                                        pBOjt7 = temp * temp * temp_bo_jt;
+
+                                        Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
+                                                 workspace->f_ang );
+                                        Add_dBOpinpi2( system, lists, j, t,
+                                                       CEval5, CEval5,
+                                                       workspace->f_ang, workspace->f_ang );
+                                    }
+
+                                    rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
+                                    rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
+                                    rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
+                                    /* end angle forces */
+
+                                    /* penalty forces */
+                                    Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
+                                    Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
+                                    Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
+                                    /* end penalty forces */
+
+                                    /* coalition forces */
+                                    Add_dBO( system, lists,
+                                             j, pi, CEcoa1 - CEcoa4, workspace->f_coa );
+                                    Add_dBO( system, lists,
+                                             j, pk, CEcoa2 - CEcoa5, workspace->f_coa );
+                                    Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
+                                    Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
+                                    Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
+                                    /* end coalition forces */
 #endif
+                                }
                             }
                         }
                     }
                 }
-            }
 
-            Set_End_Index(pi, num_thb_intrs, thb_intrs );
+                Set_End_Index( pi, num_thb_intrs, thb_intrs );
+            }
         }
     }
 
+    data->E_Ang += e_ang_total;
+    data->E_Pen += e_pen_total;
+    data->E_Coa += e_coa_total;
 
     if ( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE )
     {
@@ -589,212 +665,272 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
 }
 
 
-
 void Hydrogen_Bonds( reax_system *system, control_params *control,
-                     simulation_data *data, static_storage *workspace,
-                     list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
-    int i, j, k, pi, pk, itr, top;
-    int type_i, type_j, type_k;
-    int start_j, end_j, hb_start_j, hb_end_j;
-    int hblist[MAX_BONDS];
-    int num_hb_intrs = 0;
-    real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
-    real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
-    rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
-    rvec dvec_jk, force, ext_press;
-    ivec rel_jk;
-    // rtensor temp_rtensor, total_rtensor;
-    hbond_parameters *hbp;
-    bond_order_data *bo_ij;
-    bond_data *pbond_ij;
-    far_neighbor_data *nbr_jk;
-    list *bonds, *hbonds;
-    bond_data *bond_list;
-    hbond_data *hbond_list;
+    real e_hb_total;
 
-    bonds = (*lists) + BONDS;
-    bond_list = bonds->select.bond_list;
+    e_hb_total = 0.0;
 
-    hbonds = (*lists) + HBONDS;
-    hbond_list = hbonds->select.hbond_list;
+#ifdef _OPENMP
+    #pragma omp parallel default(shared) reduction(+: e_hb_total)
+#endif
+    {
+        int i, j, k, pi, pk, itr, top;
+        int type_i, type_j, type_k;
+        int start_j, end_j, hb_start_j, hb_end_j;
+        int hblist[MAX_BONDS];
+#ifdef TEST_FORCES
+        int num_hb_intrs;
+#endif
+        real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+        real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
+        rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
+        rvec dvec_jk, force, ext_press;
+        ivec rel_jk;
+        //rtensor temp_rtensor, total_rtensor;
+        hbond_parameters *hbp;
+        bond_order_data *bo_ij;
+        bond_data *pbond_ij;
+        far_neighbor_data *nbr_jk;
+        list *bonds, *hbonds;
+        bond_data *bond_list;
+        hbond_data *hbond_list;
+        rvec *f_i, *f_j, *f_k;
+#ifdef _OPENMP
+        int tid = omp_get_thread_num( );
+#endif
 
-    /* loops below discover the Hydrogen bonds between i-j-k triplets.
-       here j is H atom and there has to be some bond between i and j.
-       Hydrogen bond is between j and k.
-       so in this function i->X, j->H, k->Z when we map
-       variables onto the ones in the handout.*/
-    for ( j = 0; j < system->N; ++j )
-        if ( system->reaxprm.sbp[system->atoms[j].type].p_hbond == 1 ) // j must be H
+#ifdef TEST_FORCES
+        num_hb_intrs = 0;
+#endif
+        bonds = (*lists) + BONDS;
+        bond_list = bonds->select.bond_list;
+        hbonds = (*lists) + HBONDS;
+        hbond_list = hbonds->select.hbond_list;
+
+        /* loops below discover the Hydrogen bonds between i-j-k triplets.
+           here j is H atom and there has to be some bond between i and j.
+           Hydrogen bond is between j and k.
+           so in this function i->X, j->H, k->Z when we map
+           variables onto the ones in the handout.*/
+#ifdef _OPENMP
+        #pragma omp for schedule(guided)
+#endif
+        for ( j = 0; j < system->N; ++j )
         {
-            /*set j's variables */
-            type_j  = system->atoms[j].type;
-            start_j = Start_Index(j, bonds);
-            end_j   = End_Index(j, bonds);
-            hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
-            hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
-
-            top = 0;
-            for ( pi = start_j; pi < end_j; ++pi )
+            /* j must be H */
+            if ( system->reaxprm.sbp[system->atoms[j].type].p_hbond == 1 )
             {
-                pbond_ij = &( bond_list[pi] );
-                i = pbond_ij->nbr;
-                bo_ij = &(pbond_ij->bo_data);
-                type_i = system->atoms[i].type;
-
-                if ( system->reaxprm.sbp[type_i].p_hbond == 2 &&
-                        bo_ij->BO >= HB_THRESHOLD )
-                    hblist[top++] = pi;
-            }
-
-            // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n",
-            //          j, top, hb_start_j, hb_end_j );
+                /* set j's variables */
+                type_j = system->atoms[j].type;
+                start_j = Start_Index( j, bonds );
+                end_j = End_Index( j, bonds );
+                hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
+                hb_end_j = End_Index( workspace->hbond_index[j], hbonds );
+#ifdef _OPENMP
+                f_j = &(workspace->f_local[tid * system->N + j]);
+#else
+                f_j = &(system->atoms[j].f);
+#endif
 
-            for ( pk = hb_start_j; pk < hb_end_j; ++pk )
-            {
-                /* set k's varibles */
-                k = hbond_list[pk].nbr;
-                type_k = system->atoms[k].type;
-                nbr_jk = hbond_list[pk].ptr;
-                r_jk = nbr_jk->d;
-                rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
-
-                for ( itr = 0; itr < top; ++itr )
+                top = 0;
+                for ( pi = start_j; pi < end_j; ++pi )
                 {
-                    pi = hblist[itr];
                     pbond_ij = &( bond_list[pi] );
                     i = pbond_ij->nbr;
+                    bo_ij = &(pbond_ij->bo_data);
+                    type_i = system->atoms[i].type;
 
-                    if ( i != k )
+                    if ( system->reaxprm.sbp[type_i].p_hbond == 2 &&
+                            bo_ij->BO >= HB_THRESHOLD )
                     {
-                        bo_ij = &(pbond_ij->bo_data);
-                        type_i = system->atoms[i].type;
-                        r_ij = pbond_ij->d;
-                        hbp = &(system->reaxprm.hbp[ type_i ][ type_j ][ type_k ]);
-                        ++num_hb_intrs;
-
-                        Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                         &theta, &cos_theta );
-                        /* the derivative of cos(theta) */
-                        Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                              &dcos_theta_di, &dcos_theta_dj,
-                                              &dcos_theta_dk );
-
-                        /* hydrogen bond energy*/
-                        sin_theta2 = SIN( theta / 2.0 );
-                        sin_xhz4 = SQR(sin_theta2);
-                        sin_xhz4 *= sin_xhz4;
-                        cos_xhz1 = ( 1.0 - cos_theta );
-                        exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
-                        exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk +
-                                                       r_jk / hbp->r0_hb - 2.0 ) );
-
-                        data->E_HB += e_hb =
-                                          hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-
-                        CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
-                        CEhb2 = -hbp->p_hb1 / 2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
-                        CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) +
-                                                      1.0 / hbp->r0_hb);
-
-                        /* hydrogen bond forces */
-                        bo_ij->Cdbo += CEhb1;   // dbo term
-
-                        if ( control->ensemble == NVE || control->ensemble == NVT  || control->ensemble == bNVT)
-                        {
-                            rvec_ScaledAdd( system->atoms[i].f,
-                                            +CEhb2, dcos_theta_di ); //dcos terms
-                            rvec_ScaledAdd( system->atoms[j].f,
-                                            +CEhb2, dcos_theta_dj );
-                            rvec_ScaledAdd( system->atoms[k].f,
-                                            +CEhb2, dcos_theta_dk );
-                            //dr terms
-                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3 / r_jk, dvec_jk );
-                            rvec_ScaledAdd( system->atoms[k].f, +CEhb3 / r_jk, dvec_jk );
-                        }
-                        else
+                        hblist[top++] = pi;
+                    }
+                }
+
+                for ( pk = hb_start_j; pk < hb_end_j; ++pk )
+                {
+                    /* set k's variables */
+                    k = hbond_list[pk].nbr;
+                    type_k = system->atoms[k].type;
+                    nbr_jk = hbond_list[pk].ptr;
+                    r_jk = nbr_jk->d;
+                    rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
+#ifdef _OPENMP
+                    f_k = &(workspace->f_local[tid * system->N + k]);
+#else
+                    f_k = &(system->atoms[k].f);
+#endif
+
+                    for ( itr = 0; itr < top; ++itr )
+                    {
+                        pi = hblist[itr];
+                        pbond_ij = &( bond_list[pi] );
+                        i = pbond_ij->nbr;
+
+                        if ( i != k )
                         {
-                            /* for pressure coupling, terms that are not related
-                               to bond order derivatives are added directly into
-                               pressure vector/tensor */
-                            rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
-                            rvec_Add( system->atoms[i].f, force );
-                            rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                            rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
-
-                            rvec_ScaledAdd( system->atoms[j].f, +CEhb2, dcos_theta_dj );
-
-                            ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
-                            rvec_Scale( force, +CEhb2, dcos_theta_dk );
-                            rvec_Add( system->atoms[k].f, force );
-                            rvec_iMultiply( ext_press, rel_jk, force );
-                            rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
-
-                            //dr terms
-                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3 / r_jk, dvec_jk );
-
-                            rvec_Scale( force, CEhb3 / r_jk, dvec_jk );
-                            rvec_Add( system->atoms[k].f, force );
-                            rvec_iMultiply( ext_press, rel_jk, force );
-                            rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
-
-                            /* This part is intended for a fully-flexible box */
-                            /* rvec_OuterProduct( temp_rtensor,
-                               dcos_theta_di, system->atoms[i].x );
-                               rtensor_Scale( total_rtensor, -CEhb2, temp_rtensor );
-
-                               rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dj,
-                               -CEhb3/r_jk, pbond_jk->dvec );
-                               rvec_OuterProduct( temp_rtensor,
-                               temp_rvec, system->atoms[j].x );
-                               rtensor_Add( total_rtensor, temp_rtensor );
-
-                               rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dk,
-                               +CEhb3/r_jk, pbond_jk->dvec );
-                               rvec_OuterProduct( temp_rtensor,
-                               temp_rvec, system->atoms[k].x );
-                               rtensor_Add( total_rtensor, temp_rtensor );
-
-                               if( pbond_ij->imaginary || pbond_jk->imaginary )
-                               rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
-                               else
-                               rtensor_Add( data->flex_bar.P, total_rtensor ); */
-                        }
+                            bo_ij = &(pbond_ij->bo_data);
+                            type_i = system->atoms[i].type;
+                            r_ij = pbond_ij->d;
+                            hbp = &(system->reaxprm.hbp[ type_i ][ type_j ][ type_k ]);
+#ifdef _OPENMP
+                            f_i = &(workspace->f_local[tid * system->N + i]);
+#else
+                            f_i = &(system->atoms[i].f);
+#endif
 
-#ifdef TEST_ENERGY
-                        /*fprintf( out_control->ehb,
-                          "%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n",
-                          dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2],
-                          dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2],
-                          dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]);
-                          fprintf( out_control->ehb, "%23.15e%23.15e%23.15e\n",
-                          CEhb1, CEhb2, CEhb3 ); */
-                        fprintf( stderr, //out_control->ehb,
-                                 "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                 workspace->orig_id[i],
-                                 workspace->orig_id[j],
-                                 workspace->orig_id[k],
-                                 r_jk, theta, bo_ij->BO, e_hb, data->E_HB );
+#ifdef TEST_FORCES
+                            ++num_hb_intrs;
+#endif
+
+                            Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                                    &theta, &cos_theta );
+                            /* the derivative of cos(theta) */
+                            Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                                    &dcos_theta_di, &dcos_theta_dj, &dcos_theta_dk );
+
+                            /* hydrogen bond energy */
+                            sin_theta2 = SIN( theta / 2.0 );
+                            sin_xhz4 = SQR( sin_theta2 );
+                            sin_xhz4 *= sin_xhz4;
+                            cos_xhz1 = ( 1.0 - cos_theta );
+                            exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
+                            exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk +
+                                        r_jk / hbp->r0_hb - 2.0 ) );
+
+                            e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+                            e_hb_total += e_hb;
+
+                            CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
+                            CEhb2 = -hbp->p_hb1 / 2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
+                            CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR( r_jk ) +
+                                    1.0 / hbp->r0_hb);
+
+                            /* hydrogen bond forces */
+                            /* dbo term,
+                             * note: safe to update across threads as this points
+                             * to the bond_order_data struct inside atom j's list,
+                             * and threads are partitioned across all j's */
+#ifdef _OPENMP
+                            #pragma omp atomic
+#endif
+                            bo_ij->Cdbo += CEhb1;
+
+                            if ( control->ensemble == NVE || control->ensemble == NVT  || control->ensemble == bNVT)
+                            {
+                                /* dcos terms */
+                                rvec_ScaledAdd( *f_i, +CEhb2, dcos_theta_di );
+                                rvec_ScaledAdd( *f_j, +CEhb2, dcos_theta_dj );
+                                rvec_ScaledAdd( *f_k, +CEhb2, dcos_theta_dk );
+
+                                /* dr terms */
+                                rvec_ScaledAdd( *f_j, -CEhb3 / r_jk, dvec_jk );
+                                rvec_ScaledAdd( *f_k, +CEhb3 / r_jk, dvec_jk );
+                            }
+                            else
+                            {
+                                /* for pressure coupling, terms that are not related
+                                   to bond order derivatives are added directly into
+                                   pressure vector/tensor */
+
+                                /* dcos terms */
+                                rvec_Scale( force, +CEhb2, dcos_theta_di );
+                                rvec_Add( *f_i, force );
+                                rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+#ifdef _OPENMP
+                                #pragma omp critical (Hydrogen_Bonds_ext_press)
+#endif
+                                {
+                                    rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
+                                }
+
+                                rvec_ScaledAdd( *f_j, +CEhb2, dcos_theta_dj );
+
+                                ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
+                                rvec_Scale( force, +CEhb2, dcos_theta_dk );
+                                rvec_Add( *f_k, force );
+                                rvec_iMultiply( ext_press, rel_jk, force );
+#ifdef _OPENMP
+                                #pragma omp critical (Hydrogen_Bonds_ext_press)
+#endif
+                                {
+                                    rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
+                                }
+
+                                /* dr terms */
+                                rvec_ScaledAdd( *f_j, -CEhb3 / r_jk, dvec_jk );
+
+                                rvec_Scale( force, CEhb3 / r_jk, dvec_jk );
+                                rvec_Add( *f_k, force );
+                                rvec_iMultiply( ext_press, rel_jk, force );
+#ifdef _OPENMP
+                                #pragma omp critical (Hydrogen_Bonds_ext_press)
+#endif
+                                {
+                                    rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
+                                }
+
+                                /* This part is intended for a fully-flexible box */
+                                /* rvec_OuterProduct( temp_rtensor,
+                                   dcos_theta_di, system->atoms[i].x );
+                                   rtensor_Scale( total_rtensor, -CEhb2, temp_rtensor );
+
+                                   rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dj,
+                                   -CEhb3/r_jk, pbond_jk->dvec );
+                                   rvec_OuterProduct( temp_rtensor,
+                                   temp_rvec, system->atoms[j].x );
+                                   rtensor_Add( total_rtensor, temp_rtensor );
+
+                                   rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dk,
+                                   +CEhb3/r_jk, pbond_jk->dvec );
+                                   rvec_OuterProduct( temp_rtensor,
+                                   temp_rvec, system->atoms[k].x );
+                                   rtensor_Add( total_rtensor, temp_rtensor );
+
+                                   if( pbond_ij->imaginary || pbond_jk->imaginary )
+                                   rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
+                                   else
+                                   rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                            }
 
+#ifdef TEST_ENERGY
+                            /*fprintf( out_control->ehb,
+                              "%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n",
+                              dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2],
+                              dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2],
+                              dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]);
+                              fprintf( out_control->ehb, "%23.15e%23.15e%23.15e\n",
+                              CEhb1, CEhb2, CEhb3 ); */
+                            fprintf( stderr, //out_control->ehb,
+                                     "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                     workspace->orig_id[i],
+                                     workspace->orig_id[j],
+                                     workspace->orig_id[k],
+                                     r_jk, theta, bo_ij->BO, e_hb, data->E_HB );
 #endif
+
 #ifdef TEST_FORCES
-                        // dbo term
-                        Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb );
-                        // dcos terms
-                        rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di );
-                        rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj );
-                        rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk );
-                        // dr terms
-                        rvec_ScaledAdd( workspace->f_hb[j], -CEhb3 / r_jk, dvec_jk );
-                        rvec_ScaledAdd( workspace->f_hb[k], +CEhb3 / r_jk, dvec_jk );
+                            /* dbo term */
+                            Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb );
+                            /* dcos terms */
+                            rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di );
+                            rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj );
+                            rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk );
+                            /* dr terms */
+                            rvec_ScaledAdd( workspace->f_hb[j], -CEhb3 / r_jk, dvec_jk );
+                            rvec_ScaledAdd( workspace->f_hb[k], +CEhb3 / r_jk, dvec_jk );
 #endif
+                        }
                     }
                 }
             }
         }
+    }
 
-    /* fprintf( stderr, "hydbonds: ext_press (%23.15e %23.15e %23.15e)\n",
-       data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
+    data->E_HB += e_hb_total;
 
 #ifdef TEST_FORCES
     fprintf( stderr, "Number of hydrogen bonds: %d\n", num_hb_intrs );
diff --git a/sPuReMD/src/tool_box.c b/sPuReMD/src/tool_box.c
index 776bb565f9cb21d7f9ef38259f42bc13fca4fb91..245a4b766853fca40c71ce14ad866a1b7d977402 100644
--- a/sPuReMD/src/tool_box.c
+++ b/sPuReMD/src/tool_box.c
@@ -290,6 +290,8 @@ void Trim_Spaces( char *element )
 /************ from system_props.c *************/
 real Get_Time( )
 {
+    struct timeval tim; /* function-local buffer filled by gettimeofday() below */
+    /* result: tv_sec plus tv_usec scaled by 1/1000000, i.e. the current time in seconds */
     gettimeofday(&tim, NULL );
     return ( tim.tv_sec + (tim.tv_usec / 1000000.0) );
 }
@@ -297,6 +299,9 @@ real Get_Time( )
 
 real Get_Timing_Info( real t_start )
 {
+    struct timeval tim;
+    real t_end;
+
     gettimeofday(&tim, NULL );
     t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
     return (t_end - t_start);
@@ -305,6 +310,9 @@ real Get_Timing_Info( real t_start )
 
 void Update_Timing_Info( real *t_start, real *timing )
 {
+    struct timeval tim;
+    real t_end;
+
     gettimeofday(&tim, NULL );
     t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
     *timing += (t_end - *t_start);
@@ -375,6 +383,21 @@ int Allocate_Tokenizer_Space( char **line, char **backup, char ***tokens )
 }
 
 
+void Deallocate_Tokenizer_Space( char **line, char **backup, char ***tokens )
+{
+    int i; /* index over the MAX_TOKENS entries of *tokens */
+    /* Frees every token buffer, then *line, *backup and the token array itself.
+     * NOTE(review): presumably the counterpart of Allocate_Tokenizer_Space -- confirm.
+     * The caller's pointers are not reset here, so they dangle after this call. */
+    for ( i = 0; i < MAX_TOKENS; i++ )
+    {
+        free( (*tokens)[i] ); /* assumes *tokens is non-NULL with MAX_TOKENS entries -- TODO confirm */
+    }
+
+    free( *line );
+    free( *backup );
+    free( *tokens ); /* released last: its elements were freed in the loop above */
+}
+
+
 int Tokenize( char* s, char*** tok )
 {
     char test[MAX_LINE];
diff --git a/sPuReMD/src/tool_box.h b/sPuReMD/src/tool_box.h
index 5712152d9f399fd2c92152a64a2e82d2aacc70dc..38e6ba0c674d37d982bf54ca1ea83045dbaa9df9 100644
--- a/sPuReMD/src/tool_box.h
+++ b/sPuReMD/src/tool_box.h
@@ -24,9 +24,6 @@
 
 #include "mytypes.h"
 
-struct timeval tim;
-real t_end;
-
 /* from box.h */
 void Transform( rvec, simulation_box*, char, rvec );
 void Transform_to_UnitBox( rvec, simulation_box*, char, rvec );
@@ -60,6 +57,7 @@ int Get_Atom_Type( reax_interaction*, char* );
 char *Get_Element( reax_system*, int );
 char *Get_Atom_Name( reax_system*, int );
 int Allocate_Tokenizer_Space( char**, char**, char*** );
+void Deallocate_Tokenizer_Space( char **, char **, char *** ); /* args: line, backup, tokens; frees all three buffers */
 int Tokenize( char*, char*** );
 
 /* from lammps */
diff --git a/sPuReMD/src/traj.c b/sPuReMD/src/traj.c
index 81207a00720ee20143fe066cf939d41e7f244130..f679186d5e543e3e77a5aa9b49de4cb05338c462 100644
--- a/sPuReMD/src/traj.c
+++ b/sPuReMD/src/traj.c
@@ -53,7 +53,7 @@ int Write_Custom_Header(reax_system *system, control_params *control,
              control->bo_cut,
              control->thb_cut,
              control->hb_cut,
-             control->qeq_solver_q_err,
+             control->cm_solver_q_err,
              control->T_init,
              control->T_final,
              control->Tau_T,
diff --git a/sPuReMD/src/two_body_interactions.c b/sPuReMD/src/two_body_interactions.c
index cd005cfe4b0ec9dfe7b75edb2a20e756d7147f51..d53797776dc4f16e0f7e213fe1d287f9a7ca5287 100644
--- a/sPuReMD/src/two_body_interactions.c
+++ b/sPuReMD/src/two_body_interactions.c
@@ -20,6 +20,7 @@
   ----------------------------------------------------------------------*/
 
 #include "two_body_interactions.h"
+
 #include "bond_orders.h"
 #include "list.h"
 #include "lookup.h"
@@ -27,19 +28,11 @@
 
 
 void Bond_Energy( reax_system *system, control_params *control,
-                  simulation_data *data, static_storage *workspace,
-                  list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
-    int i, j, pj;
-    int start_i, end_i;
-    int type_i, type_j;
-    real ebond, pow_BOs_be2, exp_be12, CEbo;
-    real gp3, gp4, gp7, gp10, gp37;
-    real exphu, exphua1, exphub1, exphuov, hulpov, estriph;
-    real decobdbo, decobdboua, decobdboub;
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    bond_order_data *bo_ij;
+    int i;
+    real gp3, gp4, gp7, gp10, gp37, ebond_total;
     list *bonds;
 
     bonds = (*lists) + BONDS;
@@ -48,300 +41,360 @@ void Bond_Energy( reax_system *system, control_params *control,
     gp7 = system->reaxprm.gp.l[7];
     gp10 = system->reaxprm.gp.l[10];
     gp37 = (int) system->reaxprm.gp.l[37];
+    ebond_total = 0.0;
 
-    for ( i = 0; i < system->N; ++i )
-    {
-        start_i = Start_Index(i, bonds);
-        end_i = End_Index(i, bonds);
-        //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i );
-        for ( pj = start_i; pj < end_i; ++pj )
-            if ( i < bonds->select.bond_list[pj].nbr )
+#ifdef _OPENMP
+//    #pragma omp parallel default(shared) reduction(+: ebond_total)
+#endif
+    { 
+        int j, pj;
+        int start_i, end_i;
+        int type_i, type_j;
+        real ebond, pow_BOs_be2, exp_be12, CEbo;
+        real exphu, exphua1, exphub1, exphuov, hulpov, estriph;
+        real decobdbo, decobdboua, decobdboub;
+        single_body_parameters *sbp_i, *sbp_j;
+        two_body_parameters *twbp;
+        bond_order_data *bo_ij;
+
+#ifdef _OPENMP
+//        #pragma omp for schedule(guided)
+#endif
+        for ( i = 0; i < system->N; ++i )
+        {
+            start_i = Start_Index(i, bonds);
+            end_i = End_Index(i, bonds);
+
+            for ( pj = start_i; pj < end_i; ++pj )
             {
-                /* set the pointers */
-                j = bonds->select.bond_list[pj].nbr;
-                type_i = system->atoms[i].type;
-                type_j = system->atoms[j].type;
-                sbp_i = &( system->reaxprm.sbp[type_i] );
-                sbp_j = &( system->reaxprm.sbp[type_j] );
-                twbp = &( system->reaxprm.tbp[type_i][type_j] );
-                bo_ij = &( bonds->select.bond_list[pj].bo_data );
-
-                /* calculate the constants */
-                pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
-                exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
-                CEbo = -twbp->De_s * exp_be12 *
-                       ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
-
-                /* calculate the Bond Energy */
-                data->E_BE += ebond =
-                                  -twbp->De_s * bo_ij->BO_s * exp_be12
-                                  - twbp->De_p * bo_ij->BO_pi
-                                  - twbp->De_pp * bo_ij->BO_pi2;
-
-                /* calculate derivatives of Bond Orders */
-                bo_ij->Cdbo += CEbo;
-                bo_ij->Cdbopi -= (CEbo + twbp->De_p);
-                bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
+                if ( i < bonds->select.bond_list[pj].nbr )
+                {
+                    /* set the pointers */
+                    j = bonds->select.bond_list[pj].nbr;
+                    type_i = system->atoms[i].type;
+                    type_j = system->atoms[j].type;
+                    sbp_i = &( system->reaxprm.sbp[type_i] );
+                    sbp_j = &( system->reaxprm.sbp[type_j] );
+                    twbp = &( system->reaxprm.tbp[type_i][type_j] );
+                    bo_ij = &( bonds->select.bond_list[pj].bo_data );
+
+                    /* calculate the constants */
+                    pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
+                    exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
+                    CEbo = -twbp->De_s * exp_be12 *
+                           ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
+
+                    /* calculate the Bond Energy */
+                    ebond = -twbp->De_s * bo_ij->BO_s * exp_be12
+                        - twbp->De_p * bo_ij->BO_pi
+                        - twbp->De_pp * bo_ij->BO_pi2;
+                    ebond_total += ebond;
+
+                    /* calculate derivatives of Bond Orders */
+                    bo_ij->Cdbo += CEbo;
+                    bo_ij->Cdbopi -= (CEbo + twbp->De_p);
+                    bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
 
 #ifdef TEST_ENERGY
-                fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n",
-                         workspace->orig_id[i], workspace->orig_id[j],
-                         // i+1, j+1,
-                         bo_ij->BO, ebond/*, data->E_BE*/ );
-                /* fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n",
-                   workspace->orig_id[i], workspace->orig_id[j],
-                   CEbo, -twbp->De_p, -twbp->De_pp );*/
+                    fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n",
+                             workspace->orig_id[i], workspace->orig_id[j],
+                             // i+1, j+1,
+                             bo_ij->BO, ebond );
 #endif
+
 #ifdef TEST_FORCES
-                Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
-                Add_dBOpinpi2( system, lists, i, pj,
-                               -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp),
-                               workspace->f_be, workspace->f_be );
+                    Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
+                    Add_dBOpinpi2( system, lists, i, pj,
+                                   -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp),
+                                   workspace->f_be, workspace->f_be );
 #endif
 
-                /* Stabilisation terminal triple bond */
-                if ( bo_ij->BO >= 1.00 )
-                {
-                    if ( gp37 == 2 ||
-                            (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) ||
-                            (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) )
+                    /* Stabilisation terminal triple bond */
+                    if ( bo_ij->BO >= 1.00 )
                     {
-                        // ba = SQR(bo_ij->BO - 2.50);
-                        exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
-                        //oboa=abo(j1)-boa;
-                        //obob=abo(j2)-boa;
-                        exphua1 = EXP(-gp3 * (workspace->total_bond_order[i] - bo_ij->BO));
-                        exphub1 = EXP(-gp3 * (workspace->total_bond_order[j] - bo_ij->BO));
-                        //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2);
-                        exphuov = EXP(gp4 * (workspace->Delta[i] + workspace->Delta[j]));
-                        hulpov = 1.0 / (1.0 + 25.0 * exphuov);
-
-                        estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
-                        //estrain(j1) = estrain(j1) + 0.50*estriph;
-                        //estrain(j2) = estrain(j2) + 0.50*estriph;
-                        data->E_BE += estriph;
-
-                        decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) *
-                                   ( gp3 - 2.0 * gp7 * (bo_ij->BO - 2.50) );
-                        decobdboua = -gp10 * exphu * hulpov *
-                                     (gp3 * exphua1 + 25.0 * gp4 * exphuov * hulpov * (exphua1 + exphub1));
-                        decobdboub = -gp10 * exphu * hulpov *
-                                     (gp3 * exphub1 + 25.0 * gp4 * exphuov * hulpov * (exphua1 + exphub1));
-
-                        bo_ij->Cdbo += decobdbo;
-                        workspace->CdDelta[i] += decobdboua;
-                        workspace->CdDelta[j] += decobdboub;
+                        if ( gp37 == 2 ||
+                                (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) ||
+                                (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) )
+                        {
+                            //ba = SQR(bo_ij->BO - 2.50);
+                            exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
+                            //oboa=abo(j1)-boa;
+                            //obob=abo(j2)-boa;
+                            exphua1 = EXP(-gp3 * (workspace->total_bond_order[i] - bo_ij->BO));
+                            exphub1 = EXP(-gp3 * (workspace->total_bond_order[j] - bo_ij->BO));
+                            //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2);
+                            exphuov = EXP(gp4 * (workspace->Delta[i] + workspace->Delta[j]));
+                            hulpov = 1.0 / (1.0 + 25.0 * exphuov);
+
+                            estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
+                            //estrain(j1) = estrain(j1) + 0.50*estriph;
+                            //estrain(j2) = estrain(j2) + 0.50*estriph;
+                            ebond_total += estriph;
+
+                            decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) *
+                                ( gp3 - 2.0 * gp7 * (bo_ij->BO - 2.50) );
+                            decobdboua = -gp10 * exphu * hulpov *
+                                (gp3 * exphua1 + 25.0 * gp4 * exphuov * hulpov * (exphua1 + exphub1));
+                            decobdboub = -gp10 * exphu * hulpov *
+                                (gp3 * exphub1 + 25.0 * gp4 * exphuov * hulpov * (exphua1 + exphub1));
+
+                            bo_ij->Cdbo += decobdbo;
+                            workspace->CdDelta[i] += decobdboua;
+                            workspace->CdDelta[j] += decobdboub;
+
 #ifdef TEST_ENERGY
-                        fprintf( out_control->ebond,
-                                 "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                                 workspace->orig_id[i], workspace->orig_id[j],
-                                 //i+1, j+1,
-                                 estriph, decobdbo, decobdboua, decobdboub );
+                            fprintf( out_control->ebond,
+                                     "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
+                                     workspace->orig_id[i], workspace->orig_id[j],
+                                     //i+1, j+1,
+                                     estriph, decobdbo, decobdboua, decobdboub );
 #endif
+
 #ifdef TEST_FORCES
-                        Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
-                        Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
-                        Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
+                            Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
+                            Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
+                            Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
 #endif
+                        }
                     }
                 }
             }
+        }
     }
+
+    data->E_BE += ebond_total;
 }
 
 
 void vdW_Coulomb_Energy( reax_system *system, control_params *control,
-                         simulation_data *data, static_storage *workspace,
-                         list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
-    int  i, j, pj;
-    int  start_i, end_i;
-    real self_coef;
+    int i;
     real p_vdW1, p_vdW1i;
-    real powr_vdW1, powgi_vdW1;
-    real tmp, r_ij, fn13, exp1, exp2;
-    real Tap, dTap, dfn13, CEvd, CEclmb;
-    real dr3gamij_1, dr3gamij_3;
-    real e_ele, e_vdW, e_core, de_core;
-    rvec temp, ext_press;
-    // rtensor temp_rtensor, total_rtensor;
-    two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
     list *far_nbrs;
+    real e_vdW_total, e_ele_total;
 
     p_vdW1 = system->reaxprm.gp.l[28];
     p_vdW1i = 1.0 / p_vdW1;
     far_nbrs = (*lists) + FAR_NBRS;
-    e_ele = 0;
-    e_vdW = 0;
-    e_core = 0;
-    de_core = 0;
+    e_vdW_total = 0.0;
+    e_ele_total = 0.0;
 
-    for ( i = 0; i < system->N; ++i )
+#ifdef _OPENMP
+    #pragma omp parallel default(shared) reduction(+: e_vdW_total, e_ele_total)
+#endif
     {
-        start_i = Start_Index(i, far_nbrs);
-        end_i   = End_Index(i, far_nbrs);
-        // fprintf( stderr, "i: %d, start: %d, end: %d\n",
-        //     i, start_i, end_i );
+        int j, pj;
+        int start_i, end_i;
+        real self_coef;
+        real powr_vdW1, powgi_vdW1;
+        real tmp, r_ij, fn13, exp1, exp2;
+        real Tap, dTap, dfn13, CEvd, CEclmb;
+        real dr3gamij_1, dr3gamij_3;
+        real e_ele, e_vdW, e_core, de_core;
+        rvec temp, ext_press;
+        //rtensor temp_rtensor, total_rtensor;
+        two_body_parameters *twbp;
+        far_neighbor_data *nbr_pj;
+#ifdef _OPENMP
+        int tid;
+
+        tid = omp_get_thread_num( );
+#endif
+
+        e_ele = 0.0;
+        e_vdW = 0.0;
+        e_core = 0.0;
+        de_core = 0.0;
 
-        for ( pj = start_i; pj < end_i; ++pj )
-            if ( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut )
+#ifdef _OPENMP
+        #pragma omp for schedule(guided)
+#endif
+        for ( i = 0; i < system->N; ++i )
+        {
+            start_i = Start_Index( i, far_nbrs );
+            end_i = End_Index( i, far_nbrs );
+
+            for ( pj = start_i; pj < end_i; ++pj )
             {
-                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-                j = nbr_pj->nbr;
-                r_ij = nbr_pj->d;
-                twbp = &(system->reaxprm.tbp[ system->atoms[i].type ]
-                         [ system->atoms[j].type ]);
-                self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes!
-
-                /* Calculate Taper and its derivative */
-                // Tap = nbr_pj->Tap;   -- precomputed during compte_H
-                Tap = control->Tap7 * r_ij + control->Tap6;
-                Tap = Tap * r_ij + control->Tap5;
-                Tap = Tap * r_ij + control->Tap4;
-                Tap = Tap * r_ij + control->Tap3;
-                Tap = Tap * r_ij + control->Tap2;
-                Tap = Tap * r_ij + control->Tap1;
-                Tap = Tap * r_ij + control->Tap0;
-
-                dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6;
-                dTap = dTap * r_ij + 5 * control->Tap5;
-                dTap = dTap * r_ij + 4 * control->Tap4;
-                dTap = dTap * r_ij + 3 * control->Tap3;
-                dTap = dTap * r_ij + 2 * control->Tap2;
-                dTap += control->Tap1 / r_ij;
-
-                /*vdWaals Calculations*/
-                if (system->reaxprm.gp.vdw_type == 1 || system->reaxprm.gp.vdw_type == 3)
+                if ( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut )
                 {
-                    // shielding
-                    powr_vdW1 = POW(r_ij, p_vdW1);
-                    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-
-                    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-                    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+                    nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                    j = nbr_pj->nbr;
+                    r_ij = nbr_pj->d;
+                    twbp = &(system->reaxprm.tbp[ system->atoms[i].type ]
+                             [ system->atoms[j].type ]);
+                    self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes!
+
+                    /* Calculate Taper and its derivative */
+                    // Tap = nbr_pj->Tap;   -- precomputed during compute_H
+                    Tap = control->Tap7 * r_ij + control->Tap6;
+                    Tap = Tap * r_ij + control->Tap5;
+                    Tap = Tap * r_ij + control->Tap4;
+                    Tap = Tap * r_ij + control->Tap3;
+                    Tap = Tap * r_ij + control->Tap2;
+                    Tap = Tap * r_ij + control->Tap1;
+                    Tap = Tap * r_ij + control->Tap0;
+
+                    dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6;
+                    dTap = dTap * r_ij + 5 * control->Tap5;
+                    dTap = dTap * r_ij + 4 * control->Tap4;
+                    dTap = dTap * r_ij + 3 * control->Tap3;
+                    dTap = dTap * r_ij + 2 * control->Tap2;
+                    dTap += control->Tap1 / r_ij;
+
+                    /* vdWaals Calculations */
+                    if ( system->reaxprm.gp.vdw_type == 1 || system->reaxprm.gp.vdw_type == 3 )
+                    {
+                        /* shielding */
+                        powr_vdW1 = POW( r_ij, p_vdW1 );
+                        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1 );
 
-                    data->E_vdW += e_vdW =
-                                       self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);
+                        fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+                        exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+                        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
 
-                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
-                            POW(r_ij, p_vdW1 - 2.0);
+                        e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);
+                        e_vdW_total += e_vdW;
 
-                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) -
-                                         Tap * twbp->D * (twbp->alpha / twbp->r_vdW) *
-                                         (exp1 - exp2) * dfn13 );
-                }
-                else  // no shielding
-                {
-                    exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+                        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
+                            POW( r_ij, p_vdW1 - 2.0 );
 
-                    data->E_vdW += e_vdW =
-                                       self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);
+                        CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) -
+                                Tap * twbp->D * (twbp->alpha / twbp->r_vdW) *
+                                (exp1 - exp2) * dfn13 );
+                    }
+                    /* no shielding */
+                    else
+                    {
+                        exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+                        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
 
-                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) -
-                                         Tap * twbp->D * (twbp->alpha / twbp->r_vdW) *
-                                         (exp1 - exp2) );
-                }
+                        e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);
+                        e_vdW_total += e_vdW;
 
-                if (system->reaxprm.gp.vdw_type == 2 || system->reaxprm.gp.vdw_type == 3)
-                {
-                    // innner wall
-                    e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore)));
-                    e_vdW += self_coef * Tap * e_core;
-                    data->E_vdW += self_coef * Tap * e_core;
+                        CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) -
+                                Tap * twbp->D * (twbp->alpha / twbp->r_vdW) *
+                                (exp1 - exp2) );
+                    }
 
-                    de_core = -(twbp->acore / twbp->rcore) * e_core;
-                    CEvd += self_coef * ( dTap * e_core + Tap * de_core );
-                }
+                    if ( system->reaxprm.gp.vdw_type == 2 || system->reaxprm.gp.vdw_type == 3 )
+                    {
+                        /* inner wall */
+                        e_core = twbp->ecore * EXP( twbp->acore * (1.0 - (r_ij / twbp->rcore)) );
+                        e_vdW += self_coef * Tap * e_core;
+                        e_vdW_total += self_coef * Tap * e_core;
 
-                /*Coulomb Calculations*/
-                dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-                dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+                        de_core = -(twbp->acore / twbp->rcore) * e_core;
+                        CEvd += self_coef * ( dTap * e_core + Tap * de_core );
+                    }
 
-                tmp = Tap / dr3gamij_3;
-                //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H
-                data->E_Ele += e_ele =
-                                   self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp;
+                    /* Coulomb Calculations */
+                    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+                    dr3gamij_3 = POW( dr3gamij_1 , 1.0 / 3.0 );
 
+                    tmp = Tap / dr3gamij_3;
+                    e_ele = self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp;
+                    e_ele_total += e_ele;
 
-                CEclmb = self_coef * C_ele * system->atoms[i].q * system->atoms[j].q *
-                         ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-                /*CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q*
-                  ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;*/
+                    CEclmb = self_coef * C_ele * system->atoms[i].q * system->atoms[j].q *
+                             ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
 
+                    if ( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT )
+                    {
+#ifndef _OPENMP
+                        rvec_ScaledAdd( system->atoms[i].f,
+                                -(CEvd + CEclmb), nbr_pj->dvec );
+                        rvec_ScaledAdd( system->atoms[j].f,
+                                +(CEvd + CEclmb), nbr_pj->dvec );
+#else
+                        rvec_ScaledAdd( workspace->f_local[tid * system->N + i],
+                                -(CEvd + CEclmb), nbr_pj->dvec );
+                        rvec_ScaledAdd( workspace->f_local[tid * system->N + j],
+                                +(CEvd + CEclmb), nbr_pj->dvec );
+#endif
+                    }
+                    /* NPT, iNPT or sNPT */
+                    else
+                    {
+                        /* for pressure coupling, terms not related to bond order
+                           derivatives are added directly into pressure vector/tensor */
+                        rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+
+#ifndef _OPENMP
+                        rvec_ScaledAdd( system->atoms[i].f, -1., temp );
+                        rvec_Add( system->atoms[j].f, temp );
+#else
+                        rvec_ScaledAdd( workspace->f_local[tid * system->N + i], -1., temp );
+                        rvec_Add( workspace->f_local[tid * system->N + j], temp );
+#endif
 
-                if ( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT )
-                {
-                    rvec_ScaledAdd( system->atoms[i].f,
-                                    -(CEvd + CEclmb), nbr_pj->dvec );
-                    rvec_ScaledAdd( system->atoms[j].f,
-                                    +(CEvd + CEclmb), nbr_pj->dvec );
-                }
-                else   // NPT, iNPT or sNPT
-                {
-                    /* for pressure coupling, terms not related to bond order
-                       derivatives are added directly into pressure vector/tensor */
-                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-
-                    rvec_ScaledAdd( system->atoms[i].f, -1., temp );
-                    rvec_Add( system->atoms[j].f, temp );
-
-                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-                    rvec_Add( data->ext_press, ext_press );
-
-                    /*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)",
-                      i,j,nbr_pj->rel_box[0],nbr_pj->rel_box[1],nbr_pj->rel_box[2] );
-
-                      fprintf( stderr, "force(%f %f %f)", temp[0], temp[1], temp[2] );
-
-                      fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n",
-                      data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
-
-                    /* This part is intended for a fully-flexible box */
-                    /* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec,
-                       system->atoms[i].x );
-                       rtensor_Scale( total_rtensor,
-                       F_C * -(CEvd + CEclmb), temp_rtensor );
-                       rvec_OuterProduct( temp_rtensor,
-                       nbr_pj->dvec, system->atoms[j].x );
-                       rtensor_ScaledAdd( total_rtensor,
-                       F_C * +(CEvd + CEclmb), temp_rtensor );
-
-                       if( nbr_pj->imaginary )
-                       // This is an external force due to an imaginary nbr
-                       rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
-                       else
-                       // This interaction is completely internal
-                       rtensor_Add( data->flex_bar.P, total_rtensor ); */
-                }
+                        rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+#ifdef _OPENMP
+                        #pragma omp critical (vdW_Coulomb_Energy_ext_press)
+#endif
+                        {
+                            rvec_Add( data->ext_press, ext_press );
+                        }
+
+                        /*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)",
+                          i,j,nbr_pj->rel_box[0],nbr_pj->rel_box[1],nbr_pj->rel_box[2] );
+
+                          fprintf( stderr, "force(%f %f %f)", temp[0], temp[1], temp[2] );
+
+                          fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n",
+                          data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
+
+                        /* This part is intended for a fully-flexible box */
+                        /* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec,
+                           system->atoms[i].x );
+                           rtensor_Scale( total_rtensor,
+                           F_C * -(CEvd + CEclmb), temp_rtensor );
+                           rvec_OuterProduct( temp_rtensor,
+                           nbr_pj->dvec, system->atoms[j].x );
+                           rtensor_ScaledAdd( total_rtensor,
+                           F_C * +(CEvd + CEclmb), temp_rtensor );
+
+                           if( nbr_pj->imaginary )
+                           // This is an external force due to an imaginary nbr
+                           rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
+                           else
+                           // This interaction is completely internal
+                           rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                    }
 
 #ifdef TEST_ENERGY
-                rvec_MakeZero( temp );
-                rvec_ScaledAdd( temp, +CEvd, nbr_pj->dvec );
-                fprintf( out_control->evdw,
-                         "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                         //i+1, j+1,
-                         MIN( workspace->orig_id[i], workspace->orig_id[j] ),
-                         MAX( workspace->orig_id[i], workspace->orig_id[j] ),
-                         r_ij, e_vdW, temp[0], temp[1], temp[2]/*, data->E_vdW*/ );
-
-                fprintf( out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                         MIN( workspace->orig_id[i], workspace->orig_id[j] ),
-                         MAX( workspace->orig_id[i], workspace->orig_id[j] ),
-                         r_ij, system->atoms[i].q, system->atoms[j].q,
-                         e_ele/*, data->E_Ele*/ );
+                    rvec_MakeZero( temp );
+                    rvec_ScaledAdd( temp, +CEvd, nbr_pj->dvec );
+                    fprintf( out_control->evdw,
+                             "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                             //i+1, j+1,
+                             MIN( workspace->orig_id[i], workspace->orig_id[j] ),
+                             MAX( workspace->orig_id[i], workspace->orig_id[j] ),
+                             r_ij, e_vdW, temp[0], temp[1], temp[2]/*, e_vdW_total*/ );
+
+                    fprintf( out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
+                             MIN( workspace->orig_id[i], workspace->orig_id[j] ),
+                             MAX( workspace->orig_id[i], workspace->orig_id[j] ),
+                             r_ij, system->atoms[i].q, system->atoms[j].q,
+                             e_ele/*, e_ele_total*/ );
 #endif
+
 #ifdef TEST_FORCES
-                rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
 #endif
+                }
             }
+        }
     }
 
+    data->E_vdW += e_vdW_total; /* accumulate (not overwrite), consistent with Tabulated_vdW_Coulomb_Energy */
+    data->E_Ele += e_ele_total; /* accumulate (not overwrite), consistent with Tabulated_vdW_Coulomb_Energy */
+
     // fclose( fout );
 
     // fprintf( stderr, "nonbonded: ext_press (%24.15e %24.15e %24.15e)\n",
@@ -350,7 +403,7 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
 
 void LR_vdW_Coulomb( reax_system *system, control_params *control,
-                     int i, int j, real r_ij, LR_data *lr )
+        int i, int j, real r_ij, LR_data *lr )
 {
     real p_vdW1 = system->reaxprm.gp.l[28];
     real p_vdW1i = 1.0 / p_vdW1;
@@ -399,14 +452,14 @@ void LR_vdW_Coulomb( reax_system *system, control_params *control,
     dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0);
 
     lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) -
-               Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+        Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
 
-    /*vdWaals Calculations*/
-    if (system->reaxprm.gp.vdw_type == 1 || system->reaxprm.gp.vdw_type == 3)
+    /* vdWaals Calculations */
+    if ( system->reaxprm.gp.vdw_type == 1 || system->reaxprm.gp.vdw_type == 3 )
     {
         // shielding
-        powr_vdW1 = POW(r_ij, p_vdW1);
-        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+        powr_vdW1 = POW( r_ij, p_vdW1 );
+        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1 );
 
         fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
         exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
@@ -415,12 +468,13 @@ void LR_vdW_Coulomb( reax_system *system, control_params *control,
         lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
 
         dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
-                POW(r_ij, p_vdW1 - 2.0);
+            POW( r_ij, p_vdW1 - 2.0 );
 
         lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
-                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
     }
-    else  // no shielding
+    /* no shielding */
+    else
     {
         exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
         exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
@@ -428,10 +482,10 @@ void LR_vdW_Coulomb( reax_system *system, control_params *control,
         lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
 
         lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
-                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
     }
 
-    if (system->reaxprm.gp.vdw_type == 2 || system->reaxprm.gp.vdw_type == 3)
+    if ( system->reaxprm.gp.vdw_type == 2 || system->reaxprm.gp.vdw_type == 3 )
     {
         // innner wall
         e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore)));
@@ -443,7 +497,7 @@ void LR_vdW_Coulomb( reax_system *system, control_params *control,
 
     /* Coulomb calculations */
     dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+    dr3gamij_3 = POW( dr3gamij_1 , 1.0 / 3.0 );
 
     tmp = Tap / dr3gamij_3;
     lr->H = EV_to_KCALpMOL * tmp;
@@ -465,106 +519,148 @@ void LR_vdW_Coulomb( reax_system *system, control_params *control,
 
 
 void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
-                                   simulation_data *data,
-                                   static_storage *workspace, list **lists,
-                                   output_controls *out_control )
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control )
 {
-    int i, j, pj, r, steps, update_freq, update_energies;
-    int type_i, type_j, tmin, tmax;
-    int start_i, end_i;
-    real r_ij, self_coef, base, dif;
-    real e_vdW, e_ele;
-    real CEvd, CEclmb;
-    rvec temp, ext_press;
-    far_neighbor_data *nbr_pj;
-    list *far_nbrs = (*lists) + FAR_NBRS;
-    LR_lookup_table *t;
+    int steps, update_freq, update_energies;
+    list *far_nbrs;
+    real e_vdW_total, e_ele_total;
 
+    far_nbrs = (*lists) + FAR_NBRS;
     steps = data->step - data->prev_steps;
     update_freq = out_control->energy_update_freq;
     update_energies = update_freq > 0 && steps % update_freq == 0;
+    e_vdW_total = 0.0;
+    e_ele_total = 0.0;
 
-    for ( i = 0; i < system->N; ++i )
+#ifdef _OPENMP
+    #pragma omp parallel default(shared) reduction(+: e_vdW_total, e_ele_total)
+#endif
     {
-        type_i  = system->atoms[i].type;
-        start_i = Start_Index(i, far_nbrs);
-        end_i   = End_Index(i, far_nbrs);
+        int i, j, pj, r;
+        int type_i, type_j, tmin, tmax;
+        int start_i, end_i;
+        real r_ij, self_coef, base, dif;
+        real e_vdW, e_ele;
+        real CEvd, CEclmb;
+        rvec temp, ext_press;
+        far_neighbor_data *nbr_pj;
+        LR_lookup_table *t;
+#ifdef _OPENMP
+        int tid;
+
+        tid = omp_get_thread_num( );
+
+        #pragma omp for schedule(guided)
+#endif
+        for ( i = 0; i < system->N; ++i )
+        {
+            type_i = system->atoms[i].type;
+            start_i = Start_Index(i, far_nbrs);
+            end_i = End_Index(i, far_nbrs);
 
-        for ( pj = start_i; pj < end_i; ++pj )
-            if ( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut )
+            for ( pj = start_i; pj < end_i; ++pj )
             {
-                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-                j      = nbr_pj->nbr;
-                type_j = system->atoms[j].type;
-                r_ij   = nbr_pj->d;
-                self_coef = (i == j) ? 0.5 : 1.0;
-                tmin  = MIN( type_i, type_j );
-                tmax  = MAX( type_i, type_j );
-                t = &( LR[tmin][tmax] );
-
-                /* Cubic Spline Interpolation */
-                r = (int)(r_ij * t->inv_dx);
-                if ( r == 0 )  ++r;
-                base = (real)(r + 1) * t->dx;
-                dif = r_ij - base;
-                //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif);
-
-                if ( update_energies )
+                if ( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut )
                 {
-                    e_vdW = ((t->vdW[r].d * dif + t->vdW[r].c) * dif + t->vdW[r].b) * dif +
-                            t->vdW[r].a;
-                    e_vdW *= self_coef;
+                    nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                    j = nbr_pj->nbr;
+                    type_j = system->atoms[j].type;
+                    r_ij = nbr_pj->d;
+                    self_coef = (i == j) ? 0.5 : 1.0;
+                    tmin = MIN( type_i, type_j );
+                    tmax = MAX( type_i, type_j );
+                    t = &( LR[tmin][tmax] );
+
+                    /* Cubic Spline Interpolation */
+                    r = (int)(r_ij * t->inv_dx);
+                    if ( r == 0 )
+                    {
+                        ++r;
+                    }
+                    base = (real)(r + 1) * t->dx;
+                    dif = r_ij - base;
+                    //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif);
 
-                    e_ele = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) * dif +
-                            t->ele[r].a;
-                    e_ele *= self_coef * system->atoms[i].q * system->atoms[j].q;
+                    if ( update_energies )
+                    {
+                        e_vdW = ((t->vdW[r].d * dif + t->vdW[r].c) * dif + t->vdW[r].b) * dif +
+                                t->vdW[r].a;
+                        e_vdW *= self_coef;
 
-                    data->E_vdW += e_vdW;
-                    data->E_Ele += e_ele;
-                }
+                        e_ele = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) * dif +
+                                t->ele[r].a;
+                        e_ele *= self_coef * system->atoms[i].q * system->atoms[j].q;
 
-                CEvd = ((t->CEvd[r].d * dif + t->CEvd[r].c) * dif + t->CEvd[r].b) * dif +
-                       t->CEvd[r].a;
-                CEvd *= self_coef;
-                //CEvd = (3*t->vdW[r].d*dif + 2*t->vdW[r].c)*dif + t->vdW[r].b;
+                        e_vdW_total += e_vdW;
+                        e_ele_total += e_ele;
+                    }
 
-                CEclmb = ((t->CEclmb[r].d * dif + t->CEclmb[r].c) * dif + t->CEclmb[r].b) * dif +
-                         t->CEclmb[r].a;
-                CEclmb *= self_coef * system->atoms[i].q * system->atoms[j].q;
+                    CEvd = ((t->CEvd[r].d * dif + t->CEvd[r].c) * dif + t->CEvd[r].b) * dif +
+                           t->CEvd[r].a;
+                    CEvd *= self_coef;
+                    //CEvd = (3*t->vdW[r].d*dif + 2*t->vdW[r].c)*dif + t->vdW[r].b;
 
-                if ( control->ensemble == NVE || control->ensemble == NVT  || control->ensemble == bNVT)
-                {
-                    rvec_ScaledAdd( system->atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec );
-                    rvec_ScaledAdd( system->atoms[j].f, +(CEvd + CEclmb), nbr_pj->dvec );
-                }
-                else   // NPT, iNPT or sNPT
-                {
-                    /* for pressure coupling, terms not related to bond order
-                       derivatives are added directly into pressure vector/tensor */
-                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-                    rvec_ScaledAdd( system->atoms[i].f, -1., temp );
-                    rvec_Add( system->atoms[j].f, temp );
-                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-                    rvec_Add( data->ext_press, ext_press );
-                }
+                    CEclmb = ((t->CEclmb[r].d * dif + t->CEclmb[r].c) * dif + t->CEclmb[r].b) * dif +
+                             t->CEclmb[r].a;
+                    CEclmb *= self_coef * system->atoms[i].q * system->atoms[j].q;
+
+                    if ( control->ensemble == NVE || control->ensemble == NVT  || control->ensemble == bNVT)
+                    {
+#ifndef _OPENMP
+                        rvec_ScaledAdd( system->atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec );
+                        rvec_ScaledAdd( system->atoms[j].f, +(CEvd + CEclmb), nbr_pj->dvec );
+#else
+                        rvec_ScaledAdd( workspace->f_local[tid * system->N + i],
+                                -(CEvd + CEclmb), nbr_pj->dvec );
+                        rvec_ScaledAdd( workspace->f_local[tid * system->N + j],
+                                +(CEvd + CEclmb), nbr_pj->dvec );
+#endif
+                    }
+                    else   // NPT, iNPT or sNPT
+                    {
+                        /* for pressure coupling, terms not related to bond order
+                           derivatives are added directly into pressure vector/tensor */
+                        rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+#ifndef _OPENMP
+                        rvec_ScaledAdd( system->atoms[i].f, -1., temp );
+                        rvec_Add( system->atoms[j].f, temp );
+#else
+                        rvec_ScaledAdd( workspace->f_local[tid * system->N + i], -1., temp );
+                        rvec_Add( workspace->f_local[tid * system->N + j], temp );
+#endif
+                        rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+#ifdef _OPENMP
+                        #pragma omp critical (Tabulated_vdW_Coulomb_Energy_ext_press)
+#endif
+                        {
+                        rvec_Add( data->ext_press, ext_press );
+                        }
+                    }
 
 #ifdef TEST_ENERGY
-                fprintf(out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
-                        workspace->orig_id[i], workspace->orig_id[j],
-                        r_ij, e_vdW, data->E_vdW );
-                fprintf(out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                        workspace->orig_id[i], workspace->orig_id[j],
-                        r_ij, system->atoms[i].q, system->atoms[j].q,
-                        e_ele, data->E_Ele );
+                    fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
+                            workspace->orig_id[i], workspace->orig_id[j],
+                            r_ij, e_vdW, e_vdW_total /* running total; data->E_vdW is only updated after the loop */ );
+                    fprintf( out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                            workspace->orig_id[i], workspace->orig_id[j],
+                            r_ij, system->atoms[i].q, system->atoms[j].q,
+                            e_ele, e_ele_total /* running total; data->E_Ele is only updated after the loop */ );
 #endif
+
 #ifdef TEST_FORCES
-                rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
 #endif
+                }
             }
+        }
     }
+
+    data->E_vdW += e_vdW_total;
+    data->E_Ele += e_ele_total;
 }
 
 
diff --git a/sPuReMD/src/vector.c b/sPuReMD/src/vector.c
index ba4a5ea3ce8da6f0b25f1e43a984dc6c8d7e4cb5..f320d857bc42c3731204c3b41639d6460d398526 100644
--- a/sPuReMD/src/vector.c
+++ b/sPuReMD/src/vector.c
@@ -32,14 +32,18 @@ inline int Vector_isZero( const real * const v, const unsigned int k )
 {
     unsigned int i;
 
-    #pragma omp master
+#ifdef _OPENMP
+    #pragma omp single
+#endif
     {
         ret = TRUE;
     }
 
+#ifdef _OPENMP
     #pragma omp barrier
 
     #pragma omp for reduction(&&: ret) schedule(static)
+#endif
     for ( i = 0; i < k; ++i )
     {
         if ( FABS( v[i] ) > ALMOST_ZERO )
@@ -48,6 +52,10 @@ inline int Vector_isZero( const real * const v, const unsigned int k )
         }
     }
 
+#ifdef _OPENMP
+    #pragma omp barrier
+#endif
+
     return ret;
 }
 
@@ -56,7 +64,9 @@ inline void Vector_MakeZero( real * const v, const unsigned int k )
 {
     unsigned int i;
 
+#ifdef _OPENMP
     #pragma omp for schedule(static)
+#endif
     for ( i = 0; i < k; ++i )
     {
         v[i] = ZERO;
@@ -68,7 +78,9 @@ inline void Vector_Copy( real * const dest, const real * const v, const unsigned
 {
     unsigned int i;
 
+#ifdef _OPENMP
     #pragma omp for schedule(static)
+#endif
     for ( i = 0; i < k; ++i )
     {
         dest[i] = v[i];
@@ -80,7 +92,9 @@ inline void Vector_Scale( real * const dest, const real c, const real * const v,
 {
     unsigned int i;
 
+#ifdef _OPENMP
     #pragma omp for schedule(static)
+#endif
     for ( i = 0; i < k; ++i )
     {
         dest[i] = c * v[i];
@@ -93,7 +107,9 @@ inline void Vector_Sum( real * const dest, const real c, const real * const v, c
 {
     unsigned int i;
 
+#ifdef _OPENMP
     #pragma omp for schedule(static)
+#endif
     for ( i = 0; i < k; ++i )
     {
         dest[i] = c * v[i] + d * y[i];
@@ -105,7 +121,9 @@ inline void Vector_Add( real * const dest, const real c, const real * const v, c
 {
     unsigned int i;
 
+#ifdef _OPENMP
     #pragma omp for schedule(static)
+#endif
     for ( i = 0; i < k; ++i )
     {
         dest[i] += c * v[i];
@@ -114,16 +132,19 @@ inline void Vector_Add( real * const dest, const real c, const real * const v, c
 
 
 void Vector_Print( FILE * const fout, const char * const vname, const real * const v,
-                   const unsigned int k )
+        const unsigned int k )
 {
     unsigned int i;
 
-    fprintf( fout, "%s:\n", vname );
+    if ( vname != NULL )
+    {
+        fprintf( fout, "%s:\n", vname );
+    }
+
     for ( i = 0; i < k; ++i )
     {
         fprintf( fout, "%24.15e\n", v[i] );
     }
-    fprintf( fout, "\n" );
 }
 
 
@@ -131,20 +152,27 @@ inline real Dot( const real * const v1, const real * const v2, const unsigned in
 {
     unsigned int i;
 
-    #pragma omp master
+#ifdef _OPENMP
+    #pragma omp single
+#endif
     {
         ret2 = ZERO;
     }
 
+#ifdef _OPENMP
     #pragma omp barrier
 
-
     #pragma omp for reduction(+: ret2) schedule(static)
+#endif
     for ( i = 0; i < k; ++i )
     {
         ret2 += v1[i] * v2[i];
     }
 
+#ifdef _OPENMP
+    #pragma omp barrier
+#endif
+
     return ret2;
 }
 
@@ -153,20 +181,37 @@ inline real Norm( const real * const v1, const unsigned int k )
 {
     unsigned int i;
 
-    #pragma omp master
+#ifdef _OPENMP
+    #pragma omp single
+#endif
     {
         ret2 = ZERO;
     }
 
+#ifdef _OPENMP
     #pragma omp barrier
 
     #pragma omp for reduction(+: ret2) schedule(static)
+#endif
     for ( i = 0; i < k; ++i )
     {
-        ret2 +=  SQR( v1[i] );
+        ret2 += SQR( v1[i] );
     }
 
-    return SQRT( ret2 );
+#ifdef _OPENMP
+    #pragma omp barrier
+
+    #pragma omp single
+#endif
+    {
+        ret2 = SQRT( ret2 );
+    }
+
+#ifdef _OPENMP
+    #pragma omp barrier
+#endif
+
+    return ret2;
 }
 
 
@@ -307,9 +352,9 @@ inline real rvec_Norm( const rvec v )
 
 inline int rvec_isZero( const rvec v )
 {
-    if ( fabs(v[0]) > ALMOST_ZERO ||
-            fabs(v[1]) > ALMOST_ZERO ||
-            fabs(v[2]) > ALMOST_ZERO )
+    if ( FABS(v[0]) > ALMOST_ZERO ||
+            FABS(v[1]) > ALMOST_ZERO ||
+            FABS(v[2]) > ALMOST_ZERO )
     {
         return FALSE;
     }
@@ -319,7 +364,9 @@ inline int rvec_isZero( const rvec v )
 
 inline void rvec_MakeZero( rvec v )
 {
-    v[0] = v[1] = v[2] = ZERO;
+    v[0] = ZERO;
+    v[1] = ZERO;
+    v[2] = ZERO;
 }
 
 
diff --git a/sPuReMD/src/vector.h b/sPuReMD/src/vector.h
index 98ba7dd8c74203902646bb4711e99152df747b2d..27ceb241150bc98c7dcf65ef37fc8262d77303dc 100644
--- a/sPuReMD/src/vector.h
+++ b/sPuReMD/src/vector.h
@@ -24,6 +24,7 @@
 
 #include "mytypes.h"
 
+
 int Vector_isZero( const real * const, const unsigned int );
 void Vector_MakeZero( real * const, const unsigned int );
 void Vector_Copy( real * const, const real * const, const unsigned int );
@@ -80,4 +81,5 @@ void ivec_Scale( ivec, const real, const ivec );
 void ivec_rScale( ivec, const real, const rvec );
 void ivec_Sum( ivec, const ivec, const ivec );
 
+
 #endif
diff --git a/sPuReMD/tests/test_vector.cpp b/sPuReMD/tests/test_vector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b58ff7df532dff2916642a6ce75d6c08b705354b
--- /dev/null
+++ b/sPuReMD/tests/test_vector.cpp
@@ -0,0 +1,84 @@
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <cstdlib>
+#include <new>
+
+#include "mytypes.h"
+#include "vector.c"
+
+
+#define VEC_SIZE (100)
+
+
+namespace
+{
+    /* Test fixture providing two heap-allocated vectors of VEC_SIZE entries:
+     * a = (1, 2, ..., VEC_SIZE) and b = (1, 1, ..., 1) (filled in SetUp) */
+    class VectorTest : public ::testing::Test
+    {
+        protected:
+            real *a;
+            real *b;
+
+            VectorTest( ) : a( NULL ), b( NULL )
+            {
+                a = (real *) malloc( VEC_SIZE * sizeof(real) );
+                b = (real *) malloc( VEC_SIZE * sizeof(real) );
+
+                if ( a == NULL || b == NULL )
+                {
+                    /* the destructor does not run when the constructor throws,
+                     * so release whichever allocation succeeded (free(NULL) is a no-op) */
+                    free( a );
+                    free( b );
+                    /* throw by value (not a pointer from new) so handlers for
+                     * std::bad_alloc match and the exception object does not leak */
+                    throw std::bad_alloc( );
+                }
+            }
+
+            virtual ~VectorTest( )
+            {
+                free( a );
+                free( b );
+            }
+
+            virtual void SetUp( )
+            {
+                for ( int i = 0; i < VEC_SIZE; ++i )
+                {
+                    a[i] = i + 1.0;
+                    b[i] = 1.0;
+                }
+            }
+
+            virtual void TearDown( )
+            {
+
+            }
+    };
+
+
+    /* a . b = 1 + 2 + ... + n = n (n + 1) / 2;  b . b = n */
+    TEST_F(VectorTest, Dot)
+    {
+        ASSERT_DOUBLE_EQ( Dot(a, b, VEC_SIZE), (VEC_SIZE * (VEC_SIZE + 1.0)) / 2.0 );
+        ASSERT_DOUBLE_EQ( Dot(b, b, VEC_SIZE), (real) VEC_SIZE );
+    }
+
+
+    /* ||a||_2 = sqrt( n (n + 1) (2n + 1) / 6 );  ||b||_2 = sqrt( n ) */
+    TEST_F(VectorTest, Norm)
+    {
+        ASSERT_DOUBLE_EQ( Norm(a, VEC_SIZE), sqrt( VEC_SIZE * (VEC_SIZE + 1.0) * (2.0 * VEC_SIZE + 1.0) / 6.0 ) );
+        ASSERT_DOUBLE_EQ( Norm(b, VEC_SIZE), sqrt( (real) VEC_SIZE ) );
+    }
+}
+
+
+int main( int argc, char **argv )
+{
+    ::testing::InitGoogleTest( &argc, argv );
+    return RUN_ALL_TESTS( );
+}
diff --git a/tools/run_sim.py b/tools/run_sim.py
index 3aa4ad5146db95404e6c7a062eac2612fd8c28eb..2ee79c24888d0c54a8a1f5c8da8886824524d549 100644
--- a/tools/run_sim.py
+++ b/tools/run_sim.py
@@ -32,24 +32,32 @@ class TestCase():
                     r'(?P<key>nsteps\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
                 'tabulate_long_range': lambda l, x: sub(
                     r'(?P<key>tabulate_long_range\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
-                'qeq_solver_type': lambda l, x: sub(
-                    r'(?P<key>qeq_solver_type\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
-                'qeq_solver_q_err': lambda l, x: sub(
-                    r'(?P<key>qeq_solver_q_err\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
-                'qeq_domain_sparsity': lambda l, x: sub(
-                    r'(?P<key>qeq_domain_sparsity\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
-                'pre_comp_type': lambda l, x: sub(
-                    r'(?P<key>pre_comp_type\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
-                'pre_comp_droptol': lambda l, x: sub(
-                    r'(?P<key>pre_comp_droptol\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
-                'pre_comp_refactor': lambda l, x: sub(
-                    r'(?P<key>pre_comp_refactor\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
-                'pre_comp_sweeps': lambda l, x: sub(
-                    r'(?P<key>pre_comp_sweeps\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
-                'pre_app_type': lambda l, x: sub(
-                    r'(?P<key>pre_app_type\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
-                'pre_app_jacobi_iters': lambda l, x: sub(
-                    r'(?P<key>pre_app_jacobi_iters\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
+                'charge_method': lambda l, x: sub(
+                    r'(?P<key>charge_method\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
+                'cm_q_net': lambda l, x: sub(
+                    r'(?P<key>cm_q_net\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
+                'cm_solver_type': lambda l, x: sub(
+                    r'(?P<key>cm_solver_type\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
+                'cm_solver_max_iters': lambda l, x: sub(
+                    r'(?P<key>cm_solver_max_iters\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
+                'cm_solver_restart': lambda l, x: sub(
+                    r'(?P<key>cm_solver_restart\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
+                'cm_solver_q_err': lambda l, x: sub(
+                    r'(?P<key>cm_solver_q_err\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
+                'cm_domain_sparsity': lambda l, x: sub(
+                    r'(?P<key>cm_domain_sparsity\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
+                'cm_solver_pre_comp_type': lambda l, x: sub(
+                    r'(?P<key>cm_solver_pre_comp_type\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
+                'cm_solver_pre_comp_droptol': lambda l, x: sub(
+                    r'(?P<key>cm_solver_pre_comp_droptol\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
+                'cm_solver_pre_comp_refactor': lambda l, x: sub(
+                    r'(?P<key>cm_solver_pre_comp_refactor\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
+                'cm_solver_pre_comp_sweeps': lambda l, x: sub(
+                    r'(?P<key>cm_solver_pre_comp_sweeps\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
+                'cm_solver_pre_app_type': lambda l, x: sub(
+                    r'(?P<key>cm_solver_pre_app_type\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
+                'cm_solver_pre_app_jacobi_iters': lambda l, x: sub(
+                    r'(?P<key>cm_solver_pre_app_jacobi_iters\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
                 'geo_format': lambda l, x: sub(
                     r'(?P<key>geo_format\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
         }
@@ -88,15 +96,16 @@ class TestCase():
         for p in product(*[self.__params[k] for k in self.__param_names]):
             param_dict = dict((k, v) for (k, v) in zip(self.__param_names, p))
             param_dict['name'] = path.basename(self.__geo_file).split('.')[0] \
+                + '_cm' + param_dict['charge_method'] \
                 + '_s' + param_dict['nsteps'] \
-		+ '_q' + param_dict['qeq_solver_type'] \
- 		+ '_qtol' + param_dict['qeq_solver_q_err'] \
- 		+ '_qds' + param_dict['qeq_domain_sparsity'] \
-                + '_pc' + param_dict['pre_comp_type'] \
-                + '_pctol' + param_dict['pre_comp_droptol'] \
-                + '_pcs' + param_dict['pre_comp_sweeps'] \
-                + '_pa' + param_dict['pre_app_type'] \
-                + '_paji' + param_dict['pre_app_jacobi_iters'] \
+                + '_q' + param_dict['cm_solver_type'] \
+                + '_qtol' + param_dict['cm_solver_q_err'] \
+                + '_qds' + param_dict['cm_domain_sparsity'] \
+                + '_pc' + param_dict['cm_solver_pre_comp_type'] \
+                + '_pctol' + param_dict['cm_solver_pre_comp_droptol'] \
+                + '_pcs' + param_dict['cm_solver_pre_comp_sweeps'] \
+                + '_pa' + param_dict['cm_solver_pre_app_type'] \
+                + '_paji' + param_dict['cm_solver_pre_app_jacobi_iters'] \
 		+ '_t' + param_dict['threads']
 
 
@@ -128,7 +137,7 @@ class TestCase():
 
     def _process_result(self, fout, param):
         time = 0.
-        qeq = 0.
+        cm = 0.
         iters = 0.
         pre_comp = 0.
         pre_app = 0.
@@ -143,7 +152,7 @@ class TestCase():
             for line in fp:
                 line = line.split()
                 try:
-                    qeq = qeq + float(line[6])
+                    cm = cm + float(line[6])
                     iters = iters + float(line[8])
                     pre_comp = pre_comp + float(line[9])
                     pre_app = pre_app + float(line[10])
@@ -159,7 +168,7 @@ class TestCase():
                         pass
             cnt = cnt - 1
             if cnt > 0:
-                qeq = qeq / cnt
+                cm = cm / cnt
                 iters = iters / cnt
                 pre_comp = pre_comp / cnt
                 pre_app = pre_app / cnt
@@ -167,10 +176,10 @@ class TestCase():
 
         if cnt == int(param['nsteps']):
             fout.write(self.__result_body_fmt.format(path.basename(self.__geo_file).split('.')[0], 
-                param['nsteps'], param['qeq_solver_type'], param['qeq_solver_q_err'], param['qeq_domain_sparsity'],
-                param['pre_comp_type'], param['pre_comp_droptol'], param['pre_comp_sweeps'],
-                param['pre_app_type'], param['pre_app_jacobi_iters'], pre_comp, pre_app, iters, spmv,
-                qeq, param['threads'], time))
+                param['nsteps'], param['cm_solver_type'], param['cm_solver_q_err'], param['cm_domain_sparsity'],
+                param['cm_solver_pre_comp_type'], param['cm_solver_pre_comp_droptol'], param['cm_solver_pre_comp_sweeps'],
+                param['cm_solver_pre_app_type'], param['cm_solver_pre_app_jacobi_iters'], pre_comp, pre_app, iters, spmv,
+                cm, param['threads'], time))
         else:
             print('**WARNING: nsteps not correct in file {0}...'.format(log_file))
         fout.flush()
@@ -205,22 +214,26 @@ if __name__ == '__main__':
 
     header_fmt_str = '{:15}|{:5}|{:5}|{:5}|{:5}|{:5}|{:5}|{:5}|{:5}|{:5}|{:10}|{:10}|{:10}|{:10}|{:10}|{:3}|{:10}\n'
     header_str = ['Data Set', 'Steps', 'QType', 'Q Tol', 'QDS', 'PreCT', 'PreCD', 'PreCS', 'PreAT', 'PreAJ', 'Pre Comp',
-            'Pre App', 'Iters', 'SpMV', 'QEq', 'Thd', 'Time (s)']
+            'Pre App', 'Iters', 'SpMV', 'CM', 'Thd', 'Time (s)']
     body_fmt_str = '{:15} {:5} {:5} {:5} {:5} {:5} {:5} {:5} {:5} {:5} {:10.3f} {:10.3f} {:10.3f} {:10.3f} {:10.3f} {:3} {:10.3f}\n'
 
     params = {
             'ensemble_type': ['0'],
             'nsteps': ['20'],
             'tabulate_long_range': ['0'],
-            'qeq_solver_type': ['0'],
-            'qeq_solver_q_err': ['1e-6'],
-            'qeq_domain_sparsity': ['1.0'],
-            'pre_comp_type': ['2'],
-            'pre_comp_refactor': ['100'],
-            'pre_comp_droptol': ['0.0'],
-            'pre_comp_sweeps': ['3'],
-            'pre_app_type': ['2'],
-            'pre_app_jacobi_iters': ['30'],
+            'charge_method': ['0'],
+            'cm_q_net': ['0.0'],
+            'cm_solver_type': ['0'],
+            'cm_solver_max_iters': ['20'],
+            'cm_solver_restart': ['100'],
+            'cm_solver_q_err': ['1e-6'],
+            'cm_domain_sparsity': ['1.0'],
+            'cm_solver_pre_comp_type': ['2'],
+            'cm_solver_pre_comp_refactor': ['100'],
+            'cm_solver_pre_comp_droptol': ['0.0'],
+            'cm_solver_pre_comp_sweeps': ['3'],
+            'cm_solver_pre_app_type': ['2'],
+            'cm_solver_pre_app_jacobi_iters': ['30'],
             'threads': ['1'],
             'geo_format': [],
     }