diff --git a/PuReMD/aclocal.m4 b/PuReMD/aclocal.m4
index 88932683661469f18a67e7010eaeb8e4984bfef0..90f53cebe3fa0d0bbe5e05636b1e76997e5d30a2 100644
--- a/PuReMD/aclocal.m4
+++ b/PuReMD/aclocal.m4
@@ -1,6 +1,6 @@
-# generated automatically by aclocal 1.15.1 -*- Autoconf -*-
+# generated automatically by aclocal 1.15 -*- Autoconf -*-
 
-# Copyright (C) 1996-2017 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -20,7 +20,7 @@ You have another version of autoconf.  It may work, but is not guaranteed to.
 If you have problems, you may need to regenerate the build system entirely.
 To do so, use the procedure documented by the package, typically 'autoreconf'.])])
 
-# Copyright (C) 2002-2017 Free Software Foundation, Inc.
+# Copyright (C) 2002-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -35,7 +35,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION],
 [am__api_version='1.15'
 dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
 dnl require some minimum version.  Point them to the right macro.
-m4_if([$1], [1.15.1], [],
+m4_if([$1], [1.15], [],
       [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
 ])
 
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
 # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
 # This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.15.1])dnl
+[AM_AUTOMAKE_VERSION([1.15])dnl
 m4_ifndef([AC_AUTOCONF_VERSION],
   [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 
 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-
 
-# Copyright (C) 2001-2017 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -110,7 +110,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd`
 
 # AM_CONDITIONAL                                            -*- Autoconf -*-
 
-# Copyright (C) 1997-2017 Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -141,7 +141,7 @@ AC_CONFIG_COMMANDS_PRE(
 Usually this means the macro was only invoked conditionally.]])
 fi])])
 
-# Copyright (C) 1999-2017 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -332,7 +332,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
 
 # Generate code to set up dependency tracking.              -*- Autoconf -*-
 
-# Copyright (C) 1999-2017 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -408,7 +408,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
 
 # Do all the work for Automake.                             -*- Autoconf -*-
 
-# Copyright (C) 1996-2017 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -605,7 +605,7 @@ for _am_header in $config_headers :; do
 done
 echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
 
-# Copyright (C) 2001-2017 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -626,7 +626,7 @@ if test x"${install_sh+set}" != xset; then
 fi
 AC_SUBST([install_sh])])
 
-# Copyright (C) 2003-2017 Free Software Foundation, Inc.
+# Copyright (C) 2003-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -647,7 +647,7 @@ AC_SUBST([am__leading_dot])])
 
 # Check to see how 'make' treats includes.	            -*- Autoconf -*-
 
-# Copyright (C) 2001-2017 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -697,7 +697,7 @@ rm -f confinc confmf
 
 # Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-
 
-# Copyright (C) 1997-2017 Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -736,7 +736,7 @@ fi
 
 # Helper functions for option handling.                     -*- Autoconf -*-
 
-# Copyright (C) 2001-2017 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -765,7 +765,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
 AC_DEFUN([_AM_IF_OPTION],
 [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
 
-# Copyright (C) 1999-2017 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -812,7 +812,7 @@ AC_LANG_POP([C])])
 # For backward compatibility.
 AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
 
-# Copyright (C) 2001-2017 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -831,7 +831,7 @@ AC_DEFUN([AM_RUN_LOG],
 
 # Check to make sure that the build environment is sane.    -*- Autoconf -*-
 
-# Copyright (C) 1996-2017 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -912,7 +912,7 @@ AC_CONFIG_COMMANDS_PRE(
 rm -f conftest.file
 ])
 
-# Copyright (C) 2009-2017 Free Software Foundation, Inc.
+# Copyright (C) 2009-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -972,7 +972,7 @@ AC_SUBST([AM_BACKSLASH])dnl
 _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
 ])
 
-# Copyright (C) 2001-2017 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -1000,7 +1000,7 @@ fi
 INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 AC_SUBST([INSTALL_STRIP_PROGRAM])])
 
-# Copyright (C) 2006-2017 Free Software Foundation, Inc.
+# Copyright (C) 2006-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -1019,7 +1019,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
 
 # Check how to create a tarball.                            -*- Autoconf -*-
 
-# Copyright (C) 2004-2017 Free Software Foundation, Inc.
+# Copyright (C) 2004-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
diff --git a/PuReMD/configure.ac b/PuReMD/configure.ac
index c5d0b3578696c4fcae9dd1438b228bdd0cdeabef..3df96c97ed2a807d88584a74297525fed8b8bec7 100644
--- a/PuReMD/configure.ac
+++ b/PuReMD/configure.ac
@@ -24,6 +24,39 @@ AC_DEFUN([CONFIGURE_HEADLINE],
         echo; echo "+++ $1 +++"
 ])
 
+# Check for LAPACKE
+AC_CHECK_HEADERS([mkl.h], [MKL_FOUND_HEADERS="yes"])
+if test "x${MKL_FOUND_HEADERS}" = "xyes"
+then
+    AC_SEARCH_LIBS([LAPACKE_dgels], [mkl_intel_ilp64],
+                   [MKL_FOUND_LIBS="yes"], [MKL_FOUND_LIBS="no"], 
+                   [-lmkl_sequential -lmkl_core -lpthread -lm -ldl])
+    AS_IF([test "x${MKL_FOUND_LIBS}" != "xyes"],
+          [AC_MSG_ERROR([Unable to find MKL LAPACKE library.])])
+    LIBS="${LIBS} -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl"
+    AC_DEFINE([HAVE_LAPACKE_MKL], [1], [Define to 1 if you have MKL LAPACKE support enabled.])
+else
+    AC_CHECK_HEADERS([lapacke.h], [LAPACKE_FOUND_HEADERS="yes"])
+    if test "x${LAPACKE_FOUND_HEADERS}" = "xyes"
+    then
+        AC_SEARCH_LIBS([LAPACKE_dgels], [lapacke],
+                       [LAPACKE_FOUND_LIBS="yes"], [LAPACKE_FOUND_LIBS="no"],
+                       [-llapack])
+        AS_IF([test "x${LAPACKE_FOUND_LIBS}" != "xyes"],
+              [AC_MSG_ERROR([Unable to find LAPACKE library.])])
+        LIBS="${LIBS} -llapack"
+        AC_DEFINE([HAVE_LAPACKE], [1], [Define to 1 if you have LAPACKE support enabled.])
+    else
+        AC_MSG_WARN([
+  -----------------------------------------------
+   Unable to find LAPACKE on this system.
+   Disabling support for dependent methods.
+  -----------------------------------------------])
+    fi
+fi
+
+AC_LANG([C++])
+
 # Checks for programs.
 AC_PROG_CC([cc icc gcc])
 AC_PROG_CPP
diff --git a/PuReMD/src/allocate.c b/PuReMD/src/allocate.c
index 0753ea1090ec9b2f4c60007047c86f7c4a91728e..ca67bea5e5fca41c44b2248600a5ba454b4b89a9 100644
--- a/PuReMD/src/allocate.c
+++ b/PuReMD/src/allocate.c
@@ -419,6 +419,23 @@ int Allocate_Matrix( sparse_matrix **pH, int cap, int m, MPI_Comm comm )
 
     return SUCCESS;
 }
+int Allocate_Matrix2( sparse_matrix **pH, int n, int cap, int m, MPI_Comm comm )
+{
+    sparse_matrix *H;
+
+    *pH = (sparse_matrix*)
+          smalloc( sizeof(sparse_matrix), "sparse_matrix", comm );
+    H = *pH;
+    H->n = n;
+    H->cap = cap;
+    H->m = m;
+    H->start = (int*) smalloc( sizeof(int) * cap, "matrix_start", comm );
+    H->end = (int*) smalloc( sizeof(int) * cap, "matrix_end", comm );
+    H->entries = (sparse_matrix_entry*)
+                 smalloc( sizeof(sparse_matrix_entry) * m, "matrix_entries", comm );
+
+    return SUCCESS;
+}
 
 
 void Deallocate_Matrix( sparse_matrix *H )
diff --git a/PuReMD/src/allocate.h b/PuReMD/src/allocate.h
index 271cb054d636f19d1d3beeea09546f0a22b1c380..e4e7a86a5ee7525572af5a4b8f37d6be50144821 100644
--- a/PuReMD/src/allocate.h
+++ b/PuReMD/src/allocate.h
@@ -26,18 +26,24 @@
 int PreAllocate_Space( reax_system*, control_params*, storage*, MPI_Comm );
 
 void reax_atom_Copy( reax_atom*, reax_atom* );
+
 int  Allocate_System( reax_system*, int, int, char* );
 
 int  Allocate_Workspace( reax_system*, control_params*, storage*,
                          int, int, MPI_Comm, char* );
 
 void Allocate_Grid( reax_system*, MPI_Comm );
+
 void Deallocate_Grid( grid* );
 
-int  Allocate_MPI_Buffers( mpi_datatypes*, int, neighbor_proc*, char* );
+int Allocate_MPI_Buffers( mpi_datatypes*, int, neighbor_proc*, char* );
 
 int Allocate_Matrix( sparse_matrix**, int, int, MPI_Comm );
 
+int Allocate_Matrix2( sparse_matrix**, int, int, int, MPI_Comm );
+
+void Deallocate_Matrix( sparse_matrix * );
+
 int Allocate_HBond_List( int, int, int*, int*, reax_list* );
 
 int Allocate_Bond_List( int, int*, reax_list* );
diff --git a/PuReMD/src/basic_comm.c b/PuReMD/src/basic_comm.c
index 96c8397653e440d95feb25c9b074dc5b84de24d1..3014299478da41545f9bdd0da2efc5f4e379d7db 100644
--- a/PuReMD/src/basic_comm.c
+++ b/PuReMD/src/basic_comm.c
@@ -29,6 +29,19 @@
 #endif
 
 #if defined(PURE_REAX)
+void int_packer( void *dummy, mpi_out_data *out_buf )
+{
+    int i;
+    int *buf = (int*) dummy;
+    int *out = (int*) out_buf->out_atoms;
+
+    for ( i = 0; i < out_buf->cnt; ++i )
+    {
+        out[i] = buf[ out_buf->index[i] ];
+    }
+}
+
+
 void real_packer( void *dummy, mpi_out_data *out_buf )
 {
     int i;
@@ -63,7 +76,7 @@ void rvec2_packer( void *dummy, mpi_out_data *out_buf )
 
 
 void Dist( reax_system* system, mpi_datatypes *mpi_data,
-           void *buf, MPI_Datatype type, int scale, dist_packer pack )
+        void *buf, MPI_Datatype type, int scale, dist_packer pack )
 {
     int d;
     mpi_out_data *out_bufs;
@@ -84,26 +97,26 @@ void Dist( reax_system* system, mpi_datatypes *mpi_data,
         nbr1 = &(system->my_nbrs[2 * d]);
         if ( nbr1->atoms_cnt )
             MPI_Irecv( buf + nbr1->atoms_str * scale, nbr1->atoms_cnt, type,
-                       nbr1->rank, 2 * d + 1, comm, &req1 );
+                    nbr1->rank, 2 * d + 1, comm, &req1 );
 
         nbr2 = &(system->my_nbrs[2 * d + 1]);
         if ( nbr2->atoms_cnt )
             MPI_Irecv( buf + nbr2->atoms_str * scale, nbr2->atoms_cnt, type,
-                       nbr2->rank, 2 * d, comm, &req2 );
+                    nbr2->rank, 2 * d, comm, &req2 );
 
         /* send both messages in dimension d */
         if ( out_bufs[2 * d].cnt )
         {
             pack( buf, out_bufs + (2 * d) );
             MPI_Send( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt, type,
-                      nbr1->rank, 2 * d, comm );
+                    nbr1->rank, 2 * d, comm );
         }
 
         if ( out_bufs[2 * d + 1].cnt )
         {
             pack( buf, out_bufs + (2 * d + 1) );
             MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt, type,
-                      nbr2->rank, 2 * d + 1, comm );
+                    nbr2->rank, 2 * d + 1, comm );
         }
 
         if ( nbr1->atoms_cnt ) MPI_Wait( &req1, &stat1 );
@@ -138,7 +151,7 @@ void rvec_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
         rvec_Add( buf[ out_buf->index[i] ], in[i] );
 #if defined(DEBUG)
         fprintf( stderr, "rvec_unpacker: cnt=%d  i =%d  index[i]=%d\n",
-                 out_buf->cnt, i, out_buf->index[i] );
+                out_buf->cnt, i, out_buf->index[i] );
 #endif
     }
 }
@@ -159,7 +172,7 @@ void rvec2_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
 
 
 void Coll( reax_system* system, mpi_datatypes *mpi_data,
-           void *buf, MPI_Datatype type, int scale, coll_unpacker unpack )
+        void *buf, MPI_Datatype type, int scale, coll_unpacker unpack )
 {
     int d;
     void *in1, *in2;
@@ -191,19 +204,19 @@ void Coll( reax_system* system, mpi_datatypes *mpi_data,
         /* send both messages in dimension d */
         if ( nbr1->atoms_cnt )
             MPI_Send( buf + nbr1->atoms_str * scale, nbr1->atoms_cnt, type,
-                      nbr1->rank, 2 * d, comm );
+                    nbr1->rank, 2 * d, comm );
 
         if ( nbr2->atoms_cnt )
             MPI_Send( buf + nbr2->atoms_str * scale, nbr2->atoms_cnt, type,
-                      nbr2->rank, 2 * d + 1, comm );
+                    nbr2->rank, 2 * d + 1, comm );
 
 #if defined(DEBUG)
         fprintf( stderr, "p%d coll[%d] nbr1: str=%d cnt=%d recv=%d\n",
-                 system->my_rank, d, nbr1->atoms_str, nbr1->atoms_cnt,
-                 out_bufs[2 * d].cnt );
+                system->my_rank, d, nbr1->atoms_str, nbr1->atoms_cnt,
+                out_bufs[2 * d].cnt );
         fprintf( stderr, "p%d coll[%d] nbr2: str=%d cnt=%d recv=%d\n",
-                 system->my_rank, d, nbr2->atoms_str, nbr2->atoms_cnt,
-                 out_bufs[2 * d + 1].cnt );
+                system->my_rank, d, nbr2->atoms_str, nbr2->atoms_cnt,
+                out_bufs[2 * d + 1].cnt );
 #endif
 
         if ( out_bufs[2 * d].cnt )
@@ -276,13 +289,13 @@ real Parallel_Vector_Acc( real *v, int n, MPI_Comm comm )
 /*****************************************************************************/
 #if defined(TEST_FORCES)
 void Coll_ids_at_Master( reax_system *system, storage *workspace,
-                         mpi_datatypes *mpi_data )
+        mpi_datatypes *mpi_data )
 {
     int i;
     int *id_list;
 
     MPI_Gather( &system->n, 1, MPI_INT, workspace->rcounts, 1, MPI_INT,
-                MASTER_NODE, mpi_data->world );
+            MASTER_NODE, mpi_data->world );
 
     if ( system->my_rank == MASTER_NODE )
     {
@@ -296,8 +309,8 @@ void Coll_ids_at_Master( reax_system *system, storage *workspace,
         id_list[i] = system->my_atoms[i].orig_id;
 
     MPI_Gatherv( id_list, system->n, MPI_INT,
-                 workspace->id_all, workspace->rcounts, workspace->displs,
-                 MPI_INT, MASTER_NODE, mpi_data->world );
+            workspace->id_all, workspace->rcounts, workspace->displs,
+            MPI_INT, MASTER_NODE, mpi_data->world );
 
     sfree( id_list, "id_list" );
 
@@ -312,11 +325,11 @@ void Coll_ids_at_Master( reax_system *system, storage *workspace,
 
 
 void Coll_rvecs_at_Master( reax_system *system, storage *workspace,
-                           mpi_datatypes *mpi_data, rvec* v )
+        mpi_datatypes *mpi_data, rvec* v )
 {
     MPI_Gatherv( v, system->n, mpi_data->mpi_rvec,
-                 workspace->f_all, workspace->rcounts, workspace->displs,
-                 mpi_data->mpi_rvec, MASTER_NODE, mpi_data->world );
+            workspace->f_all, workspace->rcounts, workspace->displs,
+            mpi_data->mpi_rvec, MASTER_NODE, mpi_data->world );
 }
 
 #endif
diff --git a/PuReMD/src/basic_comm.h b/PuReMD/src/basic_comm.h
index b3d7a5222c786f8c71662547e3a36a77abf8fb92..0dcbada0cd10f0e9eda83e994613f2e248a2738d 100644
--- a/PuReMD/src/basic_comm.h
+++ b/PuReMD/src/basic_comm.h
@@ -24,6 +24,7 @@
 
 #include "reax_types.h"
 
+void int_packer( void*, mpi_out_data* );
 void real_packer( void*, mpi_out_data* );
 void rvec_packer( void*, mpi_out_data* );
 void rvec2_packer( void*, mpi_out_data* );
diff --git a/PuReMD/src/init_md.c b/PuReMD/src/init_md.c
index da661aa4f42f2e3ccd893975cfd684876339edc8..ca3dedbe5a4df50a92824c3cfc24273b9cef7080 100644
--- a/PuReMD/src/init_md.c
+++ b/PuReMD/src/init_md.c
@@ -589,9 +589,9 @@ int  Init_Lists( reax_system *system, control_params *control,
     workspace->U = NULL;
     
     //TODO: uncomment for SAI
-//    Allocate_Matrix( &(workspace->H_spar_patt), workspace->H->n, workspace->H->m );
+    Allocate_Matrix2( &(workspace->H_spar_patt), workspace->H->n, system->local_cap, workspace->H->m, comm );
 //    Allocate_Matrix( &(workspace->H_spar_patt_full), workspace->H->n, 2 * workspace->H->m - workspace->H->n );
-//    Allocate_Matrix( &(workspace->H_app_inv), workspace->H->n, 2 * workspace->H->m - workspace->H->n );
+    Allocate_Matrix2( &(workspace->H_app_inv), workspace->H->n, system->local_cap, workspace->H->m, comm );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n",
diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c
index 1aaa18dffa127e7589c3bd8253aba478c3f1e277..df14d35133beba22d61ed5323cb4678134815753 100644
--- a/PuReMD/src/linear_solvers.c
+++ b/PuReMD/src/linear_solvers.c
@@ -24,11 +24,827 @@
 #include "io_tools.h"
 #include "tool_box.h"
 #include "vector.h"
+#include "allocate.h"
+
+/* Intel MKL */
+#if defined(HAVE_LAPACKE_MKL)
+#include "mkl.h"
+/* reference LAPACK */
+#elif defined(HAVE_LAPACKE)
+#include "lapacke.h"
+#endif
 
 #if defined(CG_PERFORMANCE)
 real t_start, t_elapsed, matvec_time, dot_time;
 #endif
 
+int compare_dbls( const void* arg1, const void* arg2 )
+{   
+    int ret;
+    double a1, a2;
+
+    a1 = *(double *) arg1;
+    a2 = *(double *) arg2;
+
+    if ( a1 < a2 )
+    {   
+        ret = -1;
+    }
+    else if (a1 == a2)
+    {   
+        ret = 0;
+    }
+    else
+    {   
+        ret = 1;
+    }
+
+    return ret;
+}
+
+void qsort_dbls( double *array, int array_len )
+{
+    qsort( array, (size_t)array_len, sizeof(double),
+            compare_dbls );
+}
+
+int find_bucket( double *list, int len, double a )
+{
+    int s, e, m;
+
+    if( len == 0 )
+    {
+        return 0;
+    }
+
+    if ( a > list[len - 1] )
+    {
+        return len;
+    }
+
+    s = 0;
+    e = len - 1;
+
+    while ( s < e )
+    {
+        m = (s + e) / 2;
+
+        if ( list[m] < a )
+        {
+            s = m + 1;
+        }
+        else
+        {
+            e = m;
+        }
+    }
+
+    return s;
+}
+
+void setup_sparse_approx_inverse( reax_system *system, storage *workspace,
+        mpi_datatypes *mpi_data, sparse_matrix *A, sparse_matrix **A_spar_patt,
+        int nprocs, double filter )
+{
+    /* Print_Sparse_Matrix2( system, A, "Charge_Matrix_MPI_Step0.txt" ); */
+
+    int i, bin, total, pos, push;
+    int n, n_gather, m, s_local, s, n_local;
+    int target_proc;
+    int k;
+    int pj, size;
+    int left, right, p, turn;
+
+    real threshold, pivot, tmp;
+    real *input_array;
+    real *samplelist_local, *samplelist;
+    real *pivotlist;
+    real *bucketlist_local, *bucketlist;
+    real *local_entries;
+
+    int *srecv, *sdispls;
+    int *scounts_local, *scounts;
+    int *dspls_local, *dspls;
+    int *bin_elements;
+
+    MPI_Comm comm;
+
+    srecv = NULL;
+    sdispls = NULL;
+    samplelist_local = NULL;
+    samplelist = NULL;
+    pivotlist = NULL;
+    input_array = NULL;
+    bucketlist_local = NULL;
+    bucketlist = NULL;
+    scounts_local = NULL;
+    scounts = NULL;
+    dspls_local = NULL;
+    dspls = NULL;
+    bin_elements = NULL;
+    local_entries = NULL;
+
+    comm = mpi_data->world;
+
+    if ( *A_spar_patt == NULL )
+    {
+        //TODO
+        Allocate_Matrix2(A_spar_patt, A->n, system->local_cap, A->m, comm );
+    }
+
+    else if ( (*A_spar_patt)->m < A->m )
+    {
+        //TODO
+        Deallocate_Matrix( *A_spar_patt );
+        Allocate_Matrix2( A_spar_patt, A->n, system->local_cap, A->m, comm );
+        //Reallocate_Matrix( A_spar_patt, A->n, A->n_max, A->m );
+    }
+
+    m = 0;
+    for( i = 0; i < A->n; ++i )
+    {
+        m += A->end[i] - A->start[i];
+    }
+    /* the sample ratio is 10% */
+    /*n_local = m/10; */
+    n_local = m;
+    s_local = (int) (12.0 * log2(n_local*nprocs));
+    MPI_Allreduce(&n_local, &n, 1, MPI_INT, MPI_SUM, comm);
+    MPI_Reduce(&s_local, &s, 1, MPI_INT, MPI_SUM, MASTER_NODE, comm);
+
+    /* count num. bin elements for each processor, uniform bin sizes */
+    input_array = malloc( sizeof(real) * n_local );
+    scounts_local = malloc( sizeof(int) * nprocs );
+    scounts = malloc( sizeof(int) * nprocs );
+    bin_elements = malloc( sizeof(int) * nprocs );
+    dspls_local = malloc( sizeof(int) * nprocs );
+    bucketlist_local = malloc( sizeof(real) * n_local  );
+    dspls = malloc( sizeof(int) * nprocs );
+    pivotlist = malloc( sizeof(real) *  (nprocs - 1) );
+    samplelist_local = malloc( sizeof(real) * s_local );
+    local_entries = malloc ( sizeof(real) * m );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        samplelist = malloc( sizeof(real) * s );
+        srecv = malloc( sizeof(int) * nprocs );
+        sdispls = malloc( sizeof(int) * nprocs );
+    }
+
+    push = 0;
+    for( i = 0; i < A->n; ++i )
+    {
+        for( pj = A->start[i]; pj < A->end[i]; ++pj )
+        {
+            local_entries[push++] = A->entries[pj].val;
+        }
+    }
+
+    srand( time(NULL) + system->my_rank );
+    for ( i = 0; i < n_local ; i++ )
+    {
+        /* input_array[i] = local_entries[rand( ) % m]; */
+        input_array[i] = local_entries[ i ];
+    }
+
+    for ( i = 0; i < s_local; i++)
+    {
+        /* samplelist_local[i] = input_array[rand( ) % n_local]; */
+        samplelist_local[i] = input_array[ i ];
+    }
+
+    /* gather samples at the root process */
+    MPI_Gather( &s_local, 1, MPI_INT, srecv, 1, MPI_INT, MASTER_NODE, comm );
+
+    if( system->my_rank == MASTER_NODE )
+    {
+        sdispls[0] = 0;
+        for ( i = 0; i < nprocs - 1; ++i )
+        {
+            sdispls[i + 1] = sdispls[i] + srecv[i];
+        }
+    }
+
+    MPI_Gatherv( samplelist_local, s_local, MPI_DOUBLE,
+            samplelist, srecv, sdispls, MPI_DOUBLE, MASTER_NODE, comm);
+
+    /* sort samples at the root process and select pivots */
+    if ( system->my_rank == MASTER_NODE )
+    {
+        qsort_dbls( samplelist, s );
+
+        for ( i = 1; i < nprocs; ++i )
+        {
+            pivotlist[i - 1] = samplelist[(i * s) / nprocs];
+        }
+    }
+
+    /* broadcast pivots */
+    MPI_Bcast( pivotlist, nprocs - 1, MPI_DOUBLE, MASTER_NODE, comm );
+
+    for ( i = 0; i < nprocs; ++i )
+    {
+        scounts_local[i] = 0;
+    }
+
+    for ( i = 0; i < n_local; ++i )
+    {
+        pos = find_bucket( pivotlist, nprocs - 1, input_array[i] );
+        scounts_local[pos]++;
+    }
+
+    for ( i = 0; i < nprocs; ++i )
+    {
+        bin_elements[i] = scounts_local[i];
+        scounts[i] = scounts_local[i];
+    }
+
+    /* compute displacements for MPI comm */
+    dspls_local[0] = 0;
+    for ( i = 0; i < nprocs - 1; ++i )
+    {
+        dspls_local[i + 1] = dspls_local[i] + scounts_local[i];
+    }
+
+    /* bin elements */
+    for ( i = 0; i < n_local; ++i )
+    {
+        bin = find_bucket( pivotlist, nprocs - 1, input_array[i] );
+        pos = dspls_local[bin] + scounts_local[bin] - bin_elements[bin];
+        bucketlist_local[pos] = input_array[i];
+        bin_elements[bin]--;
+    }
+
+    /* determine counts for elements per process */
+    MPI_Allreduce( MPI_IN_PLACE, scounts, nprocs, MPI_INT, MPI_SUM, comm );
+
+    /* find the target process */
+    target_proc = 0;
+    total = 0;
+    k = n*filter;
+    for(i = nprocs - 1; i >= 0; --i )
+    {
+        if( total + scounts[i] >= k )
+        {
+            /* global k becomes local k*/
+            k -= total;
+            target_proc = i;
+            break;
+        }
+        total += scounts[i];
+    }
+
+    n_gather = scounts[target_proc];
+    if( system->my_rank == target_proc )
+    {
+        bucketlist = malloc( sizeof( real ) * n_gather );
+    }
+
+    /* send local buckets to target processor for quickselect */
+    MPI_Gather( scounts_local + target_proc, 1, MPI_INT, scounts, 1, MPI_INT, target_proc, comm );
+    if ( system->my_rank == target_proc )
+    {
+        dspls[0] = 0;
+        for ( i = 0; i < nprocs - 1; ++i )
+        {
+            dspls[i + 1] = dspls[i] + scounts[i];
+        }
+    }
+
+    MPI_Gatherv( bucketlist_local + dspls_local[target_proc], scounts_local[target_proc], MPI_DOUBLE,
+            bucketlist, scounts, dspls, MPI_DOUBLE, target_proc, comm);
+
+    /* apply quick select algorithm at the target process */
+    if( system->my_rank == target_proc)
+    {
+        left = 0;
+        right = n_gather-1;
+
+        turn = 0;
+        while( k ) {
+
+            p  = left;
+            turn = 1 - turn;
+
+            /* alternating pivots in order to handle corner cases */
+            if( turn == 1)
+            {
+                pivot = bucketlist[right];
+            }
+            else
+            {
+                pivot = bucketlist[left];
+            }
+            for( i = left + 1 - turn; i <= right-turn; ++i )
+            {
+                if( bucketlist[i] > pivot )
+                {
+                    tmp = bucketlist[i];
+                    bucketlist[i] = bucketlist[p];
+                    bucketlist[p] = tmp;
+                    p++;
+                }
+            }
+            if(turn == 1)
+            {
+                tmp = bucketlist[p];
+                bucketlist[p] = bucketlist[right];
+                bucketlist[right] = tmp;
+            }
+            else
+            {
+                tmp = bucketlist[p];
+                bucketlist[p] = bucketlist[left];
+                bucketlist[left] = tmp;
+            }
+
+            if( p == k - 1)
+            {
+                threshold = bucketlist[p];
+                break;
+            }
+            else if( p > k - 1 )
+            {
+                right = p - 1;
+            }
+            else
+            {
+                left = p + 1;
+            }
+        }
+        /* comment out if ACKS2 and/or EE is not an option
+           if(threshold < 1.000000)
+           {
+           threshold = 1.000001;
+           } */
+    }
+
+    /*if(system->my_rank == target_proc)
+      fprintf(stdout,"threshold = %.15lf\n", threshold);*/
+    /*broadcast the filtering value*/
+    MPI_Bcast( &threshold, 1, MPI_DOUBLE, target_proc, comm );
+
+    int nnz = 0;
+    /*build entries of that pattern*/
+    for ( i = 0; i < A->n; ++i )
+    {
+        (*A_spar_patt)->start[i] = A->start[i];
+        size = A->start[i];
+
+        for ( pj = A->start[i]; pj < A->end[i]; ++pj )
+        {
+            if ( ( A->entries[pj].val >= threshold )  || ( A->entries[pj].j == i ) )
+            {
+                (*A_spar_patt)->entries[size].val = A->entries[pj].val;
+                (*A_spar_patt)->entries[size].j = A->entries[pj].j;
+                size++;
+                nnz++;
+            }
+        }
+        (*A_spar_patt)->end[i] = size;
+    }
+
+    MPI_Allreduce( MPI_IN_PLACE, &nnz, 1, MPI_INT, MPI_SUM, comm );
+    if( system->my_rank == MASTER_NODE )
+    {
+        fprintf(stdout,"total nnz in all sparsity patterns = %d\nthreshold = %.15lf\n", nnz, threshold);
+        fflush(stdout);
+    }
+
+}
+
+void sparse_approx_inverse(reax_system *system, storage *workspace, mpi_datatypes *mpi_data, 
+        sparse_matrix *A, sparse_matrix *A_spar_patt, sparse_matrix **A_app_inv )
+{
+    int N, M, d_i, d_j;
+    int i, k, pj, j_temp;
+    int local_pos, atom_pos, identity_pos;
+    lapack_int m, n, nrhs, lda, ldb, info;
+    int *pos_x, *X;
+    real *e_j, *dense_matrix;
+
+    int cnt, scale;
+    reax_atom *atom;
+    int *row_nnz;
+    int **j_list;
+    real **val_list;
+
+    int d;
+    mpi_out_data *out_bufs;
+    MPI_Comm comm;
+    MPI_Request req1, req2, req3, req4;
+    int flag1, flag2;
+    MPI_Status stat1, stat2, stat3, stat4;
+    const neighbor_proc *nbr1, *nbr2;
+    int *j_send, *j_recv1, *j_recv2;
+    real *val_send, *val_recv1, *val_recv2;
+
+    real start;
+
+    /* start = Get_Time( ); */
+
+    comm = mpi_data->world;
+
+    if ( *A_app_inv == NULL)
+    {
+        Allocate_Matrix2( A_app_inv, A_spar_patt->n, system->local_cap, A_spar_patt->m, comm );
+    }
+
+    else if ( (*A_app_inv)->m < A_spar_patt->m )
+    {
+        Deallocate_Matrix( *A_app_inv );
+        Allocate_Matrix2( A_app_inv, A_spar_patt->n, system->local_cap, A_spar_patt->m, comm );
+    }
+
+    pos_x = NULL;
+    X = NULL;
+
+    row_nnz = NULL;
+    j_list = NULL;
+    val_list = NULL;
+
+    j_send = NULL;
+    val_send = NULL;
+    j_recv1 = NULL;
+    j_recv2 = NULL;
+    val_recv1 = NULL;
+    val_recv2 = NULL;
+
+    row_nnz = (int *) malloc( sizeof(int) * system->total_cap );
+
+    j_list = (int **) malloc( sizeof(int *) * system->N );
+    val_list = (real **) malloc( sizeof(real *) * system->N );
+
+    for ( i = 0; i < system->total_cap; ++i )
+    {
+        row_nnz[i] = 0;
+    }
+
+    /* mark the atoms that already have their row stored in the local matrix */
+    for ( i = 0; i < system->n; ++i )
+    {
+        row_nnz[i] = A->end[i] - A->start[i];
+    }
+
+    /* Announce the nnz's in each row that will be communicated later */
+    scale = sizeof(int) / sizeof(void);
+    Dist( system, mpi_data, row_nnz, MPI_INT, scale, int_packer );
+
+    comm = mpi_data->comm_mesh3D;
+    out_bufs = mpi_data->out_buffers;
+
+    /*  use a Dist-like approach to send the row information */
+    for ( d = 0; d < 3; ++d)
+    {
+        flag1 = 0;
+        flag2 = 0;
+        cnt = 0;
+
+        /* initiate recvs */
+        nbr1 = &system->my_nbrs[2 * d];
+        if ( nbr1->atoms_cnt )
+        {
+            /* calculate the total data that will be received */
+            cnt = 0;
+            for( i = nbr1->atoms_str; i < (nbr1->atoms_str + nbr1->atoms_cnt); ++i )
+            {
+                if( i >= A->n)
+                {
+                    cnt += row_nnz[i];
+                }
+            }
+
+            /* initiate Irecv */
+            if( cnt )
+            {
+                flag1 = 1;
+                j_recv1 = (int *) malloc( sizeof(int) * cnt );
+                val_recv1 = (real *) malloc( sizeof(real) * cnt );
+                MPI_Irecv( j_recv1, cnt, MPI_INT, nbr1->rank, 2 * d + 1, comm, &req1 );
+                MPI_Irecv( val_recv1, cnt, MPI_DOUBLE, nbr1->rank, 2 * d + 1, comm, &req2 );
+            }
+        }
+
+        nbr2 = &system->my_nbrs[2 * d + 1];
+        if ( nbr1->atoms_cnt )
+        {
+            /* calculate the total data that will be received */
+            cnt = 0;
+            for( i = nbr2->atoms_str; i < (nbr2->atoms_str + nbr2->atoms_cnt); ++i )
+            {
+                if( i >= A->n )
+                {
+                    cnt += row_nnz[i];
+                }
+            }
+
+            /* initiate Irecv */
+            if( cnt )
+            {
+                flag2 = 1;
+                j_recv2 = (int *) malloc( sizeof(int) * cnt );
+                val_recv2 = (real *) malloc( sizeof(real) * cnt );
+                MPI_Irecv( j_recv2, cnt, MPI_INT, nbr2->rank, 2 * d, comm, &req3 );
+                MPI_Irecv( val_recv2, cnt, MPI_DOUBLE, nbr2->rank, 2 * d, comm, &req4 );
+            }
+        }
+
+        /* send both messages in dimension d */
+        if( out_bufs[2 * d].cnt )
+        {
+            cnt = 0;
+            for( i = 0; i < out_bufs[2 * d].cnt; ++i )
+            {
+                cnt += row_nnz[ out_bufs[2 * d].index[i] ];
+            }
+
+            if( cnt )
+            {
+                j_send = (int *) malloc( sizeof(int) * cnt );
+                val_send = (real *) malloc( sizeof(real) * cnt );
+
+                cnt = 0;
+                for( i = 0; i < out_bufs[2 * d].cnt; ++i )
+                {
+                    if( out_bufs[2 * d].index[i] < A->n )
+                    {
+                        for( pj = A->start[ out_bufs[2 * d].index[i] ]; pj < A->end[ out_bufs[2 * d].index[i] ]; ++pj )
+                        {
+                            atom = &system->my_atoms[ A->entries[pj].j ];
+                            j_send[cnt] = atom->orig_id;
+                            val_send[cnt] = A->entries[pj].val;
+                            cnt++;
+                        }
+                    }
+                    else
+                    {
+                        for( pj = 0; pj < row_nnz[ out_bufs[2 * d].index[i] ]; ++pj )
+                        {
+                            j_send[cnt] = j_list[ out_bufs[2 * d].index[i] ][pj];
+                            val_send[cnt] = val_list[ out_bufs[2 * d].index[i] ][pj];
+                            cnt++;
+                        }
+                    }
+                }
+
+                MPI_Send( j_send, cnt, MPI_INT, nbr1->rank, 2 * d, comm );
+                MPI_Send( val_send, cnt, MPI_DOUBLE, nbr1->rank, 2 * d, comm );
+            }
+        }
+
+        if( out_bufs[2 * d + 1].cnt )
+        {
+            cnt = 0;
+            for( i = 0; i < out_bufs[2 * d + 1].cnt; ++i )
+            {
+                cnt += row_nnz[ out_bufs[2 * d + 1].index[i] ];
+            }
+
+            if( cnt )
+            {
+                j_send = (int *) malloc( sizeof(int) * cnt );
+                val_send = (real *) malloc( sizeof(real) * cnt );
+
+                cnt = 0;
+                for( i = 0; i < out_bufs[2 * d + 1].cnt; ++i )
+                {
+                    if( out_bufs[2 * d + 1].index[i] < A->n )
+                    {
+                        for( pj = A->start[ out_bufs[2 * d + 1].index[i] ]; pj < A->end[ out_bufs[2 * d + 1].index[i] ]; ++pj )
+                        {
+                            atom = &system->my_atoms[ A->entries[pj].j ];
+                            j_send[cnt] = atom->orig_id;
+                            val_send[cnt] = A->entries[pj].val;
+                            cnt++;
+                        }
+                    }
+                    else
+                    {
+                        for( pj = 0; pj < row_nnz[ out_bufs[2 * d + 1].index[i] ]; ++pj )
+                        {
+                            j_send[cnt] = j_list[ out_bufs[2 * d + 1].index[i] ][pj];
+                            val_send[cnt] = val_list[ out_bufs[2 * d + 1].index[i] ][pj];
+                            cnt++;
+                        }
+                    }
+                }
+
+                MPI_Send( j_send, cnt, MPI_INT, nbr1->rank, 2 * d + 1, comm );
+                MPI_Send( val_send, cnt, MPI_DOUBLE, nbr1->rank, 2 * d + 1, comm );
+            }
+
+        }
+
+        if( flag1 )
+        {
+            MPI_Wait( &req1, &stat1 );
+            MPI_Wait( &req2, &stat2 );
+
+            cnt = 0;
+            for( i = nbr1->atoms_str; i < (nbr1->atoms_str + nbr1->atoms_cnt); ++i )
+            {
+                if( i >= A->n )
+                {
+                    j_list[i] = (int *) malloc( sizeof(int) *  row_nnz[i] );
+                    val_list[i] = (real *) malloc( sizeof(real) * row_nnz[i] );
+
+                    for( pj = 0; pj < row_nnz[i]; ++pj )
+                    {
+                        j_list[i][pj] = j_recv1[cnt];
+                        val_list[i][pj] = val_recv1[cnt];
+                        cnt++;
+                    }
+                }
+            }
+        }
+
+        if( flag2 )
+        {
+            MPI_Wait( &req3, &stat3 );
+            MPI_Wait( &req4, &stat4 );
+
+
+            cnt = 0;
+            for( i = nbr2->atoms_str; i < (nbr2->atoms_str + nbr2->atoms_cnt); ++i )
+            {
+                if( i >= A->n )
+                {
+                    j_list[i] = (int *) malloc( sizeof(int) *  row_nnz[i] );
+                    val_list[i] = (real *) malloc( sizeof(real) * row_nnz[i] );
+
+                    for( pj = 0; pj < row_nnz[i]; ++pj )
+                    {
+                        j_list[i][pj] = j_recv2[cnt];
+                        val_list[i][pj] = val_recv2[cnt];
+                        cnt++;
+                    }
+                }
+            }
+        }
+    }
+
+    X = (int *) malloc( sizeof(int) * (system->bigN + 1) );
+    pos_x = (int *) malloc( sizeof(int) * (system->bigN + 1) );
+
+    for ( i = 0; i < A_spar_patt->n; ++i )
+    {
+        N = 0;
+        M = 0;
+        for ( k = 0; k <= system->bigN; ++k )
+        {
+            X[k] = 0;
+            pos_x[k] = 0;
+        }
+
+        /* find column indices of nonzeros (which will be the columns indices of the dense matrix) */
+        for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->end[i]; ++pj )
+        {
+            j_temp = A_spar_patt->entries[pj].j;
+            atom = &system->my_atoms[j_temp];
+            ++N;
+
+            /* for each of those indices
+             *              * search through the row of full A of that index */
+
+            /* the case where the local matrix has that index's row */
+            if( j_temp < A->n )
+            {
+                for ( k = A->start[ j_temp ]; k < A->end[ j_temp ]; ++k )
+                {
+                    /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */
+                    atom = &system->my_atoms[ A->entries[k].j ];
+                    X[atom->orig_id] = 1;
+                }
+            }
+
+            /* the case where we communicated that index's row */
+            else
+            {
+                for ( k = 0; k < row_nnz[j_temp]; ++k )
+                {
+                    /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */
+                    X[ j_list[j_temp][k] ] = 1;
+                }
+            }
+        }
+
+        /* enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix */
+        identity_pos = M;
+        atom = &system->my_atoms[ i ];
+        atom_pos = atom->orig_id;
+
+        for ( k = 0; k <= system->bigN; k++)
+        {
+            if ( X[k] != 0 )
+            {
+                pos_x[k] = M;
+                if ( k == atom_pos )
+                {
+                    identity_pos = M;
+                }
+                ++M;
+            }
+        }
+
+        /* allocate memory for NxM dense matrix */
+        dense_matrix = (real *) malloc( sizeof(real) * N * M );
+
+        /* fill in the entries of dense matrix */
+        for ( d_j = 0; d_j < N; ++d_j)
+        {
+            /* all rows are initialized to zero */
+            for ( d_i = 0; d_i < M; ++d_i )
+            {
+                dense_matrix[d_i * N + d_j] = 0.0;
+            }
+            /* change the value if any of the column indices is seen */
+
+            /* it is in the original list */
+            local_pos = A_spar_patt->entries[ A_spar_patt->start[i] + d_j ].j;
+            if( local_pos < 0 || local_pos >= system->N )
+            {
+                fprintf( stdout, "THE LOCAL POSITION OF THE ATOM IS NOT VALID, STOP THE EXECUTION\n");
+                fflush( stdout );
+            }
+            if( local_pos < A->n )
+            {
+                for ( d_i = A->start[local_pos]; d_i < A->end[local_pos]; ++d_i )
+                {
+                    atom = &system->my_atoms[ A->entries[d_i].j ];
+                    if (pos_x[ atom->orig_id ] >= M || d_j >=  N )
+                    {
+                        fprintf( stdout, "CANNOT MAP IT TO THE DENSE MATRIX, STOP THE EXECUTION, orig_id = %d, i =  %d, j = %d, M = %d N = %d\n", atom->orig_id, pos_x[ atom->orig_id ], d_j, M, N );
+                        fflush( stdout );
+                    }
+                    if ( X[ atom->orig_id ] == 1 )
+                    {
+                        dense_matrix[ pos_x[ atom->orig_id ] * N + d_j ] = A->entries[d_i].val;
+                    }
+                }
+            }
+            else
+            {
+                for ( d_i = 0; d_i < row_nnz[ local_pos ]; ++d_i )
+                {
+                    if (pos_x[ j_list[local_pos][d_i] ] >= M || d_j  >= N )
+                    {
+                        fprintf( stdout, "CANNOT MAP IT TO THE DENSE MATRIX, STOP THE EXECUTION, %d %d\n", pos_x[ j_list[local_pos][d_i] ], d_j);
+                        fflush( stdout );
+                    }
+                    if ( X[ j_list[local_pos][d_i] ] == 1 )
+                    {
+                        dense_matrix[ pos_x[ j_list[local_pos][d_i] ] * N + d_j ] = val_list[local_pos][d_i];
+                    }
+                }
+            }
+        }
+
+        /* create the right hand side of the linear equation
+         *          * that is the full column of the identity matrix */
+        e_j = (real *) malloc( sizeof(real) * M );
+
+        for ( k = 0; k < M; ++k )
+        {
+            e_j[k] = 0.0;
+        }
+        e_j[identity_pos] = 1.0;
+
+        /* Solve the overdetermined system AX = B through the least-squares problem:
+         *          *          * min ||B - AX||_2 */
+        m = M;
+        n = N;
+        nrhs = 1;
+        lda = N;
+        ldb = nrhs;
+
+        info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda,
+                e_j, ldb );
+
+        /* Check for the full rank */
+        if ( info > 0 )
+        {
+            fprintf( stderr, "The diagonal element %i of the triangular factor ", info );
+            fprintf( stderr, "of A is zero, so that A does not have full rank;\n" );
+            fprintf( stderr, "the least squares solution could not be computed.\n" );
+            exit( INVALID_INPUT );
+        }
+
+        /* accumulate the resulting vector to build A_app_inv */
+        (*A_app_inv)->start[i] = A_spar_patt->start[i];
+        (*A_app_inv)->end[i] = A_spar_patt->end[i];
+        for ( k = (*A_app_inv)->start[i]; k < (*A_app_inv)->end[i]; ++k)
+        {
+            (*A_app_inv)->entries[k].j = A_spar_patt->entries[k].j;
+            (*A_app_inv)->entries[k].val = e_j[k - A_spar_patt->start[i]];
+        }
+        free( dense_matrix );
+        free( e_j );
+    }
+
+    free( pos_x);
+    free( X );
+
+    /* return Get_Timing_Info( start ); */
+}
 
 void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
 {
@@ -52,23 +868,23 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
 #if defined(HALF_LIST)
         for ( k = si + 1; k < A->end[i]; ++k )
 #else
-        for ( k = si; k < A->end[i]; ++k )
+            for ( k = si; k < A->end[i]; ++k )
 #endif
-        {
-            j = A->entries[k].j;
-            H = A->entries[k].val;
+            {
+                j = A->entries[k].j;
+                H = A->entries[k].val;
 
-            b[i][0] += H * x[j][0];
-            b[i][1] += H * x[j][1];
+                b[i][0] += H * x[j][0];
+                b[i][1] += H * x[j][1];
 
 #if defined(HALF_LIST)
-            // comment out for tryQEq
-            //if( j < A->n ) {
-            b[j][0] += H * x[i][0];
-            b[j][1] += H * x[i][1];
-            //}
+                // comment out for tryQEq
+                //if( j < A->n ) {
+                b[j][0] += H * x[i][0];
+                b[j][1] += H * x[i][1];
+                //}
 #endif
-        }
+            }
     }
 }
 
@@ -285,18 +1101,18 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
 #if defined(HALF_LIST)
         for ( k = si + 1; k < A->end[i]; ++k )
 #else
-        for ( k = si; k < A->end[i]; ++k )
+            for ( k = si; k < A->end[i]; ++k )
 #endif
-        {
-            j = A->entries[k].j;
-            H = A->entries[k].val;
-            
-            b[i] += H * x[j];
+            {
+                j = A->entries[k].j;
+                H = A->entries[k].val;
+
+                b[i] += H * x[j];
 #if defined(HALF_LIST)
-            //if( j < A->n ) // comment out for tryQEq
-            b[j] += H * x[i];
+                //if( j < A->n ) // comment out for tryQEq
+                b[j] += H * x[i];
 #endif
-        }
+            }
     }
 }
 
@@ -307,23 +1123,23 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
  *   x: vector
  *   b: vector (result) */
 static void Sparse_MatVec_full( const sparse_matrix * const A,
-                                const real * const x, real * const b )
+        const real * const x, real * const b )
 {
     //TODO: implement full SpMV in MPI
-//    int i, pj;
-//
-//    Vector_MakeZero( b, A->n );
-//
-//#ifdef _OPENMP
-//    #pragma omp for schedule(static)
-//#endif
-//    for ( i = 0; i < A->n; ++i )
-//    {
-//        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
-//        {
-//            b[i] += A->val[pj] * x[A->j[pj]];
-//        }
-//    }
+    //    int i, pj;
+    //
+    //    Vector_MakeZero( b, A->n );
+    //
+    //#ifdef _OPENMP
+    //    #pragma omp for schedule(static)
+    //#endif
+    //    for ( i = 0; i < A->n; ++i )
+    //    {
+    //        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+    //        {
+    //            b[i] += A->val[pj] * x[A->j[pj]];
+    //        }
+    //    }
 }
 
 
@@ -364,7 +1180,7 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
     }
     //TODO: apply SAI preconditioner here, comment out diagonal preconditioning above
-//    Sparse_MatVec_full( workspace->H_app_inv, workspace->r, workspace->d );
+    //    Sparse_MatVec_full( workspace->H_app_inv, workspace->r, workspace->d );
 
     b_norm = Parallel_Norm( b, system->n, mpi_data->world );
     sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world);
@@ -404,7 +1220,7 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
             workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
         }
         //TODO: apply SAI preconditioner here, comment out diagonal preconditioning above
-//        Sparse_MatVec_full( workspace->H_app_inv, workspace->r, workspace->d );
+        //        Sparse_MatVec_full( workspace->H_app_inv, workspace->r, workspace->d );
 
         sig_old = sig_new;
         sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
@@ -465,14 +1281,14 @@ int CG_test( reax_system *system, storage *workspace, sparse_matrix *H,
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
 
     sig_new = Parallel_Dot( workspace->r, workspace->d, system->n,
-                            mpi_data->world );
+            mpi_data->world );
     sig0 = sig_new;
 #if defined(DEBUG)
     //if( system->my_rank == MASTER_NODE ) {
     fprintf( stderr, "p%d CG:sig_new=%24.15e,d_norm=%24.15e,q_norm=%24.15e\n",
-             system->my_rank, sqrt(sig_new),
-             Parallel_Norm(workspace->d, system->n, mpi_data->world),
-             Parallel_Norm(workspace->q, system->n, mpi_data->world) );
+            system->my_rank, sqrt(sig_new),
+            Parallel_Norm(workspace->d, system->n, mpi_data->world),
+            Parallel_Norm(workspace->q, system->n, mpi_data->world) );
     //Vector_Print( stderr, "d", workspace->d, system->N );
     //Vector_Print( stderr, "q", workspace->q, system->N );
     //}
@@ -606,9 +1422,9 @@ void Backward_Subs( sparse_matrix *U, real *y, real *x )
 
 
 int PCG( reax_system *system, storage *workspace,
-         sparse_matrix *H, real *b, real tol,
-         sparse_matrix *L, sparse_matrix *U, real *x,
-         mpi_datatypes* mpi_data, FILE *fout )
+        sparse_matrix *H, real *b, real tol,
+        sparse_matrix *L, sparse_matrix *U, real *x,
+        mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, me, n, N, scale;
     real tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new;
@@ -642,8 +1458,8 @@ int PCG( reax_system *system, storage *workspace,
     {
         fprintf( stderr, "init_PCG: sig_new=%.15e\n", r_norm );
         fprintf( stderr, "init_PCG: |d|=%.15e |q|=%.15e\n",
-                 Parallel_Norm(workspace->d, n, world),
-                 Parallel_Norm(workspace->q, n, world) );
+                Parallel_Norm(workspace->d, n, world),
+                Parallel_Norm(workspace->q, n, world) );
     }
     MPI_Barrier( world );
 #endif
@@ -694,7 +1510,7 @@ int PCG( reax_system *system, storage *workspace,
 
 #if defined(OLD_STUFF)
 int sCG( reax_system *system, storage *workspace, sparse_matrix *H,
-         real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, j;
     real tmp, alpha, beta, b_norm;
@@ -787,7 +1603,7 @@ int sCG( reax_system *system, storage *workspace, sparse_matrix *H,
 
 
 int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
-           real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
@@ -808,10 +1624,10 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
             workspace->b_prm[i] *= workspace->Hdia_inv[i]; // pre-conditioner
 
         Vector_Sum( workspace->v[0],
-                    1.,  workspace->b_prc, -1., workspace->b_prm, N );
+                1.,  workspace->b_prc, -1., workspace->b_prm, N );
         workspace->g[0] = Norm( workspace->v[0], N );
         Vector_Scale( workspace->v[0],
-                      1. / workspace->g[0], workspace->v[0], N );
+                1. / workspace->g[0], workspace->v[0], N );
 
         // fprintf( stderr, "%10.6f\n", workspace->g[0] );
 
@@ -830,12 +1646,12 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
             {
                 workspace->h[i][j] = Dot(workspace->v[i], workspace->v[j + 1], N);
                 Vector_Add( workspace->v[j + 1],
-                            -workspace->h[i][j], workspace->v[i], N );
+                        -workspace->h[i][j], workspace->v[i], N );
             }
 
             workspace->h[j + 1][j] = Norm( workspace->v[j + 1], N );
             Vector_Scale( workspace->v[j + 1],
-                          1. / workspace->h[j + 1][j], workspace->v[j + 1], N );
+                    1. / workspace->h[j + 1][j], workspace->v[j + 1], N );
             // fprintf(stderr, "%d-%d: orthogonalization completed.\n", itr, j);
 
             /* Givens rotations on the H matrix to make it U */
@@ -849,9 +1665,9 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
                 }
 
                 tmp1 =  workspace->hc[i] * workspace->h[i][j] +
-                        workspace->hs[i] * workspace->h[i + 1][j];
+                    workspace->hs[i] * workspace->h[i + 1][j];
                 tmp2 = -workspace->hs[i] * workspace->h[i][j] +
-                       workspace->hc[i] * workspace->h[i + 1][j];
+                    workspace->hc[i] * workspace->h[i + 1][j];
 
                 workspace->h[i][j] = tmp1;
                 workspace->h[i + 1][j] = tmp2;
@@ -895,7 +1711,7 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
       workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
 
     fprintf( fout, "GMRES outer: %d, inner: %d - |rel residual| = %15.10f\n",
-             itr, j, fabs( workspace->g[j] ) / bnorm );
+            itr, j, fabs( workspace->g[j] ) / bnorm );
 
     if ( itr >= MAX_ITR )
     {
@@ -908,8 +1724,8 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
 
 
 int GMRES_HouseHolder( reax_system *system, storage *workspace,
-                       sparse_matrix *H, real *b, real tol, real *x,
-                       mpi_datatypes* mpi_data, FILE *fout )
+        sparse_matrix *H, real *b, real tol, real *x,
+        mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
@@ -1055,7 +1871,7 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
     //          workspace->b_prc[i], workspace->b_prm[i], x[i] );
 
     fprintf( fout, "GMRES outer:%d  inner:%d iters, |rel residual| = %15.10f\n",
-             itr, j, fabs( workspace->g[j] ) / bnorm );
+            itr, j, fabs( workspace->g[j] ) / bnorm );
 
     if ( itr >= MAX_ITR )
     {
diff --git a/PuReMD/src/linear_solvers.h b/PuReMD/src/linear_solvers.h
index 87c2f0ade19586169029b566aa8871d5a0794a77..292a02ca5213ce21b0d6f11a5cf463eb6e3cd53e 100644
--- a/PuReMD/src/linear_solvers.h
+++ b/PuReMD/src/linear_solvers.h
@@ -24,6 +24,12 @@
 
 #include "reax_types.h"
 
+void setup_sparse_approx_inverse( reax_system*, storage*, mpi_datatypes*, 
+        sparse_matrix *, sparse_matrix **, int, double );
+
+void sparse_approx_inverse( reax_system*, storage*, mpi_datatypes*, 
+        sparse_matrix*, sparse_matrix*, sparse_matrix** );
+
 int GMRES( reax_system*, storage*, sparse_matrix*,
            real*, real, real*, mpi_datatypes*, FILE* );
 int GMRES_HouseHolder( reax_system*, storage*, sparse_matrix*,
diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c
index c8a348c7f5898ab061837fc02f2543793c611d78..a725a2d8d85ac27962fb69dcbb3c404dec2ab31a 100644
--- a/PuReMD/src/qEq.c
+++ b/PuReMD/src/qEq.c
@@ -41,7 +41,7 @@ void Sort_Matrix_Rows( sparse_matrix *A )
         si = A->start[i];
         ei = A->end[i];
         qsort( &(A->entries[si]), ei - si,
-               sizeof(sparse_matrix_entry), compare_matrix_entry );
+                sizeof(sparse_matrix_entry), compare_matrix_entry );
     }
 }
 
@@ -103,7 +103,7 @@ int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
 
 
 void ICHOLT( sparse_matrix *A, real *droptol,
-             sparse_matrix *L, sparse_matrix *U )
+        sparse_matrix *L, sparse_matrix *U )
 {
     sparse_matrix_entry tmp[1000];
     int i, j, pj, k1, k2, tmptop, Utop;
@@ -231,89 +231,89 @@ void ICHOLT( sparse_matrix *A, real *droptol,
 
 
 void Init_MatVec( reax_system *system, simulation_data *data,
-                  control_params *control,  storage *workspace,
-                  mpi_datatypes *mpi_data )
+        control_params *control,  storage *workspace,
+        mpi_datatypes *mpi_data )
 {
     int i; //, fillin;
     reax_atom *atom;
 
     /*if( (data->step - data->prev_steps) % control->refactor == 0 ||
-        workspace->L == NULL ) {
-      //Print_Linear_System( system, control, workspace, data->step );
-      Sort_Matrix_Rows( workspace->H );
-      fprintf( stderr, "H matrix sorted\n" );
-      Calculate_Droptol( workspace->H, workspace->droptol, control->droptol );
-      fprintf( stderr, "drop tolerances calculated\n" );
-      if( workspace->L == NULL ) {
-        fillin = Estimate_LU_Fill( workspace->H, workspace->droptol );
-
-        if( Allocate_Matrix( &(workspace->L), workspace->H->cap, fillin ) == 0 ||
-      Allocate_Matrix( &(workspace->U), workspace->H->cap, fillin ) == 0 ) {
+      workspace->L == NULL ) {
+    //Print_Linear_System( system, control, workspace, data->step );
+    Sort_Matrix_Rows( workspace->H );
+    fprintf( stderr, "H matrix sorted\n" );
+    Calculate_Droptol( workspace->H, workspace->droptol, control->droptol );
+    fprintf( stderr, "drop tolerances calculated\n" );
+    if( workspace->L == NULL ) {
+    fillin = Estimate_LU_Fill( workspace->H, workspace->droptol );
+
+    if( Allocate_Matrix( &(workspace->L), workspace->H->cap, fillin ) == 0 ||
+    Allocate_Matrix( &(workspace->U), workspace->H->cap, fillin ) == 0 ) {
     fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
     MPI_Abort( mpi_data->world, INSUFFICIENT_MEMORY );
-        }
+    }
 
-        workspace->L->n = workspace->H->n;
-        workspace->U->n = workspace->H->n;
-    #if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: n=%d, fillin = %d\n",
-           system->my_rank, workspace->L->n, fillin );
-        fprintf( stderr, "p%d: allocated memory: L = U = %ldMB\n",
-                 system->my_rank,fillin*sizeof(sparse_matrix_entry)/(1024*1024) );
-    #endif
-      }
-
-      ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
-    #if defined(DEBUG_FOCUS)
-      fprintf( stderr, "p%d: icholt finished\n", system->my_rank );
-      //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
-      //Print_Sparse_Matrix2( workspace->L, fname );
-      //Print_Sparse_Matrix( U );
-    #endif
-    }*/
+    workspace->L->n = workspace->H->n;
+    workspace->U->n = workspace->H->n;
+#if defined(DEBUG_FOCUS)
+fprintf( stderr, "p%d: n=%d, fillin = %d\n",
+system->my_rank, workspace->L->n, fillin );
+fprintf( stderr, "p%d: allocated memory: L = U = %ldMB\n",
+system->my_rank,fillin*sizeof(sparse_matrix_entry)/(1024*1024) );
+#endif
+}
+
+ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
+#if defined(DEBUG_FOCUS)
+fprintf( stderr, "p%d: icholt finished\n", system->my_rank );
+    //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+    //Print_Sparse_Matrix2( workspace->L, fname );
+    //Print_Sparse_Matrix( U );
+#endif
+}*/
 
     //TODO: fill in code for setting up and computing SAI, see sPuReMD code,
     //  and remove diagonal preconditioner computation below (workspace->Hdia_inv)
-//    setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
-//            &workspace->H_spar_patt_full, &workspace->H_app_inv,
-//            control->cm_solver_pre_comp_sai_thres );
+    //    setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
+    //            &workspace->H_spar_patt_full, &workspace->H_app_inv,
+    //            control->cm_solver_pre_comp_sai_thres );
 
     for ( i = 0; i < system->n; ++i )
-    {
-        atom = &( system->my_atoms[i] );
-
-        /* init pre-conditioner for H and init solution vectors */
-        workspace->Hdia_inv[i] = 1. / system->reax_param.sbp[ atom->type ].eta;
-        workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
-        workspace->b_t[i] = -1.0;
-        workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
-        workspace->b[i][1] = -1.0;
-
-        /* linear extrapolation for s and for t */
-        // newQEq: no extrapolation!
-        //workspace->s[i] = 2 * atom->s[0] - atom->s[1]; //0;
-        //workspace->t[i] = 2 * atom->t[0] - atom->t[1]; //0;
-        //workspace->x[i][0] = 2 * atom->s[0] - atom->s[1]; //0;
-        //workspace->x[i][1] = 2 * atom->t[0] - atom->t[1]; //0;
-
-        /* quadratic extrapolation for s and t */
-        // workspace->s[i] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
-        // workspace->t[i] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
-        //workspace->x[i][0] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
-        workspace->x[i][1] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
-
-        /* cubic extrapolation for s and t */
-        workspace->x[i][0] = 4 * (atom->s[0] + atom->s[2]) - (6 * atom->s[1] + atom->s[3]);
-        //workspace->x[i][1] = 4*(atom->t[0]+atom->t[2])-(6*atom->t[1]+atom->t[3]);
-
-        // fprintf(stderr, "i=%d s=%f t=%f\n", i, workspace->s[i], workspace->t[i]);
-    }
+{
+    atom = &( system->my_atoms[i] );
+
+    /* init pre-conditioner for H and init solution vectors */
+    workspace->Hdia_inv[i] = 1. / system->reax_param.sbp[ atom->type ].eta;
+    workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
+    workspace->b_t[i] = -1.0;
+    workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
+    workspace->b[i][1] = -1.0;
+
+    /* linear extrapolation for s and for t */
+    // newQEq: no extrapolation!
+    //workspace->s[i] = 2 * atom->s[0] - atom->s[1]; //0;
+    //workspace->t[i] = 2 * atom->t[0] - atom->t[1]; //0;
+    //workspace->x[i][0] = 2 * atom->s[0] - atom->s[1]; //0;
+    //workspace->x[i][1] = 2 * atom->t[0] - atom->t[1]; //0;
+
+    /* quadratic extrapolation for s and t */
+    // workspace->s[i] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
+    // workspace->t[i] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
+    //workspace->x[i][0] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
+    workspace->x[i][1] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
+
+    /* cubic extrapolation for s and t */
+    workspace->x[i][0] = 4 * (atom->s[0] + atom->s[2]) - (6 * atom->s[1] + atom->s[3]);
+    //workspace->x[i][1] = 4*(atom->t[0]+atom->t[2])-(6*atom->t[1]+atom->t[3]);
+
+    // fprintf(stderr, "i=%d s=%f t=%f\n", i, workspace->s[i], workspace->t[i]);
+}
 }
 
 
 
 void Calculate_Charges( reax_system *system, storage *workspace,
-                        mpi_datatypes *mpi_data )
+        mpi_datatypes *mpi_data )
 {
     int        i, scale;
     real       u;//, s_sum, t_sum;
@@ -365,17 +365,37 @@ void Calculate_Charges( reax_system *system, storage *workspace,
 }
 
 
+static void Setup_Preconditioner_QEq( reax_system *system, control_params *control, 
+        simulation_data *data, storage *workspace, mpi_datatypes *mpi_data )
+{
+    /* sort H needed for SpMV's in linear solver, H or H_sp needed for preconditioning */
+    Sort_Matrix_Rows( workspace->H );
+
+    //TODO: add sai filtering value, which will be passed as the last parameter
+    setup_sparse_approx_inverse( system, workspace, mpi_data, workspace->H, &workspace->H_spar_patt, 
+            control->nprocs, 0.1 );
+}
+
+static void Compute_Preconditioner_QEq( reax_system *system, control_params *control, 
+        simulation_data *data, storage *workspace, mpi_datatypes *mpi_data )
+{
+#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
+    sparse_approx_inverse( system, workspace, mpi_data,
+            workspace->H, workspace->H_spar_patt, &workspace->H_app_inv );
+#else
+    fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
+    exit( INVALID_INPUT );
+#endif
+}
+
 void QEq( reax_system *system, control_params *control, simulation_data *data,
-          storage *workspace, output_controls *out_control,
-          mpi_datatypes *mpi_data )
+        storage *workspace, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
     int j, s_matvecs, t_matvecs;
 
     Init_MatVec( system, data, control, workspace, mpi_data );
 
-    //if( data->step == 50010 ) {
-    //  Print_Linear_System( system, control, workspace, data->step );
-    // }
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
     //Print_Linear_System( system, control, workspace, data->step );
@@ -385,10 +405,20 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
     //          control->q_err, workspace->x, mpi_data, out_control->log);
     //t_matvecs = 0;
 
+#if defined(SAI_PRECONDITIONER)
+    if( control->refactor > 0 && ((data->step - data->prev_steps) % control->refactor == 0))
+    {
+        Setup_Preconditioner_QEq( system, control, data, workspace, mpi_data );
+
+        Compute_Preconditioner_QEq( system, control, data, workspace, mpi_data );
+    }
+#endif
+
+
     for ( j = 0; j < system->n; ++j )
         workspace->s[j] = workspace->x[j][0];
     s_matvecs = CG(system, workspace, workspace->H, workspace->b_s,//newQEq sCG
-                   control->q_err, workspace->s, mpi_data, out_control->log );
+            control->q_err, workspace->s, mpi_data, out_control->log );
     for ( j = 0; j < system->n; ++j )
         workspace->x[j][0] = workspace->s[j];
 
@@ -402,7 +432,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
     for ( j = 0; j < system->n; ++j )
         workspace->t[j] = workspace->x[j][1];
     t_matvecs = CG(system, workspace, workspace->H, workspace->b_t,//newQEq sCG
-                   control->q_err, workspace->t, mpi_data, out_control->log );
+            control->q_err, workspace->t, mpi_data, out_control->log );
     for ( j = 0; j < system->n; ++j )
         workspace->x[j][1] = workspace->t[j];
 
diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h
index 0e58175e24aae6f9962e7f00529d7302ee9387d4..81b55c64091e54aa1d5ed1f92b1396d3379fdfbc 100644
--- a/PuReMD/src/reax_types.h
+++ b/PuReMD/src/reax_types.h
@@ -36,10 +36,10 @@
 
 #define PURE_REAX
 //#define LAMMPS_REAX
-
+#define SAI_PRECONDITIONER
+//#define HALF_LIST
 //#define DEBUG
 //#define DEBUG_FOCUS
-//#define HALF_LIST
 //#define TEST_ENERGY
 //#define TEST_FORCES
 //#define CG_PERFORMANCE