From 429f9c06d37e16a30a5c936d93536208100fb240 Mon Sep 17 00:00:00 2001
From: Abdullah Alperen <alperena@msu.edu>
Date: Fri, 7 Sep 2018 22:54:45 -0400
Subject: [PATCH] PuReMD: add SAI setup and computation (LAPACKE is not linked)

---
 PuReMD/aclocal.m4           |  44 +-
 PuReMD/configure.ac         |  33 ++
 PuReMD/src/allocate.c       |  17 +
 PuReMD/src/allocate.h       |   8 +-
 PuReMD/src/basic_comm.c     |  53 +-
 PuReMD/src/basic_comm.h     |   1 +
 PuReMD/src/init_md.c        |   4 +-
 PuReMD/src/linear_solvers.c | 934 +++++++++++++++++++++++++++++++++---
 PuReMD/src/linear_solvers.h |   6 +
 PuReMD/src/qEq.c            | 178 ++++---
 PuReMD/src/reax_types.h     |   4 +-
 11 files changed, 1102 insertions(+), 180 deletions(-)

diff --git a/PuReMD/aclocal.m4 b/PuReMD/aclocal.m4
index 88932683..90f53ceb 100644
--- a/PuReMD/aclocal.m4
+++ b/PuReMD/aclocal.m4
@@ -1,6 +1,6 @@
-# generated automatically by aclocal 1.15.1 -*- Autoconf -*-
+# generated automatically by aclocal 1.15 -*- Autoconf -*-
 
-# Copyright (C) 1996-2017 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -20,7 +20,7 @@ You have another version of autoconf.  It may work, but is not guaranteed to.
 If you have problems, you may need to regenerate the build system entirely.
 To do so, use the procedure documented by the package, typically 'autoreconf'.])])
 
-# Copyright (C) 2002-2017 Free Software Foundation, Inc.
+# Copyright (C) 2002-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -35,7 +35,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION],
 [am__api_version='1.15'
 dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
 dnl require some minimum version.  Point them to the right macro.
-m4_if([$1], [1.15.1], [],
+m4_if([$1], [1.15], [],
       [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
 ])
 
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
 # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
 # This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.15.1])dnl
+[AM_AUTOMAKE_VERSION([1.15])dnl
 m4_ifndef([AC_AUTOCONF_VERSION],
   [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 
 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-
 
-# Copyright (C) 2001-2017 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -110,7 +110,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd`
 
 # AM_CONDITIONAL                                            -*- Autoconf -*-
 
-# Copyright (C) 1997-2017 Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -141,7 +141,7 @@ AC_CONFIG_COMMANDS_PRE(
 Usually this means the macro was only invoked conditionally.]])
 fi])])
 
-# Copyright (C) 1999-2017 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -332,7 +332,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
 
 # Generate code to set up dependency tracking.              -*- Autoconf -*-
 
-# Copyright (C) 1999-2017 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -408,7 +408,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
 
 # Do all the work for Automake.                             -*- Autoconf -*-
 
-# Copyright (C) 1996-2017 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -605,7 +605,7 @@ for _am_header in $config_headers :; do
 done
 echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
 
-# Copyright (C) 2001-2017 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -626,7 +626,7 @@ if test x"${install_sh+set}" != xset; then
 fi
 AC_SUBST([install_sh])])
 
-# Copyright (C) 2003-2017 Free Software Foundation, Inc.
+# Copyright (C) 2003-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -647,7 +647,7 @@ AC_SUBST([am__leading_dot])])
 
 # Check to see how 'make' treats includes.	            -*- Autoconf -*-
 
-# Copyright (C) 2001-2017 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -697,7 +697,7 @@ rm -f confinc confmf
 
 # Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-
 
-# Copyright (C) 1997-2017 Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -736,7 +736,7 @@ fi
 
 # Helper functions for option handling.                     -*- Autoconf -*-
 
-# Copyright (C) 2001-2017 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -765,7 +765,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
 AC_DEFUN([_AM_IF_OPTION],
 [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
 
-# Copyright (C) 1999-2017 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -812,7 +812,7 @@ AC_LANG_POP([C])])
 # For backward compatibility.
 AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
 
-# Copyright (C) 2001-2017 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -831,7 +831,7 @@ AC_DEFUN([AM_RUN_LOG],
 
 # Check to make sure that the build environment is sane.    -*- Autoconf -*-
 
-# Copyright (C) 1996-2017 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -912,7 +912,7 @@ AC_CONFIG_COMMANDS_PRE(
 rm -f conftest.file
 ])
 
-# Copyright (C) 2009-2017 Free Software Foundation, Inc.
+# Copyright (C) 2009-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -972,7 +972,7 @@ AC_SUBST([AM_BACKSLASH])dnl
 _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
 ])
 
-# Copyright (C) 2001-2017 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -1000,7 +1000,7 @@ fi
 INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 AC_SUBST([INSTALL_STRIP_PROGRAM])])
 
-# Copyright (C) 2006-2017 Free Software Foundation, Inc.
+# Copyright (C) 2006-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -1019,7 +1019,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
 
 # Check how to create a tarball.                            -*- Autoconf -*-
 
-# Copyright (C) 2004-2017 Free Software Foundation, Inc.
+# Copyright (C) 2004-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
diff --git a/PuReMD/configure.ac b/PuReMD/configure.ac
index c5d0b357..3df96c97 100644
--- a/PuReMD/configure.ac
+++ b/PuReMD/configure.ac
@@ -24,6 +24,39 @@ AC_DEFUN([CONFIGURE_HEADLINE],
         echo; echo "+++ $1 +++"
 ])
 
+# Check for LAPACKE
+AC_CHECK_HEADERS([mkl.h], [MKL_FOUND_HEADERS="yes"])
+if test "x${MKL_FOUND_HEADERS}" = "xyes"
+then
+    AC_SEARCH_LIBS([LAPACKE_dgels], [mkl_intel_ilp64],
+                   [MKL_FOUND_LIBS="yes"], [MKL_FOUND_LIBS="no"], 
+                   [-lmkl_sequential -lmkl_core -lpthread -lm -ldl])
+    AS_IF([test "x${MKL_FOUND_LIBS}" != "xyes"],
+          [AC_MSG_ERROR([Unable to find MKL LAPACKE library.])])
+    LIBS="${LIBS} -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl"
+    AC_DEFINE([HAVE_LAPACKE_MKL], [1], [Define to 1 if you have MKL LAPACKE support enabled.])
+else
+    AC_CHECK_HEADERS([lapacke.h], [LAPACKE_FOUND_HEADERS="yes"])
+    if test "x${LAPACKE_FOUND_HEADERS}" = "xyes"
+    then
+        AC_SEARCH_LIBS([LAPACKE_dgels], [lapacke],
+                       [LAPACKE_FOUND_LIBS="yes"], [LAPACKE_FOUND_LIBS="no"],
+                       [-llapack])
+        AS_IF([test "x${LAPACKE_FOUND_LIBS}" != "xyes"],
+              [AC_MSG_ERROR([Unable to find LAPACKE library.])])
+        LIBS="${LIBS} -llapack"
+        AC_DEFINE([HAVE_LAPACKE], [1], [Define to 1 if you have LAPACKE support enabled.])
+    else
+        AC_MSG_WARN([
+  -----------------------------------------------
+   Unable to find LAPACKE on this system.
+   Disabling support for dependent methods.
+  -----------------------------------------------])
+    fi
+fi
+
+AC_LANG([C++])
+
 # Checks for programs.
 AC_PROG_CC([cc icc gcc])
 AC_PROG_CPP
diff --git a/PuReMD/src/allocate.c b/PuReMD/src/allocate.c
index 0753ea10..ca67bea5 100644
--- a/PuReMD/src/allocate.c
+++ b/PuReMD/src/allocate.c
@@ -419,6 +419,23 @@ int Allocate_Matrix( sparse_matrix **pH, int cap, int m, MPI_Comm comm )
 
     return SUCCESS;
 }
+int Allocate_Matrix2( sparse_matrix **pH, int n, int cap, int m, MPI_Comm comm )
+{
+    sparse_matrix *H;
+
+    *pH = (sparse_matrix*)
+          smalloc( sizeof(sparse_matrix), "sparse_matrix", comm );
+    H = *pH;
+    H->n = n;
+    H->cap = cap;
+    H->m = m;
+    H->start = (int*) smalloc( sizeof(int) * cap, "matrix_start", comm );
+    H->end = (int*) smalloc( sizeof(int) * cap, "matrix_end", comm );
+    H->entries = (sparse_matrix_entry*)
+                 smalloc( sizeof(sparse_matrix_entry) * m, "matrix_entries", comm );
+
+    return SUCCESS;
+}
 
 
 void Deallocate_Matrix( sparse_matrix *H )
diff --git a/PuReMD/src/allocate.h b/PuReMD/src/allocate.h
index 271cb054..e4e7a86a 100644
--- a/PuReMD/src/allocate.h
+++ b/PuReMD/src/allocate.h
@@ -26,18 +26,24 @@
 int PreAllocate_Space( reax_system*, control_params*, storage*, MPI_Comm );
 
 void reax_atom_Copy( reax_atom*, reax_atom* );
+
 int  Allocate_System( reax_system*, int, int, char* );
 
 int  Allocate_Workspace( reax_system*, control_params*, storage*,
                          int, int, MPI_Comm, char* );
 
 void Allocate_Grid( reax_system*, MPI_Comm );
+
 void Deallocate_Grid( grid* );
 
-int  Allocate_MPI_Buffers( mpi_datatypes*, int, neighbor_proc*, char* );
+int Allocate_MPI_Buffers( mpi_datatypes*, int, neighbor_proc*, char* );
 
 int Allocate_Matrix( sparse_matrix**, int, int, MPI_Comm );
 
+int Allocate_Matrix2( sparse_matrix**, int, int, int, MPI_Comm );
+
+void Deallocate_Matrix( sparse_matrix * );
+
 int Allocate_HBond_List( int, int, int*, int*, reax_list* );
 
 int Allocate_Bond_List( int, int*, reax_list* );
diff --git a/PuReMD/src/basic_comm.c b/PuReMD/src/basic_comm.c
index 96c83976..30142994 100644
--- a/PuReMD/src/basic_comm.c
+++ b/PuReMD/src/basic_comm.c
@@ -29,6 +29,19 @@
 #endif
 
 #if defined(PURE_REAX)
+void int_packer( void *dummy, mpi_out_data *out_buf )
+{
+    int i;
+    int *buf = (int*) dummy;
+    int *out = (int*) out_buf->out_atoms;
+
+    for ( i = 0; i < out_buf->cnt; ++i )
+    {
+        out[i] = buf[ out_buf->index[i] ];
+    }
+}
+
+
 void real_packer( void *dummy, mpi_out_data *out_buf )
 {
     int i;
@@ -63,7 +76,7 @@ void rvec2_packer( void *dummy, mpi_out_data *out_buf )
 
 
 void Dist( reax_system* system, mpi_datatypes *mpi_data,
-           void *buf, MPI_Datatype type, int scale, dist_packer pack )
+        void *buf, MPI_Datatype type, int scale, dist_packer pack )
 {
     int d;
     mpi_out_data *out_bufs;
@@ -84,26 +97,26 @@ void Dist( reax_system* system, mpi_datatypes *mpi_data,
         nbr1 = &(system->my_nbrs[2 * d]);
         if ( nbr1->atoms_cnt )
             MPI_Irecv( buf + nbr1->atoms_str * scale, nbr1->atoms_cnt, type,
-                       nbr1->rank, 2 * d + 1, comm, &req1 );
+                    nbr1->rank, 2 * d + 1, comm, &req1 );
 
         nbr2 = &(system->my_nbrs[2 * d + 1]);
         if ( nbr2->atoms_cnt )
             MPI_Irecv( buf + nbr2->atoms_str * scale, nbr2->atoms_cnt, type,
-                       nbr2->rank, 2 * d, comm, &req2 );
+                    nbr2->rank, 2 * d, comm, &req2 );
 
         /* send both messages in dimension d */
         if ( out_bufs[2 * d].cnt )
         {
             pack( buf, out_bufs + (2 * d) );
             MPI_Send( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt, type,
-                      nbr1->rank, 2 * d, comm );
+                    nbr1->rank, 2 * d, comm );
         }
 
         if ( out_bufs[2 * d + 1].cnt )
         {
             pack( buf, out_bufs + (2 * d + 1) );
             MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt, type,
-                      nbr2->rank, 2 * d + 1, comm );
+                    nbr2->rank, 2 * d + 1, comm );
         }
 
         if ( nbr1->atoms_cnt ) MPI_Wait( &req1, &stat1 );
@@ -138,7 +151,7 @@ void rvec_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
         rvec_Add( buf[ out_buf->index[i] ], in[i] );
 #if defined(DEBUG)
         fprintf( stderr, "rvec_unpacker: cnt=%d  i =%d  index[i]=%d\n",
-                 out_buf->cnt, i, out_buf->index[i] );
+                out_buf->cnt, i, out_buf->index[i] );
 #endif
     }
 }
@@ -159,7 +172,7 @@ void rvec2_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
 
 
 void Coll( reax_system* system, mpi_datatypes *mpi_data,
-           void *buf, MPI_Datatype type, int scale, coll_unpacker unpack )
+        void *buf, MPI_Datatype type, int scale, coll_unpacker unpack )
 {
     int d;
     void *in1, *in2;
@@ -191,19 +204,19 @@ void Coll( reax_system* system, mpi_datatypes *mpi_data,
         /* send both messages in dimension d */
         if ( nbr1->atoms_cnt )
             MPI_Send( buf + nbr1->atoms_str * scale, nbr1->atoms_cnt, type,
-                      nbr1->rank, 2 * d, comm );
+                    nbr1->rank, 2 * d, comm );
 
         if ( nbr2->atoms_cnt )
             MPI_Send( buf + nbr2->atoms_str * scale, nbr2->atoms_cnt, type,
-                      nbr2->rank, 2 * d + 1, comm );
+                    nbr2->rank, 2 * d + 1, comm );
 
 #if defined(DEBUG)
         fprintf( stderr, "p%d coll[%d] nbr1: str=%d cnt=%d recv=%d\n",
-                 system->my_rank, d, nbr1->atoms_str, nbr1->atoms_cnt,
-                 out_bufs[2 * d].cnt );
+                system->my_rank, d, nbr1->atoms_str, nbr1->atoms_cnt,
+                out_bufs[2 * d].cnt );
         fprintf( stderr, "p%d coll[%d] nbr2: str=%d cnt=%d recv=%d\n",
-                 system->my_rank, d, nbr2->atoms_str, nbr2->atoms_cnt,
-                 out_bufs[2 * d + 1].cnt );
+                system->my_rank, d, nbr2->atoms_str, nbr2->atoms_cnt,
+                out_bufs[2 * d + 1].cnt );
 #endif
 
         if ( out_bufs[2 * d].cnt )
@@ -276,13 +289,13 @@ real Parallel_Vector_Acc( real *v, int n, MPI_Comm comm )
 /*****************************************************************************/
 #if defined(TEST_FORCES)
 void Coll_ids_at_Master( reax_system *system, storage *workspace,
-                         mpi_datatypes *mpi_data )
+        mpi_datatypes *mpi_data )
 {
     int i;
     int *id_list;
 
     MPI_Gather( &system->n, 1, MPI_INT, workspace->rcounts, 1, MPI_INT,
-                MASTER_NODE, mpi_data->world );
+            MASTER_NODE, mpi_data->world );
 
     if ( system->my_rank == MASTER_NODE )
     {
@@ -296,8 +309,8 @@ void Coll_ids_at_Master( reax_system *system, storage *workspace,
         id_list[i] = system->my_atoms[i].orig_id;
 
     MPI_Gatherv( id_list, system->n, MPI_INT,
-                 workspace->id_all, workspace->rcounts, workspace->displs,
-                 MPI_INT, MASTER_NODE, mpi_data->world );
+            workspace->id_all, workspace->rcounts, workspace->displs,
+            MPI_INT, MASTER_NODE, mpi_data->world );
 
     sfree( id_list, "id_list" );
 
@@ -312,11 +325,11 @@ void Coll_ids_at_Master( reax_system *system, storage *workspace,
 
 
 void Coll_rvecs_at_Master( reax_system *system, storage *workspace,
-                           mpi_datatypes *mpi_data, rvec* v )
+        mpi_datatypes *mpi_data, rvec* v )
 {
     MPI_Gatherv( v, system->n, mpi_data->mpi_rvec,
-                 workspace->f_all, workspace->rcounts, workspace->displs,
-                 mpi_data->mpi_rvec, MASTER_NODE, mpi_data->world );
+            workspace->f_all, workspace->rcounts, workspace->displs,
+            mpi_data->mpi_rvec, MASTER_NODE, mpi_data->world );
 }
 
 #endif
diff --git a/PuReMD/src/basic_comm.h b/PuReMD/src/basic_comm.h
index b3d7a522..0dcbada0 100644
--- a/PuReMD/src/basic_comm.h
+++ b/PuReMD/src/basic_comm.h
@@ -24,6 +24,7 @@
 
 #include "reax_types.h"
 
+void int_packer( void*, mpi_out_data* );
 void real_packer( void*, mpi_out_data* );
 void rvec_packer( void*, mpi_out_data* );
 void rvec2_packer( void*, mpi_out_data* );
diff --git a/PuReMD/src/init_md.c b/PuReMD/src/init_md.c
index da661aa4..ca3dedbe 100644
--- a/PuReMD/src/init_md.c
+++ b/PuReMD/src/init_md.c
@@ -589,9 +589,9 @@ int  Init_Lists( reax_system *system, control_params *control,
     workspace->U = NULL;
     
     //TODO: uncomment for SAI
-//    Allocate_Matrix( &(workspace->H_spar_patt), workspace->H->n, workspace->H->m );
+    Allocate_Matrix2( &(workspace->H_spar_patt), workspace->H->n, system->local_cap, workspace->H->m, comm );
 //    Allocate_Matrix( &(workspace->H_spar_patt_full), workspace->H->n, 2 * workspace->H->m - workspace->H->n );
-//    Allocate_Matrix( &(workspace->H_app_inv), workspace->H->n, 2 * workspace->H->m - workspace->H->n );
+    Allocate_Matrix2( &(workspace->H_app_inv), workspace->H->n, system->local_cap, workspace->H->m, comm );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n",
diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c
index 1aaa18df..df14d351 100644
--- a/PuReMD/src/linear_solvers.c
+++ b/PuReMD/src/linear_solvers.c
@@ -24,11 +24,827 @@
 #include "io_tools.h"
 #include "tool_box.h"
 #include "vector.h"
+#include "allocate.h"
+
+/* Intel MKL */
+#if defined(HAVE_LAPACKE_MKL)
+#include "mkl.h"
+/* reference LAPACK */
+#elif defined(HAVE_LAPACKE)
+#include "lapacke.h"
+#endif
 
 #if defined(CG_PERFORMANCE)
 real t_start, t_elapsed, matvec_time, dot_time;
 #endif
 
+int compare_dbls( const void* arg1, const void* arg2 )
+{   
+    int ret;
+    double a1, a2;
+
+    a1 = *(double *) arg1;
+    a2 = *(double *) arg2;
+
+    if ( a1 < a2 )
+    {   
+        ret = -1;
+    }
+    else if (a1 == a2)
+    {   
+        ret = 0;
+    }
+    else
+    {   
+        ret = 1;
+    }
+
+    return ret;
+}
+
+void qsort_dbls( double *array, int array_len )
+{
+    qsort( array, (size_t)array_len, sizeof(double),
+            compare_dbls );
+}
+
+int find_bucket( double *list, int len, double a )
+{
+    int s, e, m;
+
+    if( len == 0 )
+    {
+        return 0;
+    }
+
+    if ( a > list[len - 1] )
+    {
+        return len;
+    }
+
+    s = 0;
+    e = len - 1;
+
+    while ( s < e )
+    {
+        m = (s + e) / 2;
+
+        if ( list[m] < a )
+        {
+            s = m + 1;
+        }
+        else
+        {
+            e = m;
+        }
+    }
+
+    return s;
+}
+
+void setup_sparse_approx_inverse( reax_system *system, storage *workspace,
+        mpi_datatypes *mpi_data, sparse_matrix *A, sparse_matrix **A_spar_patt,
+        int nprocs, double filter )
+{
+    /* Print_Sparse_Matrix2( system, A, "Charge_Matrix_MPI_Step0.txt" ); */
+
+    int i, bin, total, pos, push;
+    int n, n_gather, m, s_local, s, n_local;
+    int target_proc;
+    int k;
+    int pj, size;
+    int left, right, p, turn;
+
+    real threshold, pivot, tmp;
+    real *input_array;
+    real *samplelist_local, *samplelist;
+    real *pivotlist;
+    real *bucketlist_local, *bucketlist;
+    real *local_entries;
+
+    int *srecv, *sdispls;
+    int *scounts_local, *scounts;
+    int *dspls_local, *dspls;
+    int *bin_elements;
+
+    MPI_Comm comm;
+
+    srecv = NULL;
+    sdispls = NULL;
+    samplelist_local = NULL;
+    samplelist = NULL;
+    pivotlist = NULL;
+    input_array = NULL;
+    bucketlist_local = NULL;
+    bucketlist = NULL;
+    scounts_local = NULL;
+    scounts = NULL;
+    dspls_local = NULL;
+    dspls = NULL;
+    bin_elements = NULL;
+    local_entries = NULL;
+
+    comm = mpi_data->world;
+
+    if ( *A_spar_patt == NULL )
+    {
+        //TODO
+        Allocate_Matrix2(A_spar_patt, A->n, system->local_cap, A->m, comm );
+    }
+
+    else if ( (*A_spar_patt)->m < A->m )
+    {
+        //TODO
+        Deallocate_Matrix( *A_spar_patt );
+        Allocate_Matrix2( A_spar_patt, A->n, system->local_cap, A->m, comm );
+        //Reallocate_Matrix( A_spar_patt, A->n, A->n_max, A->m );
+    }
+
+    m = 0;
+    for( i = 0; i < A->n; ++i )
+    {
+        m += A->end[i] - A->start[i];
+    }
+    /* the sample ratio is 10% */
+    /*n_local = m/10; */
+    n_local = m;
+    s_local = (int) (12.0 * log2(n_local*nprocs));
+    MPI_Allreduce(&n_local, &n, 1, MPI_INT, MPI_SUM, comm);
+    MPI_Reduce(&s_local, &s, 1, MPI_INT, MPI_SUM, MASTER_NODE, comm);
+
+    /* count num. bin elements for each processor, uniform bin sizes */
+    input_array = malloc( sizeof(real) * n_local );
+    scounts_local = malloc( sizeof(int) * nprocs );
+    scounts = malloc( sizeof(int) * nprocs );
+    bin_elements = malloc( sizeof(int) * nprocs );
+    dspls_local = malloc( sizeof(int) * nprocs );
+    bucketlist_local = malloc( sizeof(real) * n_local  );
+    dspls = malloc( sizeof(int) * nprocs );
+    pivotlist = malloc( sizeof(real) *  (nprocs - 1) );
+    samplelist_local = malloc( sizeof(real) * s_local );
+    local_entries = malloc ( sizeof(real) * m );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        samplelist = malloc( sizeof(real) * s );
+        srecv = malloc( sizeof(int) * nprocs );
+        sdispls = malloc( sizeof(int) * nprocs );
+    }
+
+    push = 0;
+    for( i = 0; i < A->n; ++i )
+    {
+        for( pj = A->start[i]; pj < A->end[i]; ++pj )
+        {
+            local_entries[push++] = A->entries[pj].val;
+        }
+    }
+
+    srand( time(NULL) + system->my_rank );
+    for ( i = 0; i < n_local ; i++ )
+    {
+        /* input_array[i] = local_entries[rand( ) % m]; */
+        input_array[i] = local_entries[ i ];
+    }
+
+    for ( i = 0; i < s_local; i++)
+    {
+        /* samplelist_local[i] = input_array[rand( ) % n_local]; */
+        samplelist_local[i] = input_array[ i ];
+    }
+
+    /* gather samples at the root process */
+    MPI_Gather( &s_local, 1, MPI_INT, srecv, 1, MPI_INT, MASTER_NODE, comm );
+
+    if( system->my_rank == MASTER_NODE )
+    {
+        sdispls[0] = 0;
+        for ( i = 0; i < nprocs - 1; ++i )
+        {
+            sdispls[i + 1] = sdispls[i] + srecv[i];
+        }
+    }
+
+    MPI_Gatherv( samplelist_local, s_local, MPI_DOUBLE,
+            samplelist, srecv, sdispls, MPI_DOUBLE, MASTER_NODE, comm);
+
+    /* sort samples at the root process and select pivots */
+    if ( system->my_rank == MASTER_NODE )
+    {
+        qsort_dbls( samplelist, s );
+
+        for ( i = 1; i < nprocs; ++i )
+        {
+            pivotlist[i - 1] = samplelist[(i * s) / nprocs];
+        }
+    }
+
+    /* broadcast pivots */
+    MPI_Bcast( pivotlist, nprocs - 1, MPI_DOUBLE, MASTER_NODE, comm );
+
+    for ( i = 0; i < nprocs; ++i )
+    {
+        scounts_local[i] = 0;
+    }
+
+    for ( i = 0; i < n_local; ++i )
+    {
+        pos = find_bucket( pivotlist, nprocs - 1, input_array[i] );
+        scounts_local[pos]++;
+    }
+
+    for ( i = 0; i < nprocs; ++i )
+    {
+        bin_elements[i] = scounts_local[i];
+        scounts[i] = scounts_local[i];
+    }
+
+    /* compute displacements for MPI comm */
+    dspls_local[0] = 0;
+    for ( i = 0; i < nprocs - 1; ++i )
+    {
+        dspls_local[i + 1] = dspls_local[i] + scounts_local[i];
+    }
+
+    /* bin elements */
+    for ( i = 0; i < n_local; ++i )
+    {
+        bin = find_bucket( pivotlist, nprocs - 1, input_array[i] );
+        pos = dspls_local[bin] + scounts_local[bin] - bin_elements[bin];
+        bucketlist_local[pos] = input_array[i];
+        bin_elements[bin]--;
+    }
+
+    /* determine counts for elements per process */
+    MPI_Allreduce( MPI_IN_PLACE, scounts, nprocs, MPI_INT, MPI_SUM, comm );
+
+    /* find the target process */
+    target_proc = 0;
+    total = 0;
+    k = n*filter;
+    for(i = nprocs - 1; i >= 0; --i )
+    {
+        if( total + scounts[i] >= k )
+        {
+            /* global k becomes local k*/
+            k -= total;
+            target_proc = i;
+            break;
+        }
+        total += scounts[i];
+    }
+
+    n_gather = scounts[target_proc];
+    if( system->my_rank == target_proc )
+    {
+        bucketlist = malloc( sizeof( real ) * n_gather );
+    }
+
+    /* send local buckets to target processor for quickselect */
+    MPI_Gather( scounts_local + target_proc, 1, MPI_INT, scounts, 1, MPI_INT, target_proc, comm );
+    if ( system->my_rank == target_proc )
+    {
+        dspls[0] = 0;
+        for ( i = 0; i < nprocs - 1; ++i )
+        {
+            dspls[i + 1] = dspls[i] + scounts[i];
+        }
+    }
+
+    MPI_Gatherv( bucketlist_local + dspls_local[target_proc], scounts_local[target_proc], MPI_DOUBLE,
+            bucketlist, scounts, dspls, MPI_DOUBLE, target_proc, comm);
+
+    /* apply quick select algorithm at the target process */
+    if( system->my_rank == target_proc)
+    {
+        left = 0;
+        right = n_gather-1;
+
+        turn = 0;
+        while( k ) {
+
+            p  = left;
+            turn = 1 - turn;
+
+            /* alternating pivots in order to handle corner cases */
+            if( turn == 1)
+            {
+                pivot = bucketlist[right];
+            }
+            else
+            {
+                pivot = bucketlist[left];
+            }
+            for( i = left + 1 - turn; i <= right-turn; ++i )
+            {
+                if( bucketlist[i] > pivot )
+                {
+                    tmp = bucketlist[i];
+                    bucketlist[i] = bucketlist[p];
+                    bucketlist[p] = tmp;
+                    p++;
+                }
+            }
+            if(turn == 1)
+            {
+                tmp = bucketlist[p];
+                bucketlist[p] = bucketlist[right];
+                bucketlist[right] = tmp;
+            }
+            else
+            {
+                tmp = bucketlist[p];
+                bucketlist[p] = bucketlist[left];
+                bucketlist[left] = tmp;
+            }
+
+            if( p == k - 1)
+            {
+                threshold = bucketlist[p];
+                break;
+            }
+            else if( p > k - 1 )
+            {
+                right = p - 1;
+            }
+            else
+            {
+                left = p + 1;
+            }
+        }
+        /* comment out if ACKS2 and/or EE is not an option
+           if(threshold < 1.000000)
+           {
+           threshold = 1.000001;
+           } */
+    }
+
+    /*if(system->my_rank == target_proc)
+      fprintf(stdout,"threshold = %.15lf\n", threshold);*/
+    /*broadcast the filtering value*/
+    MPI_Bcast( &threshold, 1, MPI_DOUBLE, target_proc, comm );
+
+    int nnz = 0;
+    /*build entries of that pattern*/
+    for ( i = 0; i < A->n; ++i )
+    {
+        (*A_spar_patt)->start[i] = A->start[i];
+        size = A->start[i];
+
+        for ( pj = A->start[i]; pj < A->end[i]; ++pj )
+        {
+            if ( ( A->entries[pj].val >= threshold )  || ( A->entries[pj].j == i ) )
+            {
+                (*A_spar_patt)->entries[size].val = A->entries[pj].val;
+                (*A_spar_patt)->entries[size].j = A->entries[pj].j;
+                size++;
+                nnz++;
+            }
+        }
+        (*A_spar_patt)->end[i] = size;
+    }
+
+    MPI_Allreduce( MPI_IN_PLACE, &nnz, 1, MPI_INT, MPI_SUM, comm );
+    if( system->my_rank == MASTER_NODE )
+    {
+        fprintf(stdout,"total nnz in all sparsity patterns = %d\nthreshold = %.15lf\n", nnz, threshold);
+        fflush(stdout);
+    }
+
+}
+
+void sparse_approx_inverse(reax_system *system, storage *workspace, mpi_datatypes *mpi_data, 
+        sparse_matrix *A, sparse_matrix *A_spar_patt, sparse_matrix **A_app_inv )
+{
+    int N, M, d_i, d_j;
+    int i, k, pj, j_temp;
+    int local_pos, atom_pos, identity_pos;
+    lapack_int m, n, nrhs, lda, ldb, info;
+    int *pos_x, *X;
+    real *e_j, *dense_matrix;
+
+    int cnt, scale;
+    reax_atom *atom;
+    int *row_nnz;
+    int **j_list;
+    real **val_list;
+
+    int d;
+    mpi_out_data *out_bufs;
+    MPI_Comm comm;
+    MPI_Request req1, req2, req3, req4;
+    int flag1, flag2;
+    MPI_Status stat1, stat2, stat3, stat4;
+    const neighbor_proc *nbr1, *nbr2;
+    int *j_send, *j_recv1, *j_recv2;
+    real *val_send, *val_recv1, *val_recv2;
+
+    real start;
+
+    /* start = Get_Time( ); */
+
+    comm = mpi_data->world;
+
+    if ( *A_app_inv == NULL)
+    {
+        Allocate_Matrix2( A_app_inv, A_spar_patt->n, system->local_cap, A_spar_patt->m, comm );
+    }
+
+    else if ( (*A_app_inv)->m < A_spar_patt->m )
+    {
+        Deallocate_Matrix( *A_app_inv );
+        Allocate_Matrix2( A_app_inv, A_spar_patt->n, system->local_cap, A_spar_patt->m, comm );
+    }
+
+    pos_x = NULL;
+    X = NULL;
+
+    row_nnz = NULL;
+    j_list = NULL;
+    val_list = NULL;
+
+    j_send = NULL;
+    val_send = NULL;
+    j_recv1 = NULL;
+    j_recv2 = NULL;
+    val_recv1 = NULL;
+    val_recv2 = NULL;
+
+    row_nnz = (int *) malloc( sizeof(int) * system->total_cap );
+
+    j_list = (int **) malloc( sizeof(int *) * system->N );
+    val_list = (real **) malloc( sizeof(real *) * system->N );
+
+    for ( i = 0; i < system->total_cap; ++i )
+    {
+        row_nnz[i] = 0;
+    }
+
+    /* mark the atoms that already have their row stored in the local matrix */
+    for ( i = 0; i < system->n; ++i )
+    {
+        row_nnz[i] = A->end[i] - A->start[i];
+    }
+
+    /* Announce the nnz's in each row that will be communicated later */
+    scale = sizeof(int) / sizeof(void);
+    Dist( system, mpi_data, row_nnz, MPI_INT, scale, int_packer );
+
+    comm = mpi_data->comm_mesh3D;
+    out_bufs = mpi_data->out_buffers;
+
+    /*  use a Dist-like approach to send the row information */
+    for ( d = 0; d < 3; ++d)
+    {
+        flag1 = 0;
+        flag2 = 0;
+        cnt = 0;
+
+        /* initiate recvs */
+        nbr1 = &system->my_nbrs[2 * d];
+        if ( nbr1->atoms_cnt )
+        {
+            /* calculate the total data that will be received */
+            cnt = 0;
+            for( i = nbr1->atoms_str; i < (nbr1->atoms_str + nbr1->atoms_cnt); ++i )
+            {
+                if( i >= A->n)
+                {
+                    cnt += row_nnz[i];
+                }
+            }
+
+            /* initiate Irecv */
+            if( cnt )
+            {
+                flag1 = 1;
+                j_recv1 = (int *) malloc( sizeof(int) * cnt );
+                val_recv1 = (real *) malloc( sizeof(real) * cnt );
+                MPI_Irecv( j_recv1, cnt, MPI_INT, nbr1->rank, 2 * d + 1, comm, &req1 );
+                MPI_Irecv( val_recv1, cnt, MPI_DOUBLE, nbr1->rank, 2 * d + 1, comm, &req2 );
+            }
+        }
+
+        nbr2 = &system->my_nbrs[2 * d + 1];
+        if ( nbr1->atoms_cnt )
+        {
+            /* calculate the total data that will be received */
+            cnt = 0;
+            for( i = nbr2->atoms_str; i < (nbr2->atoms_str + nbr2->atoms_cnt); ++i )
+            {
+                if( i >= A->n )
+                {
+                    cnt += row_nnz[i];
+                }
+            }
+
+            /* initiate Irecv */
+            if( cnt )
+            {
+                flag2 = 1;
+                j_recv2 = (int *) malloc( sizeof(int) * cnt );
+                val_recv2 = (real *) malloc( sizeof(real) * cnt );
+                MPI_Irecv( j_recv2, cnt, MPI_INT, nbr2->rank, 2 * d, comm, &req3 );
+                MPI_Irecv( val_recv2, cnt, MPI_DOUBLE, nbr2->rank, 2 * d, comm, &req4 );
+            }
+        }
+
+        /* send both messages in dimension d */
+        if( out_bufs[2 * d].cnt )
+        {
+            cnt = 0;
+            for( i = 0; i < out_bufs[2 * d].cnt; ++i )
+            {
+                cnt += row_nnz[ out_bufs[2 * d].index[i] ];
+            }
+
+            if( cnt )
+            {
+                j_send = (int *) malloc( sizeof(int) * cnt );
+                val_send = (real *) malloc( sizeof(real) * cnt );
+
+                cnt = 0;
+                for( i = 0; i < out_bufs[2 * d].cnt; ++i )
+                {
+                    if( out_bufs[2 * d].index[i] < A->n )
+                    {
+                        for( pj = A->start[ out_bufs[2 * d].index[i] ]; pj < A->end[ out_bufs[2 * d].index[i] ]; ++pj )
+                        {
+                            atom = &system->my_atoms[ A->entries[pj].j ];
+                            j_send[cnt] = atom->orig_id;
+                            val_send[cnt] = A->entries[pj].val;
+                            cnt++;
+                        }
+                    }
+                    else
+                    {
+                        for( pj = 0; pj < row_nnz[ out_bufs[2 * d].index[i] ]; ++pj )
+                        {
+                            j_send[cnt] = j_list[ out_bufs[2 * d].index[i] ][pj];
+                            val_send[cnt] = val_list[ out_bufs[2 * d].index[i] ][pj];
+                            cnt++;
+                        }
+                    }
+                }
+
+                MPI_Send( j_send, cnt, MPI_INT, nbr1->rank, 2 * d, comm );
+                MPI_Send( val_send, cnt, MPI_DOUBLE, nbr1->rank, 2 * d, comm );
+            }
+        }
+
+        if( out_bufs[2 * d + 1].cnt )
+        {
+            cnt = 0;
+            for( i = 0; i < out_bufs[2 * d + 1].cnt; ++i )
+            {
+                cnt += row_nnz[ out_bufs[2 * d + 1].index[i] ];
+            }
+
+            if( cnt )
+            {
+                j_send = (int *) malloc( sizeof(int) * cnt );
+                val_send = (real *) malloc( sizeof(real) * cnt );
+
+                cnt = 0;
+                for( i = 0; i < out_bufs[2 * d + 1].cnt; ++i )
+                {
+                    if( out_bufs[2 * d + 1].index[i] < A->n )
+                    {
+                        for( pj = A->start[ out_bufs[2 * d + 1].index[i] ]; pj < A->end[ out_bufs[2 * d + 1].index[i] ]; ++pj )
+                        {
+                            atom = &system->my_atoms[ A->entries[pj].j ];
+                            j_send[cnt] = atom->orig_id;
+                            val_send[cnt] = A->entries[pj].val;
+                            cnt++;
+                        }
+                    }
+                    else
+                    {
+                        for( pj = 0; pj < row_nnz[ out_bufs[2 * d + 1].index[i] ]; ++pj )
+                        {
+                            j_send[cnt] = j_list[ out_bufs[2 * d + 1].index[i] ][pj];
+                            val_send[cnt] = val_list[ out_bufs[2 * d + 1].index[i] ][pj];
+                            cnt++;
+                        }
+                    }
+                }
+
+                MPI_Send( j_send, cnt, MPI_INT, nbr1->rank, 2 * d + 1, comm );
+                MPI_Send( val_send, cnt, MPI_DOUBLE, nbr1->rank, 2 * d + 1, comm );
+            }
+
+        }
+
+        if( flag1 )
+        {
+            MPI_Wait( &req1, &stat1 );
+            MPI_Wait( &req2, &stat2 );
+
+            cnt = 0;
+            for( i = nbr1->atoms_str; i < (nbr1->atoms_str + nbr1->atoms_cnt); ++i )
+            {
+                if( i >= A->n )
+                {
+                    j_list[i] = (int *) malloc( sizeof(int) *  row_nnz[i] );
+                    val_list[i] = (real *) malloc( sizeof(real) * row_nnz[i] );
+
+                    for( pj = 0; pj < row_nnz[i]; ++pj )
+                    {
+                        j_list[i][pj] = j_recv1[cnt];
+                        val_list[i][pj] = val_recv1[cnt];
+                        cnt++;
+                    }
+                }
+            }
+        }
+
+        if( flag2 )
+        {
+            MPI_Wait( &req3, &stat3 );
+            MPI_Wait( &req4, &stat4 );
+
+
+            cnt = 0;
+            for( i = nbr2->atoms_str; i < (nbr2->atoms_str + nbr2->atoms_cnt); ++i )
+            {
+                if( i >= A->n )
+                {
+                    j_list[i] = (int *) malloc( sizeof(int) *  row_nnz[i] );
+                    val_list[i] = (real *) malloc( sizeof(real) * row_nnz[i] );
+
+                    for( pj = 0; pj < row_nnz[i]; ++pj )
+                    {
+                        j_list[i][pj] = j_recv2[cnt];
+                        val_list[i][pj] = val_recv2[cnt];
+                        cnt++;
+                    }
+                }
+            }
+        }
+    }
+
+    X = (int *) malloc( sizeof(int) * (system->bigN + 1) );
+    pos_x = (int *) malloc( sizeof(int) * (system->bigN + 1) );
+
+    for ( i = 0; i < A_spar_patt->n; ++i )
+    {
+        N = 0;
+        M = 0;
+        for ( k = 0; k <= system->bigN; ++k )
+        {
+            X[k] = 0;
+            pos_x[k] = 0;
+        }
+
+        /* find column indices of nonzeros (which will be the columns indices of the dense matrix) */
+        for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->end[i]; ++pj )
+        {
+            j_temp = A_spar_patt->entries[pj].j;
+            atom = &system->my_atoms[j_temp];
+            ++N;
+
+            /* for each of those indices
+             *              * search through the row of full A of that index */
+
+            /* the case where the local matrix has that index's row */
+            if( j_temp < A->n )
+            {
+                for ( k = A->start[ j_temp ]; k < A->end[ j_temp ]; ++k )
+                {
+                    /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */
+                    atom = &system->my_atoms[ A->entries[k].j ];
+                    X[atom->orig_id] = 1;
+                }
+            }
+
+            /* the case where we communicated that index's row */
+            else
+            {
+                for ( k = 0; k < row_nnz[j_temp]; ++k )
+                {
+                    /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */
+                    X[ j_list[j_temp][k] ] = 1;
+                }
+            }
+        }
+
+        /* enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix */
+        identity_pos = M;
+        atom = &system->my_atoms[ i ];
+        atom_pos = atom->orig_id;
+
+        for ( k = 0; k <= system->bigN; k++)
+        {
+            if ( X[k] != 0 )
+            {
+                pos_x[k] = M;
+                if ( k == atom_pos )
+                {
+                    identity_pos = M;
+                }
+                ++M;
+            }
+        }
+
+        /* allocate memory for NxM dense matrix */
+        dense_matrix = (real *) malloc( sizeof(real) * N * M );
+
+        /* fill in the entries of dense matrix */
+        for ( d_j = 0; d_j < N; ++d_j)
+        {
+            /* all rows are initialized to zero */
+            for ( d_i = 0; d_i < M; ++d_i )
+            {
+                dense_matrix[d_i * N + d_j] = 0.0;
+            }
+            /* change the value if any of the column indices is seen */
+
+            /* it is in the original list */
+            local_pos = A_spar_patt->entries[ A_spar_patt->start[i] + d_j ].j;
+            if( local_pos < 0 || local_pos >= system->N )
+            {
+                fprintf( stdout, "THE LOCAL POSITION OF THE ATOM IS NOT VALID, STOP THE EXECUTION\n");
+                fflush( stdout );
+            }
+            if( local_pos < A->n )
+            {
+                for ( d_i = A->start[local_pos]; d_i < A->end[local_pos]; ++d_i )
+                {
+                    atom = &system->my_atoms[ A->entries[d_i].j ];
+                    if (pos_x[ atom->orig_id ] >= M || d_j >=  N )
+                    {
+                        fprintf( stdout, "CANNOT MAP IT TO THE DENSE MATRIX, STOP THE EXECUTION, orig_id = %d, i =  %d, j = %d, M = %d N = %d\n", atom->orig_id, pos_x[ atom->orig_id ], d_j, M, N );
+                        fflush( stdout );
+                    }
+                    if ( X[ atom->orig_id ] == 1 )
+                    {
+                        dense_matrix[ pos_x[ atom->orig_id ] * N + d_j ] = A->entries[d_i].val;
+                    }
+                }
+            }
+            else
+            {
+                for ( d_i = 0; d_i < row_nnz[ local_pos ]; ++d_i )
+                {
+                    if (pos_x[ j_list[local_pos][d_i] ] >= M || d_j  >= N )
+                    {
+                        fprintf( stdout, "CANNOT MAP IT TO THE DENSE MATRIX, STOP THE EXECUTION, %d %d\n", pos_x[ j_list[local_pos][d_i] ], d_j);
+                        fflush( stdout );
+                    }
+                    if ( X[ j_list[local_pos][d_i] ] == 1 )
+                    {
+                        dense_matrix[ pos_x[ j_list[local_pos][d_i] ] * N + d_j ] = val_list[local_pos][d_i];
+                    }
+                }
+            }
+        }
+
+        /* create the right hand side of the linear equation
+         *          * that is the full column of the identity matrix */
+        e_j = (real *) malloc( sizeof(real) * M );
+
+        for ( k = 0; k < M; ++k )
+        {
+            e_j[k] = 0.0;
+        }
+        e_j[identity_pos] = 1.0;
+
+        /* Solve the overdetermined system AX = B through the least-squares problem:
+         *          *          * min ||B - AX||_2 */
+        m = M;
+        n = N;
+        nrhs = 1;
+        lda = N;
+        ldb = nrhs;
+
+        info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda,
+                e_j, ldb );
+
+        /* Check for the full rank */
+        if ( info > 0 )
+        {
+            fprintf( stderr, "The diagonal element %i of the triangular factor ", info );
+            fprintf( stderr, "of A is zero, so that A does not have full rank;\n" );
+            fprintf( stderr, "the least squares solution could not be computed.\n" );
+            exit( INVALID_INPUT );
+        }
+
+        /* accumulate the resulting vector to build A_app_inv */
+        (*A_app_inv)->start[i] = A_spar_patt->start[i];
+        (*A_app_inv)->end[i] = A_spar_patt->end[i];
+        for ( k = (*A_app_inv)->start[i]; k < (*A_app_inv)->end[i]; ++k)
+        {
+            (*A_app_inv)->entries[k].j = A_spar_patt->entries[k].j;
+            (*A_app_inv)->entries[k].val = e_j[k - A_spar_patt->start[i]];
+        }
+        free( dense_matrix );
+        free( e_j );
+    }
+
+    free( pos_x);
+    free( X );
+
+    /* return Get_Timing_Info( start ); */
+}
 
 void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
 {
@@ -52,23 +868,23 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
 #if defined(HALF_LIST)
         for ( k = si + 1; k < A->end[i]; ++k )
 #else
-        for ( k = si; k < A->end[i]; ++k )
+            for ( k = si; k < A->end[i]; ++k )
 #endif
-        {
-            j = A->entries[k].j;
-            H = A->entries[k].val;
+            {
+                j = A->entries[k].j;
+                H = A->entries[k].val;
 
-            b[i][0] += H * x[j][0];
-            b[i][1] += H * x[j][1];
+                b[i][0] += H * x[j][0];
+                b[i][1] += H * x[j][1];
 
 #if defined(HALF_LIST)
-            // comment out for tryQEq
-            //if( j < A->n ) {
-            b[j][0] += H * x[i][0];
-            b[j][1] += H * x[i][1];
-            //}
+                // comment out for tryQEq
+                //if( j < A->n ) {
+                b[j][0] += H * x[i][0];
+                b[j][1] += H * x[i][1];
+                //}
 #endif
-        }
+            }
     }
 }
 
@@ -285,18 +1101,18 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
 #if defined(HALF_LIST)
         for ( k = si + 1; k < A->end[i]; ++k )
 #else
-        for ( k = si; k < A->end[i]; ++k )
+            for ( k = si; k < A->end[i]; ++k )
 #endif
-        {
-            j = A->entries[k].j;
-            H = A->entries[k].val;
-            
-            b[i] += H * x[j];
+            {
+                j = A->entries[k].j;
+                H = A->entries[k].val;
+
+                b[i] += H * x[j];
 #if defined(HALF_LIST)
-            //if( j < A->n ) // comment out for tryQEq
-            b[j] += H * x[i];
+                //if( j < A->n ) // comment out for tryQEq
+                b[j] += H * x[i];
 #endif
-        }
+            }
     }
 }
 
@@ -307,23 +1123,23 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
  *   x: vector
  *   b: vector (result) */
 static void Sparse_MatVec_full( const sparse_matrix * const A,
-                                const real * const x, real * const b )
+        const real * const x, real * const b )
 {
     //TODO: implement full SpMV in MPI
-//    int i, pj;
-//
-//    Vector_MakeZero( b, A->n );
-//
-//#ifdef _OPENMP
-//    #pragma omp for schedule(static)
-//#endif
-//    for ( i = 0; i < A->n; ++i )
-//    {
-//        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
-//        {
-//            b[i] += A->val[pj] * x[A->j[pj]];
-//        }
-//    }
+    //    int i, pj;
+    //
+    //    Vector_MakeZero( b, A->n );
+    //
+    //#ifdef _OPENMP
+    //    #pragma omp for schedule(static)
+    //#endif
+    //    for ( i = 0; i < A->n; ++i )
+    //    {
+    //        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+    //        {
+    //            b[i] += A->val[pj] * x[A->j[pj]];
+    //        }
+    //    }
 }
 
 
@@ -364,7 +1180,7 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
     }
     //TODO: apply SAI preconditioner here, comment out diagonal preconditioning above
-//    Sparse_MatVec_full( workspace->H_app_inv, workspace->r, workspace->d );
+    //    Sparse_MatVec_full( workspace->H_app_inv, workspace->r, workspace->d );
 
     b_norm = Parallel_Norm( b, system->n, mpi_data->world );
     sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world);
@@ -404,7 +1220,7 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
             workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
         }
         //TODO: apply SAI preconditioner here, comment out diagonal preconditioning above
-//        Sparse_MatVec_full( workspace->H_app_inv, workspace->r, workspace->d );
+        //        Sparse_MatVec_full( workspace->H_app_inv, workspace->r, workspace->d );
 
         sig_old = sig_new;
         sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
@@ -465,14 +1281,14 @@ int CG_test( reax_system *system, storage *workspace, sparse_matrix *H,
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
 
     sig_new = Parallel_Dot( workspace->r, workspace->d, system->n,
-                            mpi_data->world );
+            mpi_data->world );
     sig0 = sig_new;
 #if defined(DEBUG)
     //if( system->my_rank == MASTER_NODE ) {
     fprintf( stderr, "p%d CG:sig_new=%24.15e,d_norm=%24.15e,q_norm=%24.15e\n",
-             system->my_rank, sqrt(sig_new),
-             Parallel_Norm(workspace->d, system->n, mpi_data->world),
-             Parallel_Norm(workspace->q, system->n, mpi_data->world) );
+            system->my_rank, sqrt(sig_new),
+            Parallel_Norm(workspace->d, system->n, mpi_data->world),
+            Parallel_Norm(workspace->q, system->n, mpi_data->world) );
     //Vector_Print( stderr, "d", workspace->d, system->N );
     //Vector_Print( stderr, "q", workspace->q, system->N );
     //}
@@ -606,9 +1422,9 @@ void Backward_Subs( sparse_matrix *U, real *y, real *x )
 
 
 int PCG( reax_system *system, storage *workspace,
-         sparse_matrix *H, real *b, real tol,
-         sparse_matrix *L, sparse_matrix *U, real *x,
-         mpi_datatypes* mpi_data, FILE *fout )
+        sparse_matrix *H, real *b, real tol,
+        sparse_matrix *L, sparse_matrix *U, real *x,
+        mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, me, n, N, scale;
     real tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new;
@@ -642,8 +1458,8 @@ int PCG( reax_system *system, storage *workspace,
     {
         fprintf( stderr, "init_PCG: sig_new=%.15e\n", r_norm );
         fprintf( stderr, "init_PCG: |d|=%.15e |q|=%.15e\n",
-                 Parallel_Norm(workspace->d, n, world),
-                 Parallel_Norm(workspace->q, n, world) );
+                Parallel_Norm(workspace->d, n, world),
+                Parallel_Norm(workspace->q, n, world) );
     }
     MPI_Barrier( world );
 #endif
@@ -694,7 +1510,7 @@ int PCG( reax_system *system, storage *workspace,
 
 #if defined(OLD_STUFF)
 int sCG( reax_system *system, storage *workspace, sparse_matrix *H,
-         real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, j;
     real tmp, alpha, beta, b_norm;
@@ -787,7 +1603,7 @@ int sCG( reax_system *system, storage *workspace, sparse_matrix *H,
 
 
 int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
-           real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
@@ -808,10 +1624,10 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
             workspace->b_prm[i] *= workspace->Hdia_inv[i]; // pre-conditioner
 
         Vector_Sum( workspace->v[0],
-                    1.,  workspace->b_prc, -1., workspace->b_prm, N );
+                1.,  workspace->b_prc, -1., workspace->b_prm, N );
         workspace->g[0] = Norm( workspace->v[0], N );
         Vector_Scale( workspace->v[0],
-                      1. / workspace->g[0], workspace->v[0], N );
+                1. / workspace->g[0], workspace->v[0], N );
 
         // fprintf( stderr, "%10.6f\n", workspace->g[0] );
 
@@ -830,12 +1646,12 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
             {
                 workspace->h[i][j] = Dot(workspace->v[i], workspace->v[j + 1], N);
                 Vector_Add( workspace->v[j + 1],
-                            -workspace->h[i][j], workspace->v[i], N );
+                        -workspace->h[i][j], workspace->v[i], N );
             }
 
             workspace->h[j + 1][j] = Norm( workspace->v[j + 1], N );
             Vector_Scale( workspace->v[j + 1],
-                          1. / workspace->h[j + 1][j], workspace->v[j + 1], N );
+                    1. / workspace->h[j + 1][j], workspace->v[j + 1], N );
             // fprintf(stderr, "%d-%d: orthogonalization completed.\n", itr, j);
 
             /* Givens rotations on the H matrix to make it U */
@@ -849,9 +1665,9 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
                 }
 
                 tmp1 =  workspace->hc[i] * workspace->h[i][j] +
-                        workspace->hs[i] * workspace->h[i + 1][j];
+                    workspace->hs[i] * workspace->h[i + 1][j];
                 tmp2 = -workspace->hs[i] * workspace->h[i][j] +
-                       workspace->hc[i] * workspace->h[i + 1][j];
+                    workspace->hc[i] * workspace->h[i + 1][j];
 
                 workspace->h[i][j] = tmp1;
                 workspace->h[i + 1][j] = tmp2;
@@ -895,7 +1711,7 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
       workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
 
     fprintf( fout, "GMRES outer: %d, inner: %d - |rel residual| = %15.10f\n",
-             itr, j, fabs( workspace->g[j] ) / bnorm );
+            itr, j, fabs( workspace->g[j] ) / bnorm );
 
     if ( itr >= MAX_ITR )
     {
@@ -908,8 +1724,8 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
 
 
 int GMRES_HouseHolder( reax_system *system, storage *workspace,
-                       sparse_matrix *H, real *b, real tol, real *x,
-                       mpi_datatypes* mpi_data, FILE *fout )
+        sparse_matrix *H, real *b, real tol, real *x,
+        mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
@@ -1055,7 +1871,7 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
     //          workspace->b_prc[i], workspace->b_prm[i], x[i] );
 
     fprintf( fout, "GMRES outer:%d  inner:%d iters, |rel residual| = %15.10f\n",
-             itr, j, fabs( workspace->g[j] ) / bnorm );
+            itr, j, fabs( workspace->g[j] ) / bnorm );
 
     if ( itr >= MAX_ITR )
     {
diff --git a/PuReMD/src/linear_solvers.h b/PuReMD/src/linear_solvers.h
index 87c2f0ad..292a02ca 100644
--- a/PuReMD/src/linear_solvers.h
+++ b/PuReMD/src/linear_solvers.h
@@ -24,6 +24,12 @@
 
 #include "reax_types.h"
 
+void setup_sparse_approx_inverse( reax_system*, storage*, mpi_datatypes*, 
+        sparse_matrix *, sparse_matrix **, int, double );
+
+void sparse_approx_inverse( reax_system*, storage*, mpi_datatypes*, 
+        sparse_matrix*, sparse_matrix*, sparse_matrix** );
+
 int GMRES( reax_system*, storage*, sparse_matrix*,
            real*, real, real*, mpi_datatypes*, FILE* );
 int GMRES_HouseHolder( reax_system*, storage*, sparse_matrix*,
diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c
index c8a348c7..a725a2d8 100644
--- a/PuReMD/src/qEq.c
+++ b/PuReMD/src/qEq.c
@@ -41,7 +41,7 @@ void Sort_Matrix_Rows( sparse_matrix *A )
         si = A->start[i];
         ei = A->end[i];
         qsort( &(A->entries[si]), ei - si,
-               sizeof(sparse_matrix_entry), compare_matrix_entry );
+                sizeof(sparse_matrix_entry), compare_matrix_entry );
     }
 }
 
@@ -103,7 +103,7 @@ int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
 
 
 void ICHOLT( sparse_matrix *A, real *droptol,
-             sparse_matrix *L, sparse_matrix *U )
+        sparse_matrix *L, sparse_matrix *U )
 {
     sparse_matrix_entry tmp[1000];
     int i, j, pj, k1, k2, tmptop, Utop;
@@ -231,89 +231,89 @@ void ICHOLT( sparse_matrix *A, real *droptol,
 
 
 void Init_MatVec( reax_system *system, simulation_data *data,
-                  control_params *control,  storage *workspace,
-                  mpi_datatypes *mpi_data )
+        control_params *control,  storage *workspace,
+        mpi_datatypes *mpi_data )
 {
     int i; //, fillin;
     reax_atom *atom;
 
     /*if( (data->step - data->prev_steps) % control->refactor == 0 ||
-        workspace->L == NULL ) {
-      //Print_Linear_System( system, control, workspace, data->step );
-      Sort_Matrix_Rows( workspace->H );
-      fprintf( stderr, "H matrix sorted\n" );
-      Calculate_Droptol( workspace->H, workspace->droptol, control->droptol );
-      fprintf( stderr, "drop tolerances calculated\n" );
-      if( workspace->L == NULL ) {
-        fillin = Estimate_LU_Fill( workspace->H, workspace->droptol );
-
-        if( Allocate_Matrix( &(workspace->L), workspace->H->cap, fillin ) == 0 ||
-      Allocate_Matrix( &(workspace->U), workspace->H->cap, fillin ) == 0 ) {
+      workspace->L == NULL ) {
+    //Print_Linear_System( system, control, workspace, data->step );
+    Sort_Matrix_Rows( workspace->H );
+    fprintf( stderr, "H matrix sorted\n" );
+    Calculate_Droptol( workspace->H, workspace->droptol, control->droptol );
+    fprintf( stderr, "drop tolerances calculated\n" );
+    if( workspace->L == NULL ) {
+    fillin = Estimate_LU_Fill( workspace->H, workspace->droptol );
+
+    if( Allocate_Matrix( &(workspace->L), workspace->H->cap, fillin ) == 0 ||
+    Allocate_Matrix( &(workspace->U), workspace->H->cap, fillin ) == 0 ) {
     fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
     MPI_Abort( mpi_data->world, INSUFFICIENT_MEMORY );
-        }
+    }
 
-        workspace->L->n = workspace->H->n;
-        workspace->U->n = workspace->H->n;
-    #if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: n=%d, fillin = %d\n",
-           system->my_rank, workspace->L->n, fillin );
-        fprintf( stderr, "p%d: allocated memory: L = U = %ldMB\n",
-                 system->my_rank,fillin*sizeof(sparse_matrix_entry)/(1024*1024) );
-    #endif
-      }
-
-      ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
-    #if defined(DEBUG_FOCUS)
-      fprintf( stderr, "p%d: icholt finished\n", system->my_rank );
-      //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
-      //Print_Sparse_Matrix2( workspace->L, fname );
-      //Print_Sparse_Matrix( U );
-    #endif
-    }*/
+    workspace->L->n = workspace->H->n;
+    workspace->U->n = workspace->H->n;
+#if defined(DEBUG_FOCUS)
+fprintf( stderr, "p%d: n=%d, fillin = %d\n",
+system->my_rank, workspace->L->n, fillin );
+fprintf( stderr, "p%d: allocated memory: L = U = %ldMB\n",
+system->my_rank,fillin*sizeof(sparse_matrix_entry)/(1024*1024) );
+#endif
+}
+
+ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
+#if defined(DEBUG_FOCUS)
+fprintf( stderr, "p%d: icholt finished\n", system->my_rank );
+    //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+    //Print_Sparse_Matrix2( workspace->L, fname );
+    //Print_Sparse_Matrix( U );
+#endif
+}*/
 
     //TODO: fill in code for setting up and computing SAI, see sPuReMD code,
     //  and remove diagonal preconditioner computation below (workspace->Hdia_inv)
-//    setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
-//            &workspace->H_spar_patt_full, &workspace->H_app_inv,
-//            control->cm_solver_pre_comp_sai_thres );
+    //    setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
+    //            &workspace->H_spar_patt_full, &workspace->H_app_inv,
+    //            control->cm_solver_pre_comp_sai_thres );
 
     for ( i = 0; i < system->n; ++i )
-    {
-        atom = &( system->my_atoms[i] );
-
-        /* init pre-conditioner for H and init solution vectors */
-        workspace->Hdia_inv[i] = 1. / system->reax_param.sbp[ atom->type ].eta;
-        workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
-        workspace->b_t[i] = -1.0;
-        workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
-        workspace->b[i][1] = -1.0;
-
-        /* linear extrapolation for s and for t */
-        // newQEq: no extrapolation!
-        //workspace->s[i] = 2 * atom->s[0] - atom->s[1]; //0;
-        //workspace->t[i] = 2 * atom->t[0] - atom->t[1]; //0;
-        //workspace->x[i][0] = 2 * atom->s[0] - atom->s[1]; //0;
-        //workspace->x[i][1] = 2 * atom->t[0] - atom->t[1]; //0;
-
-        /* quadratic extrapolation for s and t */
-        // workspace->s[i] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
-        // workspace->t[i] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
-        //workspace->x[i][0] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
-        workspace->x[i][1] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
-
-        /* cubic extrapolation for s and t */
-        workspace->x[i][0] = 4 * (atom->s[0] + atom->s[2]) - (6 * atom->s[1] + atom->s[3]);
-        //workspace->x[i][1] = 4*(atom->t[0]+atom->t[2])-(6*atom->t[1]+atom->t[3]);
-
-        // fprintf(stderr, "i=%d s=%f t=%f\n", i, workspace->s[i], workspace->t[i]);
-    }
+{
+    atom = &( system->my_atoms[i] );
+
+    /* init pre-conditioner for H and init solution vectors */
+    workspace->Hdia_inv[i] = 1. / system->reax_param.sbp[ atom->type ].eta;
+    workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
+    workspace->b_t[i] = -1.0;
+    workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
+    workspace->b[i][1] = -1.0;
+
+    /* linear extrapolation for s and for t */
+    // newQEq: no extrapolation!
+    //workspace->s[i] = 2 * atom->s[0] - atom->s[1]; //0;
+    //workspace->t[i] = 2 * atom->t[0] - atom->t[1]; //0;
+    //workspace->x[i][0] = 2 * atom->s[0] - atom->s[1]; //0;
+    //workspace->x[i][1] = 2 * atom->t[0] - atom->t[1]; //0;
+
+    /* quadratic extrapolation for s and t */
+    // workspace->s[i] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
+    // workspace->t[i] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
+    //workspace->x[i][0] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
+    workspace->x[i][1] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
+
+    /* cubic extrapolation for s and t */
+    workspace->x[i][0] = 4 * (atom->s[0] + atom->s[2]) - (6 * atom->s[1] + atom->s[3]);
+    //workspace->x[i][1] = 4*(atom->t[0]+atom->t[2])-(6*atom->t[1]+atom->t[3]);
+
+    // fprintf(stderr, "i=%d s=%f t=%f\n", i, workspace->s[i], workspace->t[i]);
+}
 }
 
 
 
 void Calculate_Charges( reax_system *system, storage *workspace,
-                        mpi_datatypes *mpi_data )
+        mpi_datatypes *mpi_data )
 {
     int        i, scale;
     real       u;//, s_sum, t_sum;
@@ -365,17 +365,37 @@ void Calculate_Charges( reax_system *system, storage *workspace,
 }
 
 
+static void Setup_Preconditioner_QEq( reax_system *system, control_params *control, 
+        simulation_data *data, storage *workspace, mpi_datatypes *mpi_data )
+{
+    /* sort H needed for SpMV's in linear solver, H or H_sp needed for preconditioning */
+    Sort_Matrix_Rows( workspace->H );
+
+    //TODO: add sai filtering value, which will be passed as the last parameter
+    setup_sparse_approx_inverse( system, workspace, mpi_data, workspace->H, &workspace->H_spar_patt, 
+            control->nprocs, 0.1 );
+}
+
+static void Compute_Preconditioner_QEq( reax_system *system, control_params *control, 
+        simulation_data *data, storage *workspace, mpi_datatypes *mpi_data )
+{
+#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
+    sparse_approx_inverse( system, workspace, mpi_data,
+            workspace->H, workspace->H_spar_patt, &workspace->H_app_inv );
+#else
+    fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
+    exit( INVALID_INPUT );
+#endif
+}
+
 void QEq( reax_system *system, control_params *control, simulation_data *data,
-          storage *workspace, output_controls *out_control,
-          mpi_datatypes *mpi_data )
+        storage *workspace, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
     int j, s_matvecs, t_matvecs;
 
     Init_MatVec( system, data, control, workspace, mpi_data );
 
-    //if( data->step == 50010 ) {
-    //  Print_Linear_System( system, control, workspace, data->step );
-    // }
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
     //Print_Linear_System( system, control, workspace, data->step );
@@ -385,10 +405,20 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
     //          control->q_err, workspace->x, mpi_data, out_control->log);
     //t_matvecs = 0;
 
+#if defined(SAI_PRECONDITIONER)
+    if( control->refactor > 0 && ((data->step - data->prev_steps) % control->refactor == 0))
+    {
+        Setup_Preconditioner_QEq( system, control, data, workspace, mpi_data );
+
+        Compute_Preconditioner_QEq( system, control, data, workspace, mpi_data );
+    }
+#endif
+
+
     for ( j = 0; j < system->n; ++j )
         workspace->s[j] = workspace->x[j][0];
     s_matvecs = CG(system, workspace, workspace->H, workspace->b_s,//newQEq sCG
-                   control->q_err, workspace->s, mpi_data, out_control->log );
+            control->q_err, workspace->s, mpi_data, out_control->log );
     for ( j = 0; j < system->n; ++j )
         workspace->x[j][0] = workspace->s[j];
 
@@ -402,7 +432,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
     for ( j = 0; j < system->n; ++j )
         workspace->t[j] = workspace->x[j][1];
     t_matvecs = CG(system, workspace, workspace->H, workspace->b_t,//newQEq sCG
-                   control->q_err, workspace->t, mpi_data, out_control->log );
+            control->q_err, workspace->t, mpi_data, out_control->log );
     for ( j = 0; j < system->n; ++j )
         workspace->x[j][1] = workspace->t[j];
 
diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h
index 0e58175e..81b55c64 100644
--- a/PuReMD/src/reax_types.h
+++ b/PuReMD/src/reax_types.h
@@ -36,10 +36,10 @@
 
 #define PURE_REAX
 //#define LAMMPS_REAX
-
+#define SAI_PRECONDITIONER
+//#define HALF_LIST
 //#define DEBUG
 //#define DEBUG_FOCUS
-//#define HALF_LIST
 //#define TEST_ENERGY
 //#define TEST_FORCES
 //#define CG_PERFORMANCE
-- 
GitLab