From 53be74a64e3f033bfa0d86655951f24816aec56b Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnk@msu.edu>
Date: Wed, 17 Jan 2018 11:34:35 -0500
Subject: [PATCH] sPuReMD: implement sparse approximate inverse
 preconditioning.

---
 .gitignore                |   10 +-
 ar-lib                    |  270 -----
 depcomp                   |  791 -------------
 environ/param.gpu.water   |    1 +
 sPuReMD/configure.ac      |   39 +-
 sPuReMD/src/analyze.c     |   27 +-
 sPuReMD/src/charges.c     | 1484 ++----------------------
 sPuReMD/src/control.c     |    6 +
 sPuReMD/src/geo_tools.c   |    4 +-
 sPuReMD/src/init_md.c     |   16 +-
 sPuReMD/src/lin_alg.c     | 2235 +++++++++++++++++++++++++++++++------
 sPuReMD/src/lin_alg.h     |   35 +
 sPuReMD/src/mytypes.h     |    4 +
 sPuReMD/src/print_utils.c |   26 +-
 sPuReMD/src/restart.c     |    9 +-
 sPuReMD/src/traj.c        |   36 +-
 16 files changed, 2167 insertions(+), 2826 deletions(-)
 delete mode 100755 ar-lib
 delete mode 100755 depcomp

diff --git a/.gitignore b/.gitignore
index 0e82e4b2..71b2700c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,10 +33,12 @@ tags
 .~lock.*
 
 # Autotools
-aclocal.m4
-autom4te.cache
 .deps
 .dirstamp
+aclocal.m4
+autom4te.cache
+ar-lib
+compile
 confdefs.h
 config.log
 config.guess
@@ -45,9 +47,13 @@ config.h.in
 config.status
 config.sub
 configure
+depcomp
+install-sh
 libtool
+ltmain.sh
 Makefile
 Makefile.in
+missing
 stamp-h1
 test-driver
 *.lo
diff --git a/ar-lib b/ar-lib
deleted file mode 100755
index 463b9ec0..00000000
--- a/ar-lib
+++ /dev/null
@@ -1,270 +0,0 @@
-#! /bin/sh
-# Wrapper for Microsoft lib.exe
-
-me=ar-lib
-scriptversion=2012-03-01.08; # UTC
-
-# Copyright (C) 2010-2014 Free Software Foundation, Inc.
-# Written by Peter Rosin <peda@lysator.liu.se>.
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# This file is maintained in Automake, please report
-# bugs to <bug-automake@gnu.org> or send patches to
-# <automake-patches@gnu.org>.
-
-
-# func_error message
-func_error ()
-{
-  echo "$me: $1" 1>&2
-  exit 1
-}
-
-file_conv=
-
-# func_file_conv build_file
-# Convert a $build file to $host form and store it in $file
-# Currently only supports Windows hosts.
-func_file_conv ()
-{
-  file=$1
-  case $file in
-    / | /[!/]*) # absolute file, and not a UNC file
-      if test -z "$file_conv"; then
-	# lazily determine how to convert abs files
-	case `uname -s` in
-	  MINGW*)
-	    file_conv=mingw
-	    ;;
-	  CYGWIN*)
-	    file_conv=cygwin
-	    ;;
-	  *)
-	    file_conv=wine
-	    ;;
-	esac
-      fi
-      case $file_conv in
-	mingw)
-	  file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'`
-	  ;;
-	cygwin)
-	  file=`cygpath -m "$file" || echo "$file"`
-	  ;;
-	wine)
-	  file=`winepath -w "$file" || echo "$file"`
-	  ;;
-      esac
-      ;;
-  esac
-}
-
-# func_at_file at_file operation archive
-# Iterate over all members in AT_FILE performing OPERATION on ARCHIVE
-# for each of them.
-# When interpreting the content of the @FILE, do NOT use func_file_conv,
-# since the user would need to supply preconverted file names to
-# binutils ar, at least for MinGW.
-func_at_file ()
-{
-  operation=$2
-  archive=$3
-  at_file_contents=`cat "$1"`
-  eval set x "$at_file_contents"
-  shift
-
-  for member
-  do
-    $AR -NOLOGO $operation:"$member" "$archive" || exit $?
-  done
-}
-
-case $1 in
-  '')
-     func_error "no command.  Try '$0 --help' for more information."
-     ;;
-  -h | --h*)
-    cat <<EOF
-Usage: $me [--help] [--version] PROGRAM ACTION ARCHIVE [MEMBER...]
-
-Members may be specified in a file named with @FILE.
-EOF
-    exit $?
-    ;;
-  -v | --v*)
-    echo "$me, version $scriptversion"
-    exit $?
-    ;;
-esac
-
-if test $# -lt 3; then
-  func_error "you must specify a program, an action and an archive"
-fi
-
-AR=$1
-shift
-while :
-do
-  if test $# -lt 2; then
-    func_error "you must specify a program, an action and an archive"
-  fi
-  case $1 in
-    -lib | -LIB \
-    | -ltcg | -LTCG \
-    | -machine* | -MACHINE* \
-    | -subsystem* | -SUBSYSTEM* \
-    | -verbose | -VERBOSE \
-    | -wx* | -WX* )
-      AR="$AR $1"
-      shift
-      ;;
-    *)
-      action=$1
-      shift
-      break
-      ;;
-  esac
-done
-orig_archive=$1
-shift
-func_file_conv "$orig_archive"
-archive=$file
-
-# strip leading dash in $action
-action=${action#-}
-
-delete=
-extract=
-list=
-quick=
-replace=
-index=
-create=
-
-while test -n "$action"
-do
-  case $action in
-    d*) delete=yes  ;;
-    x*) extract=yes ;;
-    t*) list=yes    ;;
-    q*) quick=yes   ;;
-    r*) replace=yes ;;
-    s*) index=yes   ;;
-    S*)             ;; # the index is always updated implicitly
-    c*) create=yes  ;;
-    u*)             ;; # TODO: don't ignore the update modifier
-    v*)             ;; # TODO: don't ignore the verbose modifier
-    *)
-      func_error "unknown action specified"
-      ;;
-  esac
-  action=${action#?}
-done
-
-case $delete$extract$list$quick$replace,$index in
-  yes,* | ,yes)
-    ;;
-  yesyes*)
-    func_error "more than one action specified"
-    ;;
-  *)
-    func_error "no action specified"
-    ;;
-esac
-
-if test -n "$delete"; then
-  if test ! -f "$orig_archive"; then
-    func_error "archive not found"
-  fi
-  for member
-  do
-    case $1 in
-      @*)
-        func_at_file "${1#@}" -REMOVE "$archive"
-        ;;
-      *)
-        func_file_conv "$1"
-        $AR -NOLOGO -REMOVE:"$file" "$archive" || exit $?
-        ;;
-    esac
-  done
-
-elif test -n "$extract"; then
-  if test ! -f "$orig_archive"; then
-    func_error "archive not found"
-  fi
-  if test $# -gt 0; then
-    for member
-    do
-      case $1 in
-        @*)
-          func_at_file "${1#@}" -EXTRACT "$archive"
-          ;;
-        *)
-          func_file_conv "$1"
-          $AR -NOLOGO -EXTRACT:"$file" "$archive" || exit $?
-          ;;
-      esac
-    done
-  else
-    $AR -NOLOGO -LIST "$archive" | sed -e 's/\\/\\\\/g' | while read member
-    do
-      $AR -NOLOGO -EXTRACT:"$member" "$archive" || exit $?
-    done
-  fi
-
-elif test -n "$quick$replace"; then
-  if test ! -f "$orig_archive"; then
-    if test -z "$create"; then
-      echo "$me: creating $orig_archive"
-    fi
-    orig_archive=
-  else
-    orig_archive=$archive
-  fi
-
-  for member
-  do
-    case $1 in
-    @*)
-      func_file_conv "${1#@}"
-      set x "$@" "@$file"
-      ;;
-    *)
-      func_file_conv "$1"
-      set x "$@" "$file"
-      ;;
-    esac
-    shift
-    shift
-  done
-
-  if test -n "$orig_archive"; then
-    $AR -NOLOGO -OUT:"$archive" "$orig_archive" "$@" || exit $?
-  else
-    $AR -NOLOGO -OUT:"$archive" "$@" || exit $?
-  fi
-
-elif test -n "$list"; then
-  if test ! -f "$orig_archive"; then
-    func_error "archive not found"
-  fi
-  $AR -NOLOGO -LIST "$archive" || exit $?
-fi
diff --git a/depcomp b/depcomp
deleted file mode 100755
index fc98710e..00000000
--- a/depcomp
+++ /dev/null
@@ -1,791 +0,0 @@
-#! /bin/sh
-# depcomp - compile a program generating dependencies as side-effects
-
-scriptversion=2013-05-30.07; # UTC
-
-# Copyright (C) 1999-2014 Free Software Foundation, Inc.
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>.
-
-case $1 in
-  '')
-    echo "$0: No command.  Try '$0 --help' for more information." 1>&2
-    exit 1;
-    ;;
-  -h | --h*)
-    cat <<\EOF
-Usage: depcomp [--help] [--version] PROGRAM [ARGS]
-
-Run PROGRAMS ARGS to compile a file, generating dependencies
-as side-effects.
-
-Environment variables:
-  depmode     Dependency tracking mode.
-  source      Source file read by 'PROGRAMS ARGS'.
-  object      Object file output by 'PROGRAMS ARGS'.
-  DEPDIR      directory where to store dependencies.
-  depfile     Dependency file to output.
-  tmpdepfile  Temporary file to use when outputting dependencies.
-  libtool     Whether libtool is used (yes/no).
-
-Report bugs to <bug-automake@gnu.org>.
-EOF
-    exit $?
-    ;;
-  -v | --v*)
-    echo "depcomp $scriptversion"
-    exit $?
-    ;;
-esac
-
-# Get the directory component of the given path, and save it in the
-# global variables '$dir'.  Note that this directory component will
-# be either empty or ending with a '/' character.  This is deliberate.
-set_dir_from ()
-{
-  case $1 in
-    */*) dir=`echo "$1" | sed -e 's|/[^/]*$|/|'`;;
-      *) dir=;;
-  esac
-}
-
-# Get the suffix-stripped basename of the given path, and save it the
-# global variable '$base'.
-set_base_from ()
-{
-  base=`echo "$1" | sed -e 's|^.*/||' -e 's/\.[^.]*$//'`
-}
-
-# If no dependency file was actually created by the compiler invocation,
-# we still have to create a dummy depfile, to avoid errors with the
-# Makefile "include basename.Plo" scheme.
-make_dummy_depfile ()
-{
-  echo "#dummy" > "$depfile"
-}
-
-# Factor out some common post-processing of the generated depfile.
-# Requires the auxiliary global variable '$tmpdepfile' to be set.
-aix_post_process_depfile ()
-{
-  # If the compiler actually managed to produce a dependency file,
-  # post-process it.
-  if test -f "$tmpdepfile"; then
-    # Each line is of the form 'foo.o: dependency.h'.
-    # Do two passes, one to just change these to
-    #   $object: dependency.h
-    # and one to simply output
-    #   dependency.h:
-    # which is needed to avoid the deleted-header problem.
-    { sed -e "s,^.*\.[$lower]*:,$object:," < "$tmpdepfile"
-      sed -e "s,^.*\.[$lower]*:[$tab ]*,," -e 's,$,:,' < "$tmpdepfile"
-    } > "$depfile"
-    rm -f "$tmpdepfile"
-  else
-    make_dummy_depfile
-  fi
-}
-
-# A tabulation character.
-tab='	'
-# A newline character.
-nl='
-'
-# Character ranges might be problematic outside the C locale.
-# These definitions help.
-upper=ABCDEFGHIJKLMNOPQRSTUVWXYZ
-lower=abcdefghijklmnopqrstuvwxyz
-digits=0123456789
-alpha=${upper}${lower}
-
-if test -z "$depmode" || test -z "$source" || test -z "$object"; then
-  echo "depcomp: Variables source, object and depmode must be set" 1>&2
-  exit 1
-fi
-
-# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po.
-depfile=${depfile-`echo "$object" |
-  sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`}
-tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`}
-
-rm -f "$tmpdepfile"
-
-# Avoid interferences from the environment.
-gccflag= dashmflag=
-
-# Some modes work just like other modes, but use different flags.  We
-# parameterize here, but still list the modes in the big case below,
-# to make depend.m4 easier to write.  Note that we *cannot* use a case
-# here, because this file can only contain one case statement.
-if test "$depmode" = hp; then
-  # HP compiler uses -M and no extra arg.
-  gccflag=-M
-  depmode=gcc
-fi
-
-if test "$depmode" = dashXmstdout; then
-  # This is just like dashmstdout with a different argument.
-  dashmflag=-xM
-  depmode=dashmstdout
-fi
-
-cygpath_u="cygpath -u -f -"
-if test "$depmode" = msvcmsys; then
-  # This is just like msvisualcpp but w/o cygpath translation.
-  # Just convert the backslash-escaped backslashes to single forward
-  # slashes to satisfy depend.m4
-  cygpath_u='sed s,\\\\,/,g'
-  depmode=msvisualcpp
-fi
-
-if test "$depmode" = msvc7msys; then
-  # This is just like msvc7 but w/o cygpath translation.
-  # Just convert the backslash-escaped backslashes to single forward
-  # slashes to satisfy depend.m4
-  cygpath_u='sed s,\\\\,/,g'
-  depmode=msvc7
-fi
-
-if test "$depmode" = xlc; then
-  # IBM C/C++ Compilers xlc/xlC can output gcc-like dependency information.
-  gccflag=-qmakedep=gcc,-MF
-  depmode=gcc
-fi
-
-case "$depmode" in
-gcc3)
-## gcc 3 implements dependency tracking that does exactly what
-## we want.  Yay!  Note: for some reason libtool 1.4 doesn't like
-## it if -MD -MP comes after the -MF stuff.  Hmm.
-## Unfortunately, FreeBSD c89 acceptance of flags depends upon
-## the command line argument order; so add the flags where they
-## appear in depend2.am.  Note that the slowdown incurred here
-## affects only configure: in makefiles, %FASTDEP% shortcuts this.
-  for arg
-  do
-    case $arg in
-    -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;;
-    *)  set fnord "$@" "$arg" ;;
-    esac
-    shift # fnord
-    shift # $arg
-  done
-  "$@"
-  stat=$?
-  if test $stat -ne 0; then
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  mv "$tmpdepfile" "$depfile"
-  ;;
-
-gcc)
-## Note that this doesn't just cater to obsosete pre-3.x GCC compilers.
-## but also to in-use compilers like IMB xlc/xlC and the HP C compiler.
-## (see the conditional assignment to $gccflag above).
-## There are various ways to get dependency output from gcc.  Here's
-## why we pick this rather obscure method:
-## - Don't want to use -MD because we'd like the dependencies to end
-##   up in a subdir.  Having to rename by hand is ugly.
-##   (We might end up doing this anyway to support other compilers.)
-## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like
-##   -MM, not -M (despite what the docs say).  Also, it might not be
-##   supported by the other compilers which use the 'gcc' depmode.
-## - Using -M directly means running the compiler twice (even worse
-##   than renaming).
-  if test -z "$gccflag"; then
-    gccflag=-MD,
-  fi
-  "$@" -Wp,"$gccflag$tmpdepfile"
-  stat=$?
-  if test $stat -ne 0; then
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  rm -f "$depfile"
-  echo "$object : \\" > "$depfile"
-  # The second -e expression handles DOS-style file names with drive
-  # letters.
-  sed -e 's/^[^:]*: / /' \
-      -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile"
-## This next piece of magic avoids the "deleted header file" problem.
-## The problem is that when a header file which appears in a .P file
-## is deleted, the dependency causes make to die (because there is
-## typically no way to rebuild the header).  We avoid this by adding
-## dummy dependencies for each header file.  Too bad gcc doesn't do
-## this for us directly.
-## Some versions of gcc put a space before the ':'.  On the theory
-## that the space means something, we add a space to the output as
-## well.  hp depmode also adds that space, but also prefixes the VPATH
-## to the object.  Take care to not repeat it in the output.
-## Some versions of the HPUX 10.20 sed can't process this invocation
-## correctly.  Breaking it into two sed invocations is a workaround.
-  tr ' ' "$nl" < "$tmpdepfile" \
-    | sed -e 's/^\\$//' -e '/^$/d' -e "s|.*$object$||" -e '/:$/d' \
-    | sed -e 's/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-hp)
-  # This case exists only to let depend.m4 do its work.  It works by
-  # looking at the text of this script.  This case will never be run,
-  # since it is checked for above.
-  exit 1
-  ;;
-
-sgi)
-  if test "$libtool" = yes; then
-    "$@" "-Wp,-MDupdate,$tmpdepfile"
-  else
-    "$@" -MDupdate "$tmpdepfile"
-  fi
-  stat=$?
-  if test $stat -ne 0; then
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  rm -f "$depfile"
-
-  if test -f "$tmpdepfile"; then  # yes, the sourcefile depend on other files
-    echo "$object : \\" > "$depfile"
-    # Clip off the initial element (the dependent).  Don't try to be
-    # clever and replace this with sed code, as IRIX sed won't handle
-    # lines with more than a fixed number of characters (4096 in
-    # IRIX 6.2 sed, 8192 in IRIX 6.5).  We also remove comment lines;
-    # the IRIX cc adds comments like '#:fec' to the end of the
-    # dependency line.
-    tr ' ' "$nl" < "$tmpdepfile" \
-      | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' \
-      | tr "$nl" ' ' >> "$depfile"
-    echo >> "$depfile"
-    # The second pass generates a dummy entry for each header file.
-    tr ' ' "$nl" < "$tmpdepfile" \
-      | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \
-      >> "$depfile"
-  else
-    make_dummy_depfile
-  fi
-  rm -f "$tmpdepfile"
-  ;;
-
-xlc)
-  # This case exists only to let depend.m4 do its work.  It works by
-  # looking at the text of this script.  This case will never be run,
-  # since it is checked for above.
-  exit 1
-  ;;
-
-aix)
-  # The C for AIX Compiler uses -M and outputs the dependencies
-  # in a .u file.  In older versions, this file always lives in the
-  # current directory.  Also, the AIX compiler puts '$object:' at the
-  # start of each line; $object doesn't have directory information.
-  # Version 6 uses the directory in both cases.
-  set_dir_from "$object"
-  set_base_from "$object"
-  if test "$libtool" = yes; then
-    tmpdepfile1=$dir$base.u
-    tmpdepfile2=$base.u
-    tmpdepfile3=$dir.libs/$base.u
-    "$@" -Wc,-M
-  else
-    tmpdepfile1=$dir$base.u
-    tmpdepfile2=$dir$base.u
-    tmpdepfile3=$dir$base.u
-    "$@" -M
-  fi
-  stat=$?
-  if test $stat -ne 0; then
-    rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
-    exit $stat
-  fi
-
-  for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
-  do
-    test -f "$tmpdepfile" && break
-  done
-  aix_post_process_depfile
-  ;;
-
-tcc)
-  # tcc (Tiny C Compiler) understand '-MD -MF file' since version 0.9.26
-  # FIXME: That version still under development at the moment of writing.
-  #        Make that this statement remains true also for stable, released
-  #        versions.
-  # It will wrap lines (doesn't matter whether long or short) with a
-  # trailing '\', as in:
-  #
-  #   foo.o : \
-  #    foo.c \
-  #    foo.h \
-  #
-  # It will put a trailing '\' even on the last line, and will use leading
-  # spaces rather than leading tabs (at least since its commit 0394caf7
-  # "Emit spaces for -MD").
-  "$@" -MD -MF "$tmpdepfile"
-  stat=$?
-  if test $stat -ne 0; then
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  rm -f "$depfile"
-  # Each non-empty line is of the form 'foo.o : \' or ' dep.h \'.
-  # We have to change lines of the first kind to '$object: \'.
-  sed -e "s|.*:|$object :|" < "$tmpdepfile" > "$depfile"
-  # And for each line of the second kind, we have to emit a 'dep.h:'
-  # dummy dependency, to avoid the deleted-header problem.
-  sed -n -e 's|^  *\(.*\) *\\$|\1:|p' < "$tmpdepfile" >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-## The order of this option in the case statement is important, since the
-## shell code in configure will try each of these formats in the order
-## listed in this file.  A plain '-MD' option would be understood by many
-## compilers, so we must ensure this comes after the gcc and icc options.
-pgcc)
-  # Portland's C compiler understands '-MD'.
-  # Will always output deps to 'file.d' where file is the root name of the
-  # source file under compilation, even if file resides in a subdirectory.
-  # The object file name does not affect the name of the '.d' file.
-  # pgcc 10.2 will output
-  #    foo.o: sub/foo.c sub/foo.h
-  # and will wrap long lines using '\' :
-  #    foo.o: sub/foo.c ... \
-  #     sub/foo.h ... \
-  #     ...
-  set_dir_from "$object"
-  # Use the source, not the object, to determine the base name, since
-  # that's sadly what pgcc will do too.
-  set_base_from "$source"
-  tmpdepfile=$base.d
-
-  # For projects that build the same source file twice into different object
-  # files, the pgcc approach of using the *source* file root name can cause
-  # problems in parallel builds.  Use a locking strategy to avoid stomping on
-  # the same $tmpdepfile.
-  lockdir=$base.d-lock
-  trap "
-    echo '$0: caught signal, cleaning up...' >&2
-    rmdir '$lockdir'
-    exit 1
-  " 1 2 13 15
-  numtries=100
-  i=$numtries
-  while test $i -gt 0; do
-    # mkdir is a portable test-and-set.
-    if mkdir "$lockdir" 2>/dev/null; then
-      # This process acquired the lock.
-      "$@" -MD
-      stat=$?
-      # Release the lock.
-      rmdir "$lockdir"
-      break
-    else
-      # If the lock is being held by a different process, wait
-      # until the winning process is done or we timeout.
-      while test -d "$lockdir" && test $i -gt 0; do
-        sleep 1
-        i=`expr $i - 1`
-      done
-    fi
-    i=`expr $i - 1`
-  done
-  trap - 1 2 13 15
-  if test $i -le 0; then
-    echo "$0: failed to acquire lock after $numtries attempts" >&2
-    echo "$0: check lockdir '$lockdir'" >&2
-    exit 1
-  fi
-
-  if test $stat -ne 0; then
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  rm -f "$depfile"
-  # Each line is of the form `foo.o: dependent.h',
-  # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'.
-  # Do two passes, one to just change these to
-  # `$object: dependent.h' and one to simply `dependent.h:'.
-  sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile"
-  # Some versions of the HPUX 10.20 sed can't process this invocation
-  # correctly.  Breaking it into two sed invocations is a workaround.
-  sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" \
-    | sed -e 's/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-hp2)
-  # The "hp" stanza above does not work with aCC (C++) and HP's ia64
-  # compilers, which have integrated preprocessors.  The correct option
-  # to use with these is +Maked; it writes dependencies to a file named
-  # 'foo.d', which lands next to the object file, wherever that
-  # happens to be.
-  # Much of this is similar to the tru64 case; see comments there.
-  set_dir_from  "$object"
-  set_base_from "$object"
-  if test "$libtool" = yes; then
-    tmpdepfile1=$dir$base.d
-    tmpdepfile2=$dir.libs/$base.d
-    "$@" -Wc,+Maked
-  else
-    tmpdepfile1=$dir$base.d
-    tmpdepfile2=$dir$base.d
-    "$@" +Maked
-  fi
-  stat=$?
-  if test $stat -ne 0; then
-     rm -f "$tmpdepfile1" "$tmpdepfile2"
-     exit $stat
-  fi
-
-  for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2"
-  do
-    test -f "$tmpdepfile" && break
-  done
-  if test -f "$tmpdepfile"; then
-    sed -e "s,^.*\.[$lower]*:,$object:," "$tmpdepfile" > "$depfile"
-    # Add 'dependent.h:' lines.
-    sed -ne '2,${
-               s/^ *//
-               s/ \\*$//
-               s/$/:/
-               p
-             }' "$tmpdepfile" >> "$depfile"
-  else
-    make_dummy_depfile
-  fi
-  rm -f "$tmpdepfile" "$tmpdepfile2"
-  ;;
-
-tru64)
-  # The Tru64 compiler uses -MD to generate dependencies as a side
-  # effect.  'cc -MD -o foo.o ...' puts the dependencies into 'foo.o.d'.
-  # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put
-  # dependencies in 'foo.d' instead, so we check for that too.
-  # Subdirectories are respected.
-  set_dir_from  "$object"
-  set_base_from "$object"
-
-  if test "$libtool" = yes; then
-    # Libtool generates 2 separate objects for the 2 libraries.  These
-    # two compilations output dependencies in $dir.libs/$base.o.d and
-    # in $dir$base.o.d.  We have to check for both files, because
-    # one of the two compilations can be disabled.  We should prefer
-    # $dir$base.o.d over $dir.libs/$base.o.d because the latter is
-    # automatically cleaned when .libs/ is deleted, while ignoring
-    # the former would cause a distcleancheck panic.
-    tmpdepfile1=$dir$base.o.d          # libtool 1.5
-    tmpdepfile2=$dir.libs/$base.o.d    # Likewise.
-    tmpdepfile3=$dir.libs/$base.d      # Compaq CCC V6.2-504
-    "$@" -Wc,-MD
-  else
-    tmpdepfile1=$dir$base.d
-    tmpdepfile2=$dir$base.d
-    tmpdepfile3=$dir$base.d
-    "$@" -MD
-  fi
-
-  stat=$?
-  if test $stat -ne 0; then
-    rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
-    exit $stat
-  fi
-
-  for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
-  do
-    test -f "$tmpdepfile" && break
-  done
-  # Same post-processing that is required for AIX mode.
-  aix_post_process_depfile
-  ;;
-
-msvc7)
-  if test "$libtool" = yes; then
-    showIncludes=-Wc,-showIncludes
-  else
-    showIncludes=-showIncludes
-  fi
-  "$@" $showIncludes > "$tmpdepfile"
-  stat=$?
-  grep -v '^Note: including file: ' "$tmpdepfile"
-  if test $stat -ne 0; then
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  rm -f "$depfile"
-  echo "$object : \\" > "$depfile"
-  # The first sed program below extracts the file names and escapes
-  # backslashes for cygpath.  The second sed program outputs the file
-  # name when reading, but also accumulates all include files in the
-  # hold buffer in order to output them again at the end.  This only
-  # works with sed implementations that can handle large buffers.
-  sed < "$tmpdepfile" -n '
-/^Note: including file:  *\(.*\)/ {
-  s//\1/
-  s/\\/\\\\/g
-  p
-}' | $cygpath_u | sort -u | sed -n '
-s/ /\\ /g
-s/\(.*\)/'"$tab"'\1 \\/p
-s/.\(.*\) \\/\1:/
-H
-$ {
-  s/.*/'"$tab"'/
-  G
-  p
-}' >> "$depfile"
-  echo >> "$depfile" # make sure the fragment doesn't end with a backslash
-  rm -f "$tmpdepfile"
-  ;;
-
-msvc7msys)
-  # This case exists only to let depend.m4 do its work.  It works by
-  # looking at the text of this script.  This case will never be run,
-  # since it is checked for above.
-  exit 1
-  ;;
-
-#nosideeffect)
-  # This comment above is used by automake to tell side-effect
-  # dependency tracking mechanisms from slower ones.
-
-dashmstdout)
-  # Important note: in order to support this mode, a compiler *must*
-  # always write the preprocessed file to stdout, regardless of -o.
-  "$@" || exit $?
-
-  # Remove the call to Libtool.
-  if test "$libtool" = yes; then
-    while test "X$1" != 'X--mode=compile'; do
-      shift
-    done
-    shift
-  fi
-
-  # Remove '-o $object'.
-  IFS=" "
-  for arg
-  do
-    case $arg in
-    -o)
-      shift
-      ;;
-    $object)
-      shift
-      ;;
-    *)
-      set fnord "$@" "$arg"
-      shift # fnord
-      shift # $arg
-      ;;
-    esac
-  done
-
-  test -z "$dashmflag" && dashmflag=-M
-  # Require at least two characters before searching for ':'
-  # in the target name.  This is to cope with DOS-style filenames:
-  # a dependency such as 'c:/foo/bar' could be seen as target 'c' otherwise.
-  "$@" $dashmflag |
-    sed "s|^[$tab ]*[^:$tab ][^:][^:]*:[$tab ]*|$object: |" > "$tmpdepfile"
-  rm -f "$depfile"
-  cat < "$tmpdepfile" > "$depfile"
-  # Some versions of the HPUX 10.20 sed can't process this sed invocation
-  # correctly.  Breaking it into two sed invocations is a workaround.
-  tr ' ' "$nl" < "$tmpdepfile" \
-    | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \
-    | sed -e 's/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-dashXmstdout)
-  # This case only exists to satisfy depend.m4.  It is never actually
-  # run, as this mode is specially recognized in the preamble.
-  exit 1
-  ;;
-
-makedepend)
-  "$@" || exit $?
-  # Remove any Libtool call
-  if test "$libtool" = yes; then
-    while test "X$1" != 'X--mode=compile'; do
-      shift
-    done
-    shift
-  fi
-  # X makedepend
-  shift
-  cleared=no eat=no
-  for arg
-  do
-    case $cleared in
-    no)
-      set ""; shift
-      cleared=yes ;;
-    esac
-    if test $eat = yes; then
-      eat=no
-      continue
-    fi
-    case "$arg" in
-    -D*|-I*)
-      set fnord "$@" "$arg"; shift ;;
-    # Strip any option that makedepend may not understand.  Remove
-    # the object too, otherwise makedepend will parse it as a source file.
-    -arch)
-      eat=yes ;;
-    -*|$object)
-      ;;
-    *)
-      set fnord "$@" "$arg"; shift ;;
-    esac
-  done
-  obj_suffix=`echo "$object" | sed 's/^.*\././'`
-  touch "$tmpdepfile"
-  ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@"
-  rm -f "$depfile"
-  # makedepend may prepend the VPATH from the source file name to the object.
-  # No need to regex-escape $object, excess matching of '.' is harmless.
-  sed "s|^.*\($object *:\)|\1|" "$tmpdepfile" > "$depfile"
-  # Some versions of the HPUX 10.20 sed can't process the last invocation
-  # correctly.  Breaking it into two sed invocations is a workaround.
-  sed '1,2d' "$tmpdepfile" \
-    | tr ' ' "$nl" \
-    | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \
-    | sed -e 's/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile" "$tmpdepfile".bak
-  ;;
-
-cpp)
-  # Important note: in order to support this mode, a compiler *must*
-  # always write the preprocessed file to stdout.
-  "$@" || exit $?
-
-  # Remove the call to Libtool.
-  if test "$libtool" = yes; then
-    while test "X$1" != 'X--mode=compile'; do
-      shift
-    done
-    shift
-  fi
-
-  # Remove '-o $object'.
-  IFS=" "
-  for arg
-  do
-    case $arg in
-    -o)
-      shift
-      ;;
-    $object)
-      shift
-      ;;
-    *)
-      set fnord "$@" "$arg"
-      shift # fnord
-      shift # $arg
-      ;;
-    esac
-  done
-
-  "$@" -E \
-    | sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
-             -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
-    | sed '$ s: \\$::' > "$tmpdepfile"
-  rm -f "$depfile"
-  echo "$object : \\" > "$depfile"
-  cat < "$tmpdepfile" >> "$depfile"
-  sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-msvisualcpp)
-  # Important note: in order to support this mode, a compiler *must*
-  # always write the preprocessed file to stdout.
-  "$@" || exit $?
-
-  # Remove the call to Libtool.
-  if test "$libtool" = yes; then
-    while test "X$1" != 'X--mode=compile'; do
-      shift
-    done
-    shift
-  fi
-
-  IFS=" "
-  for arg
-  do
-    case "$arg" in
-    -o)
-      shift
-      ;;
-    $object)
-      shift
-      ;;
-    "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI")
-        set fnord "$@"
-        shift
-        shift
-        ;;
-    *)
-        set fnord "$@" "$arg"
-        shift
-        shift
-        ;;
-    esac
-  done
-  "$@" -E 2>/dev/null |
-  sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::\1:p' | $cygpath_u | sort -u > "$tmpdepfile"
-  rm -f "$depfile"
-  echo "$object : \\" > "$depfile"
-  sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::'"$tab"'\1 \\:p' >> "$depfile"
-  echo "$tab" >> "$depfile"
-  sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::\1\::p' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-msvcmsys)
-  # This case exists only to let depend.m4 do its work.  It works by
-  # looking at the text of this script.  This case will never be run,
-  # since it is checked for above.
-  exit 1
-  ;;
-
-none)
-  exec "$@"
-  ;;
-
-*)
-  echo "Unknown depmode $depmode" 1>&2
-  exit 1
-  ;;
-esac
-
-exit 0
-
-# Local Variables:
-# mode: shell-script
-# sh-indentation: 2
-# eval: (add-hook 'write-file-hooks 'time-stamp)
-# time-stamp-start: "scriptversion="
-# time-stamp-format: "%:y-%02m-%02d.%02H"
-# time-stamp-time-zone: "UTC"
-# time-stamp-end: "; # UTC"
-# End:
diff --git a/environ/param.gpu.water b/environ/param.gpu.water
index eed406e7..2982676b 100644
--- a/environ/param.gpu.water
+++ b/environ/param.gpu.water
@@ -25,6 +25,7 @@ cm_solver_pre_comp_type           1             ! method used to compute precond
 cm_solver_pre_comp_refactor       1000          ! number of steps before recomputing preconditioner
 cm_solver_pre_comp_droptol        0.0           ! threshold tolerance for dropping values in preconditioner computation, if applicable
 cm_solver_pre_comp_sweeps         3             ! number of sweeps used to compute preconditioner (ILU_PAR)
+cm_solver_pre_comp_sai_thres      0.1           ! ratio of charge matrix NNZ's used to compute preconditioner (SAI)
 cm_solver_pre_app_type            1             ! method used to apply preconditioner
 cm_solver_pre_app_jacobi_iters    50            ! number of Jacobi iterations used for applying precondition, if applicable
 
diff --git a/sPuReMD/configure.ac b/sPuReMD/configure.ac
index 1a59944f..ce330dc6 100644
--- a/sPuReMD/configure.ac
+++ b/sPuReMD/configure.ac
@@ -65,10 +65,10 @@ if test "x${BUILD_OPENMP}" = "xyes"; then
 	AC_OPENMP
 	if test "x${OPENMP_CFLAGS}" = "x"; then
 		AC_MSG_WARN([
-	  -----------------------------------------------
-	   Unable to find OpenMP support on this system.
-	   Building a single-threaded version.
-	  -----------------------------------------------])
+  -----------------------------------------------
+   Unable to find OpenMP support on this system.
+   Building a single-threaded version.
+  -----------------------------------------------])
 	else
 		# bug due to recent Intel compiler change (?)
 		if test "x${ax_cv_c_compiler_vendor}" = "xintel"; then
@@ -130,6 +130,37 @@ then
 	CFLAGS="${CFLAGS} ${GPROF_FLAGS}"
 fi
 
+# Check for LAPACKE
+AC_CHECK_HEADERS([mkl.h], [MKL_FOUND_HEADERS="yes"])
+if test "x${MKL_FOUND_HEADERS}" = "xyes"
+then
+	AC_SEARCH_LIBS([LAPACKE_dgels], [mkl_intel_ilp64],
+		       [MKL_FOUND_LIBS="yes"], [MKL_FOUND_LIBS="no"],
+		       [-lmkl_sequential -lmkl_core -lpthread -lm -ldl])
+	AS_IF([test "x${MKL_FOUND_LIBS}" != "xyes"],
+	      [AC_MSG_ERROR([Unable to find MKL LAPACKE library.])])
+	LIBS="${LIBS} -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl"
+	AC_DEFINE([HAVE_LAPACKE_MKL], [1], [Define to 1 if you have MKL LAPACKE support enabled.])
+else
+	AC_CHECK_HEADERS([lapacke.h], [LAPACKE_FOUND_HEADERS="yes"])
+	if test "x${LAPACKE_FOUND_HEADERS}" = "xyes"
+	then
+		AC_SEARCH_LIBS([LAPACKE_dgels], [lapacke],
+			       [LAPACKE_FOUND_LIBS="yes"], [LAPACKE_FOUND_LIBS="no"],
+			       [-llapack])
+		AS_IF([test "x${LAPACKE_FOUND_LIBS}" != "xyes"],
+		      [AC_MSG_ERROR([Unable to find LAPACKE library.])])
+		LIBS="${LIBS} -llapack"
+		AC_DEFINE([HAVE_LAPACKE], [1], [Define to 1 if you have LAPACKE support enabled.])
+	else
+		AC_MSG_WARN([
+  -----------------------------------------------
+   Unable to find LAPACKE on this system.
+   Disabling support for dependent methods.
+  -----------------------------------------------])
+	fi
+fi
+
 # Tests using Google C++ testing framework (gtest)
 AC_LANG_PUSH([C++])
 AC_PROG_CXX([icpc g++ clang++ CC])
diff --git a/sPuReMD/src/analyze.c b/sPuReMD/src/analyze.c
index 4e2cbe2c..9fcd54f7 100644
--- a/sPuReMD/src/analyze.c
+++ b/sPuReMD/src/analyze.c
@@ -191,7 +191,8 @@ void Get_Molecule( int atom, molecule *m, int *mark, reax_system *system,
 }
 
 
-void Print_Molecule( reax_system *system, molecule *m, int mode, char *s )
+void Print_Molecule( reax_system *system, molecule *m, int mode, char *s,
+       size_t size )
 {
     int j, atom;
 
@@ -202,7 +203,7 @@ void Print_Molecule( reax_system *system, molecule *m, int mode, char *s )
         /* print molecule summary */
         for ( j = 0; j < MAX_ATOM_TYPES; ++j )
             if ( m->mtypes[j] )
-                sprintf( s, "%s%s%d", s, system->reaxprm.sbp[j].name, m->mtypes[j] );
+                snprintf( s, size, "%s%s%d", s, system->reaxprm.sbp[j].name, m->mtypes[j] );
     }
     else if ( mode == 2 )
     {
@@ -210,7 +211,7 @@ void Print_Molecule( reax_system *system, molecule *m, int mode, char *s )
         for ( j = 0; j < m->atom_count; ++j )
         {
             atom = m->atom_list[j];
-            sprintf( s, "%s%s(%d)",
+            snprintf( s, size, "%s%s(%d)",
                      s, system->reaxprm.sbp[ system->atoms[atom].type ].name, atom );
         }
     }
@@ -288,7 +289,8 @@ void Analyze_Molecules( reax_system *system, control_params *control,
                 fprintf( fout, "old molecules: " );
                 for ( i = 0; i < num_old; ++i )
                 {
-                    Print_Molecule( system, &old_molecules[i], 1, &s[0] );
+                    Print_Molecule( system, &old_molecules[i], 1, &s[0],
+                            MAX_MOLECULE_SIZE * 10 );
                     fprintf( fout, "%s\t", s );
                 }
                 fprintf( fout, "\n" );
@@ -296,7 +298,8 @@ void Analyze_Molecules( reax_system *system, control_params *control,
                 fprintf( fout, "new molecules: " );
                 for ( i = 0; i < num_new; ++i )
                 {
-                    Print_Molecule( system, &new_molecules[i], 1, &s[0] );
+                    Print_Molecule( system, &new_molecules[i], 1, &s[0],
+                            MAX_MOLECULE_SIZE * 10 );
                     fprintf( fout, "%s\t", s );
                 }
                 fprintf( fout, "\n" );
@@ -528,15 +531,15 @@ void Visit_Bonds( int atom, int *mark, int *type, reax_system *system,
 
 
 void Analyze_Fragments( reax_system *system, control_params *control,
-                        simulation_data *data, static_storage *workspace,
-                        reax_list **lists, FILE *fout, int ignore )
+        simulation_data *data, static_storage *workspace,
+        reax_list **lists, FILE *fout, int ignore )
 {
-    int  atom, i, flag;
-    int  *mark = workspace->mark;
-    int  num_fragments, num_fragment_types;
+    int atom, i, flag;
+    int *mark = workspace->mark;
+    int num_fragments, num_fragment_types;
     char fragment[MAX_ATOM_TYPES];
     char fragments[MAX_FRAGMENT_TYPES][MAX_ATOM_TYPES];
-    int  fragment_count[MAX_FRAGMENT_TYPES];
+    int fragment_count[MAX_FRAGMENT_TYPES];
     molecule m;
     reax_list *new_bonds = (*lists) + BONDS;
     //reax_list *old_bonds = (*lists) + OLD_BONDS;
@@ -554,7 +557,7 @@ void Analyze_Fragments( reax_system *system, control_params *control,
             memset( m.mtypes, 0, MAX_ATOM_TYPES * sizeof(int) );
             Visit_Bonds( atom, mark, m.mtypes, system, control, new_bonds, ignore );
             ++num_fragments;
-            Print_Molecule( system, &m, 1, fragment );
+            Print_Molecule( system, &m, 1, fragment, MAX_FRAGMENT_TYPES );
 
             /* check if a similar fragment already exists */
             flag = 0;
diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c
index 5003d28a..0080f73c 100644
--- a/sPuReMD/src/charges.c
+++ b/sPuReMD/src/charges.c
@@ -32,1388 +32,6 @@
 #endif
 
 
-typedef struct
-{
-    unsigned int j;
-    real val;
-} sparse_matrix_entry;
-
-
-#if defined(TEST_MAT)
-static sparse_matrix * create_test_mat( void )
-{
-    unsigned int i, n;
-    sparse_matrix *H_test;
-
-    if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE )
-    {
-        fprintf( stderr, "not enough memory for test matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-    //3x3, SPD, store lower half
-    i = 0;
-    n = 0;
-    H_test->start[n] = i;
-    H_test->j[i] = 0;
-    H_test->val[i] = 4.;
-    ++i;
-    ++n;
-    H_test->start[n] = i;
-    H_test->j[i] = 0;
-    H_test->val[i] = 12.;
-    ++i;
-    H_test->j[i] = 1;
-    H_test->val[i] = 37.;
-    ++i;
-    ++n;
-    H_test->start[n] = i;
-    H_test->j[i] = 0;
-    H_test->val[i] = -16.;
-    ++i;
-    H_test->j[i] = 1;
-    H_test->val[i] = -43.;
-    ++i;
-    H_test->j[i] = 2;
-    H_test->val[i] = 98.;
-    ++i;
-    ++n;
-    H_test->start[n] = i;
-
-    return H_test;
-}
-#endif
-
-
-/* Routine used with qsort for sorting nonzeros within a sparse matrix row
- *
- * v1/v2: pointers to column indices of nonzeros within a row (unsigned int)
- */
-static int compare_matrix_entry(const void *v1, const void *v2)
-{
-    /* larger element has larger column index */
-    return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
-}
-
-
-/* Routine used for sorting nonzeros within a sparse matrix row;
- *  internally, a combination of qsort and manual sorting is utilized
- *  (parallel calls to qsort when multithreading, rows mapped to threads)
- *
- * A: sparse matrix for which to sort nonzeros within a row, stored in CSR format
- */
-static void Sort_Matrix_Rows( sparse_matrix * const A )
-{
-    unsigned int i, j, si, ei;
-    sparse_matrix_entry *temp;
-
-#ifdef _OPENMP
-//    #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr)
-#endif
-    {
-        if ( ( temp = (sparse_matrix_entry *) malloc( A->n * sizeof(sparse_matrix_entry)) ) == NULL )
-        {
-            fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" );
-            exit( INSUFFICIENT_MEMORY );
-        }
-
-        /* sort each row of A using column indices */
-#ifdef _OPENMP
-//        #pragma omp for schedule(guided)
-#endif
-        for ( i = 0; i < A->n; ++i )
-        {
-            si = A->start[i];
-            ei = A->start[i + 1];
-
-            for ( j = 0; j < (ei - si); ++j )
-            {
-                (temp + j)->j = A->j[si + j];
-                (temp + j)->val = A->val[si + j];
-            }
-
-            /* polymorphic sort in standard C library using column indices */
-            qsort( temp, ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry );
-
-            for ( j = 0; j < (ei - si); ++j )
-            {
-                A->j[si + j] = (temp + j)->j;
-                A->val[si + j] = (temp + j)->val;
-            }
-        }
-
-        sfree( temp, "Sort_Matrix_Rows::temp" );
-    }
-}
-
-
-static void Calculate_Droptol( const sparse_matrix * const A,
-        real * const droptol, const real dtol )
-{
-    int i, j, k;
-    real val;
-#ifdef _OPENMP
-    static real *droptol_local;
-    unsigned int tid;
-#endif
-
-#ifdef _OPENMP
-    #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr)
-#endif
-    {
-#ifdef _OPENMP
-        tid = omp_get_thread_num();
-
-        #pragma omp master
-        {
-            /* keep b_local for program duration to avoid allocate/free
-             * overhead per Sparse_MatVec call*/
-            if ( droptol_local == NULL )
-            {
-                if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL )
-                {
-                    fprintf( stderr, "Not enough space for droptol. Terminating...\n" );
-                    exit( INSUFFICIENT_MEMORY );
-                }
-            }
-        }
-
-        #pragma omp barrier
-#endif
-
-        /* init droptol to 0 */
-        for ( i = 0; i < A->n; ++i )
-        {
-#ifdef _OPENMP
-            droptol_local[tid * A->n + i] = 0.0;
-#else
-            droptol[i] = 0.0;
-#endif
-        }
-
-#ifdef _OPENMP
-        #pragma omp barrier
-#endif
-
-        /* calculate sqaure of the norm of each row */
-#ifdef _OPENMP
-        #pragma omp for schedule(static)
-#endif
-        for ( i = 0; i < A->n; ++i )
-        {
-            for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
-            {
-                j = A->j[k];
-                val = A->val[k];
-
-#ifdef _OPENMP
-                droptol_local[tid * A->n + i] += val * val;
-                droptol_local[tid * A->n + j] += val * val;
-#else
-                droptol[i] += val * val;
-                droptol[j] += val * val;
-#endif
-            }
-
-            // diagonal entry
-            val = A->val[k];
-#ifdef _OPENMP
-            droptol_local[tid * A->n + i] += val * val;
-#else
-            droptol[i] += val * val;
-#endif
-        }
-
-#ifdef _OPENMP
-        #pragma omp barrier
-
-        #pragma omp for schedule(static)
-        for ( i = 0; i < A->n; ++i )
-        {
-            droptol[i] = 0.0;
-            for ( k = 0; k < omp_get_num_threads(); ++k )
-            {
-                droptol[i] += droptol_local[k * A->n + i];
-            }
-        }
-
-        #pragma omp barrier
-#endif
-
-        /* calculate local droptol for each row */
-        //fprintf( stderr, "droptol: " );
-#ifdef _OPENMP
-        #pragma omp for schedule(static)
-#endif
-        for ( i = 0; i < A->n; ++i )
-        {
-            //fprintf( stderr, "%f-->", droptol[i] );
-            droptol[i] = SQRT( droptol[i] ) * dtol;
-            //fprintf( stderr, "%f  ", droptol[i] );
-        }
-        //fprintf( stderr, "\n" );
-    }
-}
-
-
-static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol )
-{
-    int i, pj;
-    int fillin;
-    real val;
-
-    fillin = 0;
-
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) private(i, pj, val) reduction(+: fillin)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
-        {
-            val = A->val[pj];
-
-            if ( FABS(val) > droptol[i] )
-            {
-                ++fillin;
-            }
-        }
-    }
-
-    return fillin + A->n;
-}
-
-
-#if defined(HAVE_SUPERLU_MT)
-static real SuperLU_Factorize( const sparse_matrix * const A,
-        sparse_matrix * const L, sparse_matrix * const U )
-{
-    unsigned int i, pj, count, *Ltop, *Utop, r;
-    sparse_matrix *A_t;
-    SuperMatrix A_S, AC_S, L_S, U_S;
-    NCformat *A_S_store;
-    SCPformat *L_S_store;
-    NCPformat *U_S_store;
-    superlumt_options_t superlumt_options;
-    pxgstrf_shared_t pxgstrf_shared;
-    pdgstrf_threadarg_t *pdgstrf_threadarg;
-    int_t nprocs;
-    fact_t fact;
-    trans_t trans;
-    yes_no_t refact, usepr;
-    real u, drop_tol;
-    real *a, *at;
-    int_t *asub, *atsub, *xa, *xat;
-    int_t *perm_c; /* column permutation vector */
-    int_t *perm_r; /* row permutations from partial pivoting */
-    void *work;
-    int_t info, lwork;
-    int_t permc_spec, panel_size, relax;
-    Gstat_t Gstat;
-    flops_t flopcnt;
-
-    /* Default parameters to control factorization. */
-#ifdef _OPENMP
-    //TODO: set as global parameter and use
-    #pragma omp parallel \
-        default(none) shared(nprocs)
-    {
-        #pragma omp master
-        {
-            /* SuperLU_MT spawns threads internally, so set and pass parameter */
-            nprocs = omp_get_num_threads();
-        }
-    }
-#else
-    nprocs = 1;
-#endif
-
-//    fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */
-    fact = DOFACT; /* factor from scratch */
-    trans = NOTRANS;
-    refact = NO; /* first time factorization */
-    //TODO: add to control file and use the value there to set these
-    panel_size = sp_ienv(1); /* # consec. cols treated as unit task */
-    relax = sp_ienv(2); /* # cols grouped as relaxed supernode */
-    u = 1.0; /* diagonal pivoting threshold */
-    usepr = NO;
-    drop_tol = 0.0;
-    work = NULL;
-    lwork = 0;
-
-//#if defined(DEBUG)
-    fprintf( stderr, "nprocs = %d\n", nprocs );
-    fprintf( stderr, "Panel size = %d\n", panel_size );
-    fprintf( stderr, "Relax = %d\n", relax );
-//#endif
-
-    if ( !(perm_r = intMalloc(A->n)) )
-    {
-        SUPERLU_ABORT("Malloc fails for perm_r[].");
-    }
-    if ( !(perm_c = intMalloc(A->n)) )
-    {
-        SUPERLU_ABORT("Malloc fails for perm_c[].");
-    }
-    if ( !(superlumt_options.etree = intMalloc(A->n)) )
-    {
-        SUPERLU_ABORT("Malloc fails for etree[].");
-    }
-    if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) )
-    {
-        SUPERLU_ABORT("Malloc fails for colcnt_h[].");
-    }
-    if ( !(superlumt_options.part_super_h = intMalloc(A->n)) )
-    {
-        SUPERLU_ABORT("Malloc fails for part_super__h[].");
-    }
-    if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL )
-            || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL )
-            || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL )
-            || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL )
-            || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) )
-    {
-        fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
-    {
-        fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-    /* Set up the sparse matrix data structure for A. */
-    Transpose( A, A_t );
-
-    count = 0;
-    for ( i = 0; i < A->n; ++i )
-    {
-        xa[i] = count;
-        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
-        {
-            a[count] = A->entries[pj].val;
-            asub[count] = A->entries[pj].j;
-            ++count;
-        }
-        for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
-        {
-            a[count] = A_t->entries[pj].val;
-            asub[count] = A_t->entries[pj].j;
-            ++count;
-        }
-    }
-    xa[i] = count;
-
-    dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa,
-                         &at, &atsub, &xat );
-
-    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
-        fprintf( stderr, "%6d", asub[i] );
-    fprintf( stderr, "\n" );
-    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
-        fprintf( stderr, "%6.1f", a[i] );
-    fprintf( stderr, "\n" );
-    for ( i = 0; i <= A->n; ++i )
-        fprintf( stderr, "%6d", xa[i] );
-    fprintf( stderr, "\n" );
-    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
-        fprintf( stderr, "%6d", atsub[i] );
-    fprintf( stderr, "\n" );
-    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
-        fprintf( stderr, "%6.1f", at[i] );
-    fprintf( stderr, "\n" );
-    for ( i = 0; i <= A->n; ++i )
-        fprintf( stderr, "%6d", xat[i] );
-    fprintf( stderr, "\n" );
-
-    A_S.Stype = SLU_NC; /* column-wise, no supernode */
-    A_S.Dtype = SLU_D; /* double-precision */
-    A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */
-    A_S.nrow = A->n;
-    A_S.ncol = A->n;
-    A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
-    A_S_store = (NCformat *) A_S.Store;
-    A_S_store->nnz = 2 * A->start[A->n] - A->n;
-    A_S_store->nzval = at;
-    A_S_store->rowind = atsub;
-    A_S_store->colptr = xat;
-
-    /* ------------------------------------------------------------
-       Allocate storage and initialize statistics variables.
-       ------------------------------------------------------------*/
-    StatAlloc( A->n, nprocs, panel_size, relax, &Gstat );
-    StatInit( A->n, nprocs, &Gstat );
-
-    /* ------------------------------------------------------------
-       Get column permutation vector perm_c[], according to permc_spec:
-       permc_spec = 0: natural ordering
-       permc_spec = 1: minimum degree ordering on structure of A'*A
-       permc_spec = 2: minimum degree ordering on structure of A'+A
-       permc_spec = 3: approximate minimum degree for unsymmetric matrices
-       ------------------------------------------------------------*/
-    permc_spec = 0;
-    get_perm_c( permc_spec, &A_S, perm_c );
-
-    /* ------------------------------------------------------------
-       Initialize the option structure superlumt_options using the
-       user-input parameters;
-       Apply perm_c to the columns of original A to form AC.
-       ------------------------------------------------------------*/
-    pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax,
-                  u, usepr, drop_tol, perm_c, perm_r,
-                  work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat );
-
-    for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i )
-        fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] );
-    fprintf( stderr, "\n" );
-
-    /* ------------------------------------------------------------
-       Compute the LU factorization of A.
-       The following routine will create nprocs threads.
-       ------------------------------------------------------------*/
-    pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info );
-
-    fprintf( stderr, "INFO: %d\n", info );
-
-    flopcnt = 0;
-    for (i = 0; i < nprocs; ++i)
-    {
-        flopcnt += Gstat.procstat[i].fcops;
-    }
-    Gstat.ops[FACT] = flopcnt;
-
-//#if defined(DEBUG)
-    printf("\n** Result of sparse LU **\n");
-    L_S_store = (SCPformat *) L_S.Store;
-    U_S_store = (NCPformat *) U_S.Store;
-    printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz );
-    printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz );
-    fflush( stdout );
-//#endif
-
-    /* convert L and R from SuperLU formats to CSR */
-    memset( Ltop, 0, (A->n + 1) * sizeof(int) );
-    memset( Utop, 0, (A->n + 1) * sizeof(int) );
-    memset( L->start, 0, (A->n + 1) * sizeof(int) );
-    memset( U->start, 0, (A->n + 1) * sizeof(int) );
-
-    for ( i = 0; i < 2 * L_S_store->nnz; ++i )
-        fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] );
-    fprintf( stderr, "\n" );
-    for ( i = 0; i < 2 * U_S_store->nnz; ++i )
-        fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] );
-    fprintf( stderr, "\n" );
-
-    printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper );
-    for ( i = 0; i < A->n; ++i )
-    {
-        fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] );
-        fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] );
-        //TODO: correct for SCPformat for L?
-        //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
-//        for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj )
-//        {
-//            ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1];
-//        }
-        fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] );
-        fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] );
-        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
-        {
-            ++Utop[U_S_store->rowind[pj] + 1];
-            fprintf( stderr, "Utop[%5d]     = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] );
-        }
-    }
-    for ( i = 1; i <= A->n; ++i )
-    {
-//        Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
-        Utop[i] = U->start[i] = Utop[i] + Utop[i - 1];
-//        fprintf( stderr, "Utop[%5d]     = %d\n", i, Utop[i] );
-//        fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] );
-    }
-    for ( i = 0; i < A->n; ++i )
-    {
-//        for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj )
-//        {
-//            r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj];
-//            L->entries[Ltop[r]].j = r;
-//            L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj];
-//            ++Ltop[r];
-//        }
-        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
-        {
-            r = U_S_store->rowind[pj];
-            U->entries[Utop[r]].j = i;
-            U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj];
-            ++Utop[r];
-        }
-    }
-
-    /* ------------------------------------------------------------
-      Deallocate storage after factorization.
-      ------------------------------------------------------------*/
-    pxgstrf_finalize( &superlumt_options, &AC_S );
-    Deallocate_Matrix( A_t );
-    sfree( xa, "SuperLU_Factorize::xa" );
-    sfree( asub, "SuperLU_Factorize::asub" );
-    sfree( a, "SuperLU_Factorize::a" );
-    SUPERLU_FREE( perm_r );
-    SUPERLU_FREE( perm_c );
-    SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind );
-    SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr );
-    SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval );
-    SUPERLU_FREE( A_S.Store );
-    if ( lwork == 0 )
-    {
-        Destroy_SuperNode_SCP(&L_S);
-        Destroy_CompCol_NCP(&U_S);
-    }
-    else if ( lwork > 0 )
-    {
-        SUPERLU_FREE(work);
-    }
-    StatFree(&Gstat);
-
-    sfree( Utop, "SuperLU_Factorize::Utop" );
-    sfree( Ltop, "SuperLU_Factorize::Ltop" );
-
-    //TODO: return iters
-    return 0.;
-}
-#endif
-
-
-/* Diagonal (Jacobi) preconditioner computation */
-static real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv )
-{
-    unsigned int i;
-    real start;
-
-    start = Get_Time( );
-
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) private(i)
-#endif
-    for ( i = 0; i < H->n; ++i )
-    {
-        if ( H->val[H->start[i + 1] - 1] != 0.0 )
-        {
-            Hdia_inv[i] = 1.0 / H->val[H->start[i + 1] - 1];
-        }
-        else
-        {
-            Hdia_inv[i] = 1.0;
-        }
-    }
-
-    return Get_Timing_Info( start );
-}
-
-
-/* Incomplete Cholesky factorization with dual thresholding */
-static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
-        sparse_matrix * const L, sparse_matrix * const U )
-{
-    int *tmp_j;
-    real *tmp_val;
-    int i, j, pj, k1, k2, tmptop, Ltop;
-    real val, start;
-    unsigned int *Utop;
-
-    start = Get_Time( );
-
-    if ( ( Utop = (unsigned int*) malloc((A->n + 1) * sizeof(unsigned int)) ) == NULL ||
-            ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL ||
-            ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL )
-    {
-        fprintf( stderr, "[ICHOLT] Not enough memory for preconditioning matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-    // clear variables
-    Ltop = 0;
-    tmptop = 0;
-    memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) );
-    memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
-    memset( Utop, 0, A->n * sizeof(unsigned int) );
-
-    for ( i = 0; i < A->n; ++i )
-    {
-        L->start[i] = Ltop;
-        tmptop = 0;
-
-        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
-        {
-            j = A->j[pj];
-            val = A->val[pj];
-
-            if ( FABS(val) > droptol[i] )
-            {
-                k1 = 0;
-                k2 = L->start[j];
-                while ( k1 < tmptop && k2 < L->start[j + 1] )
-                {
-                    if ( tmp_j[k1] < L->j[k2] )
-                    {
-                        ++k1;
-                    }
-                    else if ( tmp_j[k1] > L->j[k2] )
-                    {
-                        ++k2;
-                    }
-                    else
-                    {
-                        val -= (tmp_val[k1++] * L->val[k2++]);
-                    }
-                }
-
-                // L matrix is lower triangular,
-                // so right before the start of next row comes jth diagonal
-                val /= L->val[L->start[j + 1] - 1];
-
-                tmp_j[tmptop] = j;
-                tmp_val[tmptop] = val;
-                ++tmptop;
-            }
-        }
-
-        // sanity check
-        if ( A->j[pj] != i )
-        {
-            fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i );
-            exit( NUMERIC_BREAKDOWN );
-        }
-
-        // compute the ith diagonal in L
-        val = A->val[pj];
-        for ( k1 = 0; k1 < tmptop; ++k1 )
-        {
-            val -= (tmp_val[k1] * tmp_val[k1]);
-        }
-
-#if defined(DEBUG)
-        if ( val < 0.0 )
-        {
-            fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i );
-            exit( NUMERIC_BREAKDOWN );
-
-        }
-#endif
-
-        tmp_j[tmptop] = i;
-        tmp_val[tmptop] = SQRT( val );
-
-        // apply the dropping rule once again
-        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
-        //for( k1 = 0; k1<= tmptop; ++k1 )
-        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
-        //fprintf( stderr, "\n" );
-        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
-        for ( k1 = 0; k1 < tmptop; ++k1 )
-        {
-            if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] )
-            {
-                L->j[Ltop] = tmp_j[k1];
-                L->val[Ltop] = tmp_val[k1];
-                U->start[tmp_j[k1] + 1]++;
-                ++Ltop;
-                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
-            }
-        }
-        // keep the diagonal in any case
-        L->j[Ltop] = tmp_j[k1];
-        L->val[Ltop] = tmp_val[k1];
-        ++Ltop;
-        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
-    }
-
-    L->start[i] = Ltop;
-//    fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
-
-    /* U = L^T (Cholesky factorization) */
-    Transpose( L, U );
-//    for ( i = 1; i <= U->n; ++i )
-//    {
-//        Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
-//    }
-//    for ( i = 0; i < L->n; ++i )
-//    {
-//        for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
-//        {
-//            j = L->j[pj];
-//            U->j[Utop[j]] = i;
-//            U->val[Utop[j]] = L->val[pj];
-//            Utop[j]++;
-//        }
-//    }
-
-//    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
-
-    sfree( tmp_val, "ICHOLT::tmp_val" );
-    sfree( tmp_j, "ICHOLT::tmp_j" );
-    sfree( Utop, "ICHOLT::Utop" );
-
-    return Get_Timing_Info( start );
-}
-
-
-/* Fine-grained (parallel) incomplete Cholesky factorization
- *
- * Reference:
- * Edmond Chow and Aftab Patel
- * Fine-Grained Parallel Incomplete LU Factorization
- * SIAM J. Sci. Comp. */
-#if defined(TESTING)
-static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
-        sparse_matrix * const U_t, sparse_matrix * const U )
-{
-    unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
-    real *D, *D_inv, sum, start;
-    sparse_matrix *DAD;
-    int *Utop;
-
-    start = Get_Time( );
-
-    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
-            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
-            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
-            ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL )
-    {
-        fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(D_inv, D) private(i)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
-        D[i] = 1. / D_inv[i];
-    }
-
-    memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) );
-    memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) );
-
-    /* to get convergence, A must have unit diagonal, so apply
-     * transformation DAD, where D = D(1./SQRT(D(A))) */
-    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(guided) \
-        default(none) shared(DAD, D_inv, D) private(i, pj)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        /* non-diagonals */
-        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
-        {
-            DAD->j[pj] = A->j[pj];
-            DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]];
-        }
-        /* diagonal */
-        DAD->j[pj] = A->j[pj];
-        DAD->val[pj] = 1.;
-    }
-
-    /* initial guesses for U^T,
-     * assume: A and DAD symmetric and stored lower triangular */
-    memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
-    memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) );
-    memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) );
-
-    for ( i = 0; i < sweeps; ++i )
-    {
-        /* for each nonzero */
-#ifdef _OPENMP
-        #pragma omp parallel for schedule(static) \
-            default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
-#endif
-        for ( j = 0; j < A->start[A->n]; ++j )
-        {
-            sum = ZERO;
-
-            /* determine row bounds of current nonzero */
-            x = 0;
-            ei_x = 0;
-            for ( k = 0; k <= A->n; ++k )
-            {
-                if ( U_t->start[k] > j )
-                {
-                    x = U_t->start[k - 1];
-                    ei_x = U_t->start[k];
-                    break;
-                }
-            }
-            /* column bounds of current nonzero */
-            y = U_t->start[U_t->j[j]];
-            ei_y = U_t->start[U_t->j[j] + 1];
-
-            /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
-            while ( U_t->j[x] < U_t->j[j] &&
-                    U_t->j[y] < U_t->j[j] &&
-                    x < ei_x && y < ei_y )
-            {
-                if ( U_t->j[x] == U_t->j[y] )
-                {
-                    sum += (U_t->val[x] * U_t->val[y]);
-                    ++x;
-                    ++y;
-                }
-                else if ( U_t->j[x] < U_t->j[y] )
-                {
-                    ++x;
-                }
-                else
-                {
-                    ++y;
-                }
-            }
-
-            sum = DAD->val[j] - sum;
-
-            /* diagonal entries */
-            if ( (k - 1) == U_t->j[j] )
-            {
-                /* sanity check */
-                if ( sum < ZERO )
-                {
-                    fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n");
-#if defined(DEBUG_FOCUS)
-                    fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
-                             k - 1, A->entries[j].j, A->entries[j].val );
-                    fprintf( stderr, "sum = %10.3f\n", sum);
-#endif
-                    exit(NUMERIC_BREAKDOWN);
-                }
-
-                U_t->val[j] = SQRT( sum );
-            }
-            /* non-diagonal entries */
-            else
-            {
-                U_t->val[j] = sum / U_t->val[ei_y - 1];
-            }
-        }
-    }
-
-    /* apply inverse transformation D^{-1}U^{T},
-     * since DAD \approx U^{T}U, so
-     * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(guided) \
-        default(none) shared(D_inv) private(i, pj)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
-        {
-            U_t->val[pj] *= D_inv[i];
-        }
-    }
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
-#endif
-
-    /* transpose U^{T} and copy into U */
-    Transpose( U_t, U );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
-#endif
-
-    Deallocate_Matrix( DAD );
-    sfree( D_inv, "ICHOL_PAR::D_inv" );
-    sfree( D, "ICHOL_PAR::D" );
-    sfree( Utop, "ICHOL_PAR::Utop" );
-
-    return Get_Timing_Info( start );
-}
-#endif
-
-
-/* Fine-grained (parallel) incomplete LU factorization
- *
- * Reference:
- * Edmond Chow and Aftab Patel
- * Fine-Grained Parallel Incomplete LU Factorization
- * SIAM J. Sci. Comp.
- *
- * A: symmetric, half-stored (lower triangular), CSR format
- * sweeps: number of loops over non-zeros for computation
- * L / U: factorized triangular matrices (A \approx LU), CSR format */
-static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
-        sparse_matrix * const L, sparse_matrix * const U )
-{
-    unsigned int i, j, k, pj, x, y, ei_x, ei_y;
-    real *D, *D_inv, sum, start;
-    sparse_matrix *DAD;
-
-    start = Get_Time( );
-
-    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
-            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
-            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
-    {
-        fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(D, D_inv) private(i)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
-        D[i] = 1.0 / D_inv[i];
-//        printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] );
-    }
-
-    /* to get convergence, A must have unit diagonal, so apply
-     * transformation DAD, where D = D(1./SQRT(abs(D(A)))) */
-    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(DAD, D) private(i, pj)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        /* non-diagonals */
-        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
-        {
-            DAD->j[pj] = A->j[pj];
-            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
-        }
-        /* diagonal */
-        DAD->j[pj] = A->j[pj];
-        DAD->val[pj] = 1.0;
-    }
-
-    /* initial guesses for L and U,
-     * assume: A and DAD symmetric and stored lower triangular */
-    memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
-    memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
-    memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
-    /* store U^T in CSR for row-wise access and tranpose later */
-    memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) );
-    memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
-    memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
-
-    /* L has unit diagonal, by convention */
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) default(none) private(i)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        L->val[L->start[i + 1] - 1] = 1.0;
-    }
-
-    for ( i = 0; i < sweeps; ++i )
-    {
-        /* for each nonzero in L */
-#ifdef _OPENMP
-        #pragma omp parallel for schedule(static) \
-            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
-#endif
-        for ( j = 0; j < DAD->start[DAD->n]; ++j )
-        {
-            sum = ZERO;
-
-            /* determine row bounds of current nonzero */
-            x = 0;
-            ei_x = 0;
-            for ( k = 1; k <= DAD->n; ++k )
-            {
-                if ( DAD->start[k] > j )
-                {
-                    x = DAD->start[k - 1];
-                    ei_x = DAD->start[k];
-                    break;
-                }
-            }
-            /* determine column bounds of current nonzero */
-            y = DAD->start[DAD->j[j]];
-            ei_y = DAD->start[DAD->j[j] + 1];
-
-            /* sparse dot product:
-             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
-            while ( L->j[x] < L->j[j] &&
-                    L->j[y] < L->j[j] &&
-                    x < ei_x && y < ei_y )
-            {
-                if ( L->j[x] == L->j[y] )
-                {
-                    sum += (L->val[x] * U->val[y]);
-                    ++x;
-                    ++y;
-                }
-                else if ( L->j[x] < L->j[y] )
-                {
-                    ++x;
-                }
-                else
-                {
-                    ++y;
-                }
-            }
-
-            if ( j != ei_x - 1 )
-            {
-                L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1];
-            }
-        }
-
-#ifdef _OPENMP
-        #pragma omp parallel for schedule(static) \
-            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
-#endif
-        for ( j = 0; j < DAD->start[DAD->n]; ++j )
-        {
-            sum = ZERO;
-
-            /* determine row bounds of current nonzero */
-            x = 0;
-            ei_x = 0;
-            for ( k = 1; k <= DAD->n; ++k )
-            {
-                if ( DAD->start[k] > j )
-                {
-                    x = DAD->start[k - 1];
-                    ei_x = DAD->start[k];
-                    break;
-                }
-            }
-            /* determine column bounds of current nonzero */
-            y = DAD->start[DAD->j[j]];
-            ei_y = DAD->start[DAD->j[j] + 1];
-
-            /* sparse dot product:
-             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
-            while ( U->j[x] < U->j[j] &&
-                    U->j[y] < U->j[j] &&
-                    x < ei_x && y < ei_y )
-            {
-                if ( U->j[x] == U->j[y] )
-                {
-                    sum += (L->val[y] * U->val[x]);
-                    ++x;
-                    ++y;
-                }
-                else if ( U->j[x] < U->j[y] )
-                {
-                    ++x;
-                }
-                else
-                {
-                    ++y;
-                }
-            }
-
-            U->val[j] = DAD->val[j] - sum;
-        }
-    }
-
-    /* apply inverse transformation:
-     * since DAD \approx LU, then
-     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(DAD, D_inv) private(i, pj)
-#endif
-    for ( i = 0; i < DAD->n; ++i )
-    {
-        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
-        {
-            L->val[pj] = D_inv[i] * L->val[pj];
-            /* currently storing U^T, so use row index instead of column index */
-            U->val[pj] = U->val[pj] * D_inv[i];
-        }
-    }
-
-    Transpose_I( U );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 );
-    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
-#endif
-
-    Deallocate_Matrix( DAD );
-    sfree( D_inv, "ILU_PAR::D_inv" );
-    sfree( D, "ILU_PAR::D_inv" );
-
-    return Get_Timing_Info( start );
-}
-
-
-/* Fine-grained (parallel) incomplete LU factorization with thresholding
- *
- * Reference:
- * Edmond Chow and Aftab Patel
- * Fine-Grained Parallel Incomplete LU Factorization
- * SIAM J. Sci. Comp.
- *
- * A: symmetric, half-stored (lower triangular), CSR format
- * droptol: row-wise tolerances used for dropping
- * sweeps: number of loops over non-zeros for computation
- * L / U: factorized triangular matrices (A \approx LU), CSR format */
-static real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
-                      const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U )
-{
-    unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop;
-    real *D, *D_inv, sum, start;
-    sparse_matrix *DAD, *L_temp, *U_temp;
-
-    start = Get_Time( );
-
-    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
-            Allocate_Matrix( &L_temp, A->n, A->m ) == FAILURE ||
-            Allocate_Matrix( &U_temp, A->n, A->m ) == FAILURE )
-    {
-        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-    if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
-            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
-    {
-        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(D, D_inv) private(i)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
-        D[i] = 1.0 / D_inv[i];
-    }
-
-    /* to get convergence, A must have unit diagonal, so apply
-     * transformation DAD, where D = D(1./SQRT(D(A))) */
-    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(DAD, D) private(i, pj)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        /* non-diagonals */
-        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
-        {
-            DAD->j[pj] = A->j[pj];
-            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
-        }
-        /* diagonal */
-        DAD->j[pj] = A->j[pj];
-        DAD->val[pj] = 1.0;
-    }
-
-    /* initial guesses for L and U,
-     * assume: A and DAD symmetric and stored lower triangular */
-    memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
-    memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
-    memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
-    /* store U^T in CSR for row-wise access and tranpose later */
-    memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
-    memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
-    memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
-
-    /* L has unit diagonal, by convention */
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) private(i) shared(L_temp)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        L_temp->val[L_temp->start[i + 1] - 1] = 1.0;
-    }
-
-    for ( i = 0; i < sweeps; ++i )
-    {
-        /* for each nonzero in L */
-#ifdef _OPENMP
-        #pragma omp parallel for schedule(static) \
-            default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
-#endif
-        for ( j = 0; j < DAD->start[DAD->n]; ++j )
-        {
-            sum = ZERO;
-
-            /* determine row bounds of current nonzero */
-            x = 0;
-            ei_x = 0;
-            for ( k = 1; k <= DAD->n; ++k )
-            {
-                if ( DAD->start[k] > j )
-                {
-                    x = DAD->start[k - 1];
-                    ei_x = DAD->start[k];
-                    break;
-                }
-            }
-            /* determine column bounds of current nonzero */
-            y = DAD->start[DAD->j[j]];
-            ei_y = DAD->start[DAD->j[j] + 1];
-
-            /* sparse dot product:
-             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
-            while ( L_temp->j[x] < L_temp->j[j] &&
-                    L_temp->j[y] < L_temp->j[j] &&
-                    x < ei_x && y < ei_y )
-            {
-                if ( L_temp->j[x] == L_temp->j[y] )
-                {
-                    sum += (L_temp->val[x] * U_temp->val[y]);
-                    ++x;
-                    ++y;
-                }
-                else if ( L_temp->j[x] < L_temp->j[y] )
-                {
-                    ++x;
-                }
-                else
-                {
-                    ++y;
-                }
-            }
-
-            if ( j != ei_x - 1 )
-            {
-                L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1];
-            }
-        }
-
-#ifdef _OPENMP
-        #pragma omp parallel for schedule(static) \
-            default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
-#endif
-        for ( j = 0; j < DAD->start[DAD->n]; ++j )
-        {
-            sum = ZERO;
-
-            /* determine row bounds of current nonzero */
-            x = 0;
-            ei_x = 0;
-            for ( k = 1; k <= DAD->n; ++k )
-            {
-                if ( DAD->start[k] > j )
-                {
-                    x = DAD->start[k - 1];
-                    ei_x = DAD->start[k];
-                    break;
-                }
-            }
-            /* determine column bounds of current nonzero */
-            y = DAD->start[DAD->j[j]];
-            ei_y = DAD->start[DAD->j[j] + 1];
-
-            /* sparse dot product:
-             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
-            while ( U_temp->j[x] < U_temp->j[j] &&
-                    U_temp->j[y] < U_temp->j[j] &&
-                    x < ei_x && y < ei_y )
-            {
-                if ( U_temp->j[x] == U_temp->j[y] )
-                {
-                    sum += (L_temp->val[y] * U_temp->val[x]);
-                    ++x;
-                    ++y;
-                }
-                else if ( U_temp->j[x] < U_temp->j[y] )
-                {
-                    ++x;
-                }
-                else
-                {
-                    ++y;
-                }
-            }
-
-            U_temp->val[j] = DAD->val[j] - sum;
-        }
-    }
-
-    /* apply inverse transformation:
-     * since DAD \approx LU, then
-     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj)
-#endif
-    for ( i = 0; i < DAD->n; ++i )
-    {
-        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
-        {
-            L_temp->val[pj] = D_inv[i] * L_temp->val[pj];
-            /* currently storing U^T, so use row index instead of column index */
-            U_temp->val[pj] = U_temp->val[pj] * D_inv[i];
-        }
-    }
-
-    /* apply the dropping rule */
-    Ltop = 0;
-    Utop = 0;
-    for ( i = 0; i < DAD->n; ++i )
-    {
-        L->start[i] = Ltop;
-        U->start[i] = Utop;
-
-        for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj )
-        {
-            if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) )
-            {
-                L->j[Ltop] = L_temp->j[pj];
-                L->val[Ltop] = L_temp->val[pj];
-                ++Ltop;
-            }
-        }
-
-        /* diagonal */
-        L->j[Ltop] = L_temp->j[pj];
-        L->val[Ltop] = L_temp->val[pj];
-        ++Ltop;
-
-        for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj )
-        {
-            if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) )
-            {
-                U->j[Utop] = U_temp->j[pj];
-                U->val[Utop] = U_temp->val[pj];
-                ++Utop;
-            }
-        }
-
-        /* diagonal */
-        U->j[Utop] = U_temp->j[pj];
-        U->val[Utop] = U_temp->val[pj];
-        ++Utop;
-    }
-
-    L->start[i] = Ltop;
-    U->start[i] = Utop;
-
-    Transpose_I( U );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "nnz(L): %d\n", L->start[L->n] );
-    fprintf( stderr, "nnz(U): %d\n", U->start[U->n] );
-#endif
-
-    Deallocate_Matrix( U_temp );
-    Deallocate_Matrix( L_temp );
-    Deallocate_Matrix( DAD );
-    sfree( D_inv, "ILUT_PAR::D_inv" );
-    sfree( D, "ILUT_PAR::D_inv" );
-
-    return Get_Timing_Info( start );
-}
-
-
 static void Extrapolate_Charges_QEq( const reax_system * const system,
         const control_params * const control,
         simulation_data * const data, static_storage * const workspace )
@@ -1584,6 +202,17 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
 #endif
             break;
 
+        case SAI_PC:
+#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
+            data->timing.cm_solver_pre_comp +=
+                Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt,
+                        &workspace->H_app_inv );
+#else
+            fprintf( stderr, "LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
+            exit( INVALID_INPUT );
+#endif
+            break;
+
         default:
             fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -1591,18 +220,22 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
     }
 
 #if defined(DEBUG)
+#define SIZE (1000)
+    char fname[SIZE];
+
     if ( control->cm_solver_pre_comp_type != NONE_PC && 
             control->cm_solver_pre_comp_type != DIAG_PC )
     {
         fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) );
 
 #if defined(DEBUG_FOCUS)
-        sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+        snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step );
         Print_Sparse_Matrix2( workspace->L, fname, NULL );
-        sprintf( fname, "%s.U%d.out", control->sim_name, data->step );
+        snprintf( fname, SIZE + 10, "%s.U%d.out", control->sim_name, data->step );
         Print_Sparse_Matrix2( workspace->U, fname, NULL );
 #endif
     }
+#undef SIZE
 #endif
 }
 
@@ -1831,22 +464,26 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
 //    }
 //
 //#if defined(DEBUG)
+//#define SIZE (1000)
+//    char fname[SIZE];
+//
 //    if ( control->cm_solver_pre_comp_type != DIAG_PC )
 //    {
 //        fprintf( stderr, "condest = %f\n", condest(workspace->L) );
 //
 //#if defined(DEBUG_FOCUS)
-//        sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+//        snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step );
 //        Print_Sparse_Matrix2( workspace->L, fname, NULL );
-//        sprintf( fname, "%s.U%d.out", control->sim_name, data->step );
+//        snprintf( fname, SIZE + 10, "%s.U%d.out", control->sim_name, data->step );
 //        Print_Sparse_Matrix2( workspace->U, fname, NULL );
 //
 //        fprintf( stderr, "icholt-" );
-//        sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+//        snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step );
 //        Print_Sparse_Matrix2( workspace->L, fname, NULL );
 //        Print_Sparse_Matrix( U );
 //#endif
 //    }
+//#undef SIZE
 //#endif
 //}
 
@@ -1928,6 +565,17 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
 #endif
             break;
 
+        case SAI_PC:
+#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
+            data->timing.cm_solver_pre_comp +=
+                Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt,
+                        &workspace->H_app_inv );
+#else
+            fprintf( stderr, "LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
+            exit( INVALID_INPUT );
+#endif
+            break;
+
         default:
             fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -1937,18 +585,22 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
     Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
 
 #if defined(DEBUG)
+#define SIZE (1000)
+    char fname[SIZE];
+
     if ( control->cm_solver_pre_comp_type != NONE_PC && 
             control->cm_solver_pre_comp_type != DIAG_PC )
     {
         fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) );
 
 #if defined(DEBUG_FOCUS)
-        sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+        snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step );
         Print_Sparse_Matrix2( workspace->L, fname, NULL );
-        sprintf( fname, "%s.U%d.out", control->sim_name, data->step );
+        snprintf( fname, SIZE + 10, "%s.U%d.out", control->sim_name, data->step );
         Print_Sparse_Matrix2( workspace->U, fname, NULL );
 #endif
     }
+#undef SIZE
 #endif
 }
 
@@ -2031,6 +683,17 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
 #endif
             break;
 
+        case SAI_PC:
+#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
+            data->timing.cm_solver_pre_comp +=
+                Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt,
+                        &workspace->H_app_inv );
+#else
+            fprintf( stderr, "LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
+            exit( INVALID_INPUT );
+#endif
+            break;
+
         default:
             fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -2041,24 +704,27 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
     Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0;
 
 #if defined(DEBUG)
+#define SIZE (1000)
+    char fname[SIZE];
+
     if ( control->cm_solver_pre_comp_type != NONE_PC || 
             control->cm_solver_pre_comp_type != DIAG_PC )
     {
         fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) );
 
 #if defined(DEBUG_FOCUS)
-        sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+        snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step );
         Print_Sparse_Matrix2( workspace->L, fname, NULL );
-        sprintf( fname, "%s.U%d.out", control->sim_name, data->step );
+        snprintf( fname, SIZE + 10, "%s.U%d.out", control->sim_name, data->step );
         Print_Sparse_Matrix2( workspace->U, fname, NULL );
 #endif
     }
+#undef SIZE
 #endif
 }
 
 
-/* Setup routines before computing the preconditioner for QEq
- */
+
 static void Setup_Preconditioner_QEq( const reax_system * const system,
         const control_params * const control,
         simulation_data * const data, static_storage * const workspace,
@@ -2194,6 +860,11 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
             }
             break;
 
+        case SAI_PC:
+            Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres,
+                    &workspace->H_spar_patt );
+            break;
+
         default:
             fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -2341,6 +1012,11 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
             }
             break;
 
+        case SAI_PC:
+            Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres,
+                    &workspace->H_spar_patt );
+            break;
+
         default:
             fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -2490,6 +1166,11 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
             }
             break;
 
+        case SAI_PC:
+            Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres,
+                    &workspace->H_spar_patt );
+            break;
+
         default:
             fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -2509,7 +1190,8 @@ static void Calculate_Charges_QEq( const reax_system * const system,
     int i;
     real u, s_sum, t_sum;
 
-    s_sum = t_sum = 0.;
+    s_sum = 0.0;
+    t_sum = 0.0;
     for ( i = 0; i < system->N_cm; ++i )
     {
         s_sum += workspace->s[0][i];
@@ -2787,17 +1469,18 @@ void Compute_Charges( reax_system * const system, control_params * const control
         const reax_list * const far_nbrs, const output_controls * const out_control )
 {
 #if defined(DEBUG_FOCUS)
-    char fname[200];
+#define SIZE (200)
+    char fname[SIZE];
     FILE * fp;
 
     if ( data->step >= 100 )
     {
-        sprintf( fname, "s_%d_%s.out", data->step, control->sim_name );
+        snprintf( fname, SIZE + 11, "s_%d_%s.out", data->step, control->sim_name );
         fp = fopen( fname, "w" );
         Vector_Print( fp, NULL, workspace->s[0], system->N_cm );
         fclose( fp );
 
-        sprintf( fname, "t_%d_%s.out", data->step, control->sim_name );
+        snprintf( fname, SIZE + 11, "t_%d_%s.out", data->step, control->sim_name );
         fp = fopen( fname, "w" );
         Vector_Print( fp, NULL, workspace->t[0], system->N_cm );
         fclose( fp );
@@ -2827,19 +1510,20 @@ void Compute_Charges( reax_system * const system, control_params * const control
 #if defined(DEBUG_FOCUS)
     if ( data->step >= 100 )
     {
-        sprintf( fname, "H_%d_%s.out", data->step, control->sim_name );
+        snprintf( fname, SIZE + 11, "H_%d_%s.out", data->step, control->sim_name );
         Print_Sparse_Matrix2( workspace->H, fname, NULL );
 //        Print_Sparse_Matrix_Binary( workspace->H, fname );
 
-        sprintf( fname, "b_s_%d_%s.out", data->step, control->sim_name );
+        snprintf( fname, SIZE + 11, "b_s_%d_%s.out", data->step, control->sim_name );
         fp = fopen( fname, "w" );
         Vector_Print( fp, NULL, workspace->b_s, system->N_cm );
         fclose( fp );
 
-        sprintf( fname, "b_t_%d_%s.out", data->step, control->sim_name );
+        snprintf( fname, SIZE + 11, "b_t_%d_%s.out", data->step, control->sim_name );
         fp = fopen( fname, "w" );
         Vector_Print( fp, NULL, workspace->b_t, system->N_cm );
         fclose( fp );
     }
+#undef SIZE
 #endif
 }
diff --git a/sPuReMD/src/control.c b/sPuReMD/src/control.c
index ead90358..4f5616ab 100644
--- a/sPuReMD/src/control.c
+++ b/sPuReMD/src/control.c
@@ -79,6 +79,7 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
     control->cm_domain_sparsity = 1.0;
     control->cm_solver_pre_comp_type = ICHOLT_PC;
     control->cm_solver_pre_comp_sweeps = 3;
+    control->cm_solver_pre_comp_sai_thres = 0.1;
     control->cm_solver_pre_comp_refactor = 100;
     control->cm_solver_pre_comp_droptol = 0.01;
     control->cm_solver_pre_app_type = TRI_SOLVE_PA;
@@ -305,6 +306,11 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
             ival = atoi( tmp[1] );
             control->cm_solver_pre_comp_sweeps = ival;
         }
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_sai_thres") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->cm_solver_pre_comp_sai_thres = val;
+        }
         else if ( strcmp(tmp[0], "cm_solver_pre_app_type") == 0 )
         {
             ival = atoi( tmp[1] );
diff --git a/sPuReMD/src/geo_tools.c b/sPuReMD/src/geo_tools.c
index 48eb0dbb..fcde21be 100644
--- a/sPuReMD/src/geo_tools.c
+++ b/sPuReMD/src/geo_tools.c
@@ -482,7 +482,7 @@ char Write_PDB( reax_system* system, reax_list* bonds, simulation_data *data,
                   (system->box.box_norms[2] * system->box.box_norms[1]) );
 
     /*open pdb and write header*/
-    sprintf( fname, "%s-%d.pdb", control->sim_name, data->step );
+    snprintf( fname, MAX_STR + 9, "%s-%d.pdb", control->sim_name, data->step );
     pdb = fopen(fname, "w");
     fprintf( pdb, PDB_CRYST1_FORMAT_O,
              "CRYST1",
@@ -498,7 +498,7 @@ char Write_PDB( reax_system* system, reax_list* bonds, simulation_data *data,
         p_atom = &(system->atoms[i]);
         strncpy(name, p_atom->name, 8);
         Trim_Spaces(name);
-        sprintf( line, PDB_ATOM_FORMAT_O,
+        snprintf( line, MAX_STR, PDB_ATOM_FORMAT_O,
                  "ATOM  ", workspace->orig_id[i], p_atom->name, ' ', "REX", ' ', 1, ' ',
                  p_atom->x[0], p_atom->x[1], p_atom->x[2],
                  1.0, 0.0, "0", name, "  " );
diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c
index d4175d49..c5728825 100644
--- a/sPuReMD/src/init_md.c
+++ b/sPuReMD/src/init_md.c
@@ -321,6 +321,8 @@ void Init_Workspace( reax_system *system, control_params *control,
     workspace->H = NULL;
     workspace->H_sp = NULL;
     workspace->L = NULL;
+    workspace->H_spar_patt = NULL;
+    workspace->H_app_inv = NULL;
     workspace->U = NULL;
     workspace->Hdia_inv = NULL;
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
@@ -638,7 +640,8 @@ void Init_Lists( reax_system *system, control_params *control,
 void Init_Out_Controls( reax_system *system, control_params *control,
         static_storage *workspace, output_controls *out_control )
 {
-    char temp[1000];
+#define TEMP_SIZE (1000)
+    char temp[TEMP_SIZE];
 
     /* Init trajectory file */
     if ( out_control->write_steps > 0 )
@@ -697,11 +700,11 @@ void Init_Out_Controls( reax_system *system, control_params *control,
     /* Init molecular analysis file */
     if ( control->molec_anal )
     {
-        sprintf( temp, "%s.mol", control->sim_name );
+        snprintf( temp, TEMP_SIZE + 4, "%s.mol", control->sim_name );
         out_control->mol = fopen( temp, "w" );
         if ( control->num_ignored )
         {
-            sprintf( temp, "%s.ign", control->sim_name );
+            snprintf( temp, TEMP_SIZE + 4, "%s.ign", control->sim_name );
             out_control->ign = fopen( temp, "w" );
         }
     }
@@ -858,6 +861,8 @@ void Init_Out_Controls( reax_system *system, control_params *control,
        fprintf( stderr, "FILE OPEN ERROR. TERMINATING..." );
        exit( CANNOT_OPEN_FILE );
        }*/
+
+#undef TEMP_SIZE
 }
 
 
@@ -1001,6 +1006,11 @@ void Finalize_Workspace( reax_system *system, control_params *control,
         Deallocate_Matrix( workspace->L );
         Deallocate_Matrix( workspace->U );
     }
+    if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        Deallocate_Matrix( workspace->H_spar_patt );
+        Deallocate_Matrix( workspace->H_app_inv );
+    }
 
     for ( i = 0; i < 5; ++i )
     {
diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c
index 2094d4f4..c6f56ed4 100644
--- a/sPuReMD/src/lin_alg.c
+++ b/sPuReMD/src/lin_alg.c
@@ -20,11 +20,26 @@
   ----------------------------------------------------------------------*/
 
 #include "lin_alg.h"
-
 #include "allocate.h"
 #include "tool_box.h"
 #include "vector.h"
 
+#include "print_utils.h"
+
+/* Intel MKL */
+#if defined(HAVE_LAPACKE_MKL)
+#include "mkl.h"
+/* reference LAPACK */
+#elif defined(HAVE_LAPACKE)
+#include "lapacke.h"
+#endif
+
+typedef struct
+{
+    unsigned int j;
+    real val;
+} sparse_matrix_entry;
+
 
 /* global to make OpenMP shared (Sparse_MatVec) */
 #ifdef _OPENMP
@@ -61,13 +76,1733 @@ sparse_matrix *H_p;
 real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL;
 
 
+#if defined(TEST_MAT)
+static sparse_matrix * create_test_mat( void )
+{
+    unsigned int i, n;
+    sparse_matrix *H_test;
+
+    if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for test matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    //3x3, SPD, store lower half
+    i = 0;
+    n = 0;
+    H_test->start[n] = i;
+    H_test->j[i] = 0;
+    H_test->val[i] = 4.;
+    ++i;
+    ++n;
+    H_test->start[n] = i;
+    H_test->j[i] = 0;
+    H_test->val[i] = 12.;
+    ++i;
+    H_test->j[i] = 1;
+    H_test->val[i] = 37.;
+    ++i;
+    ++n;
+    H_test->start[n] = i;
+    H_test->j[i] = 0;
+    H_test->val[i] = -16.;
+    ++i;
+    H_test->j[i] = 1;
+    H_test->val[i] = -43.;
+    ++i;
+    H_test->j[i] = 2;
+    H_test->val[i] = 98.;
+    ++i;
+    ++n;
+    H_test->start[n] = i;
+
+    return H_test;
+}
+#endif
+
+
+/* Routine used with qsort for sorting nonzeros within a sparse matrix row
+ *
+ * v1/v2: pointers to column indices of nonzeros within a row (unsigned int)
+ */
+static int compare_matrix_entry(const void *v1, const void *v2)
+{
+    /* larger element has larger column index */
+    return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
+}
+
+
+/* Routine used for sorting nonzeros within a sparse matrix row;
+ *  internally, a combination of qsort and manual sorting is utilized
+ *  (parallel calls to qsort when multithreading, rows mapped to threads)
+ *
+ * A: sparse matrix for which to sort nonzeros within a row, stored in CSR format
+ */
+void Sort_Matrix_Rows( sparse_matrix * const A )
+{
+    unsigned int i, j, si, ei;
+    sparse_matrix_entry *temp;
+
+#ifdef _OPENMP
+    //    #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr)
+#endif
+    {
+        if ( ( temp = (sparse_matrix_entry *) malloc( A->n * sizeof(sparse_matrix_entry)) ) == NULL )
+        {
+            fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+
+        /* sort each row of A using column indices */
+#ifdef _OPENMP
+        //        #pragma omp for schedule(guided)
+#endif
+        for ( i = 0; i < A->n; ++i )
+        {
+            si = A->start[i];
+            ei = A->start[i + 1];
+
+            for ( j = 0; j < (ei - si); ++j )
+            {
+                (temp + j)->j = A->j[si + j];
+                (temp + j)->val = A->val[si + j];
+            }
+
+            /* polymorphic sort in standard C library using column indices */
+            qsort( temp, ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry );
+
+            for ( j = 0; j < (ei - si); ++j )
+            {
+                A->j[si + j] = (temp + j)->j;
+                A->val[si + j] = (temp + j)->val;
+            }
+        }
+
+        sfree( temp, "Sort_Matrix_Rows::temp" );
+    }
+}
+
+
+/* Convert a symmetric, half-sored sparse matrix into
+ * a full-stored sparse matrix
+ *
+ * A: symmetric sparse matrix, lower half stored in CSR
+ * A_full: resultant full sparse matrix in CSR
+ *   If A_full is NULL, allocate space, otherwise do not
+ *
+ * Assumptions:
+ *   A has non-zero diagonals
+ *   Each row of A has at least one non-zero (i.e., no rows with all zeros) */
+static void compute_full_sparse_matrix( const sparse_matrix * const A,
+                                        sparse_matrix ** A_full )
+{
+    int count, i, pj;
+    sparse_matrix *A_t;
+
+    if ( *A_full == NULL )
+    {
+        if ( Allocate_Matrix( A_full, A->n, 2 * A->m - A->n ) == FAILURE )
+        {
+            fprintf( stderr, "not enough memory for full A. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+    }
+
+
+    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for full A. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* Set up the sparse matrix data structure for A. */
+    Transpose( A, A_t );
+
+    count = 0;
+    for ( i = 0; i < A->n; ++i )
+    {
+
+        if ((*A_full)->start == NULL)
+        {
+        }
+        (*A_full)->start[i] = count;
+
+        /* A: symmetric, lower triangular portion only stored */
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            (*A_full)->val[count] = A->val[pj];
+            (*A_full)->j[count] = A->j[pj];
+            ++count;
+        }
+        /* A^T: symmetric, upper triangular portion only stored;
+         * skip diagonal from A^T, as included from A above */
+        for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
+        {
+            (*A_full)->val[count] = A_t->val[pj];
+            (*A_full)->j[count] = A_t->j[pj];
+            ++count;
+        }
+    }
+    (*A_full)->start[i] = count;
+
+    Deallocate_Matrix( A_t );
+}
+
+
+/* Setup routines for sparse approximate inverse preconditioner
+ *
+ * A: symmetric sparse matrix, lower half stored in CSR
+ * filter:
+ * A_spar_patt:
+ *
+ * Assumptions:
+ *   A has non-zero diagonals
+ *   Each row of A has at least one non-zero (i.e., no rows with all zeros) */
+void Setup_Sparsity_Pattern( const sparse_matrix * const A,
+                             const real filter, sparse_matrix ** A_spar_patt )
+{
+    int i, pj, size;
+    real min, max, threshold, val;
+
+    min = 0.0;
+    max = 0.0;
+
+    if ( *A_spar_patt == NULL )
+    {
+        if ( Allocate_Matrix( A_spar_patt, A->n, A->m ) == FAILURE )
+        {
+            fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+    }
+    else if ( ((*A_spar_patt)->m) < (A->m) )
+    {
+        Deallocate_Matrix( *A_spar_patt );
+        if ( Allocate_Matrix( A_spar_patt, A->n, A->m ) == FAILURE )
+        {
+            fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+    }
+
+    // find min and max element of the matrix
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            val = A->val[pj];
+            if ( pj == 0 )
+            {
+                min = val;
+                max = val;
+            }
+            else
+            {
+                if ( min > val )
+                {
+                    min = val;
+                }
+                if ( max < val )
+                {
+                    max = val;
+                }
+            }
+        }
+    }
+
+    threshold = min + (max - min) * (1.0 - filter);
+    // calculate the nnz of the sparsity pattern
+    //    for ( size = 0, i = 0; i < A->n; ++i )
+    //    {
+    //        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+    //        {
+    //            if ( threshold <= A->val[pj] )
+    //                size++;
+    //        }
+    //    }
+    //
+    //    if ( Allocate_Matrix( A_spar_patt, A->n, size ) == NULL )
+    //    {
+    //        fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" );
+    //        exit( INSUFFICIENT_MEMORY );
+    //    }
+
+    //(*A_spar_patt)->start[(*A_spar_patt)->n] = size;
+
+    // fill the sparsity pattern
+    for ( size = 0, i = 0; i < A->n; ++i )
+    {
+        (*A_spar_patt)->start[i] = size;
+
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            if ( ( A->val[pj] >= threshold )  || ( A->j[pj] == i ) )
+            {
+                (*A_spar_patt)->val[size] = A->val[pj];
+                (*A_spar_patt)->j[size] = A->j[pj];
+                size++;
+            }
+        }
+    }
+    (*A_spar_patt)->start[A->n] = size;
+}
+
+void Calculate_Droptol( const sparse_matrix * const A,
+                        real * const droptol, const real dtol )
+{
+    int i, j, k;
+    real val;
+#ifdef _OPENMP
+    static real *droptol_local;
+    unsigned int tid;
+#endif
+
+#ifdef _OPENMP
+    #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr)
+#endif
+    {
+#ifdef _OPENMP
+        tid = omp_get_thread_num();
+
+        #pragma omp master
+        {
+            /* keep b_local for program duration to avoid allocate/free
+             * overhead per Sparse_MatVec call*/
+            if ( droptol_local == NULL )
+            {
+                if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL )
+                {
+                    fprintf( stderr, "Not enough space for droptol. Terminating...\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+        }
+
+        #pragma omp barrier
+#endif
+
+        /* init droptol to 0 */
+        for ( i = 0; i < A->n; ++i )
+        {
+#ifdef _OPENMP
+            droptol_local[tid * A->n + i] = 0.0;
+#else
+            droptol[i] = 0.0;
+#endif
+        }
+
+#ifdef _OPENMP
+        #pragma omp barrier
+#endif
+
+        /* calculate sqaure of the norm of each row */
+#ifdef _OPENMP
+        #pragma omp for schedule(static)
+#endif
+        for ( i = 0; i < A->n; ++i )
+        {
+            for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
+            {
+                j = A->j[k];
+                val = A->val[k];
+
+#ifdef _OPENMP
+                droptol_local[tid * A->n + i] += val * val;
+                droptol_local[tid * A->n + j] += val * val;
+#else
+                droptol[i] += val * val;
+                droptol[j] += val * val;
+#endif
+            }
+
+            // diagonal entry
+            val = A->val[k];
+#ifdef _OPENMP
+            droptol_local[tid * A->n + i] += val * val;
+#else
+            droptol[i] += val * val;
+#endif
+        }
+
+#ifdef _OPENMP
+        #pragma omp barrier
+
+        #pragma omp for schedule(static)
+        for ( i = 0; i < A->n; ++i )
+        {
+            droptol[i] = 0.0;
+            for ( k = 0; k < omp_get_num_threads(); ++k )
+            {
+                droptol[i] += droptol_local[k * A->n + i];
+            }
+        }
+
+        #pragma omp barrier
+#endif
+
+        /* calculate local droptol for each row */
+        //fprintf( stderr, "droptol: " );
+#ifdef _OPENMP
+        #pragma omp for schedule(static)
+#endif
+        for ( i = 0; i < A->n; ++i )
+        {
+            //fprintf( stderr, "%f-->", droptol[i] );
+            droptol[i] = SQRT( droptol[i] ) * dtol;
+            //fprintf( stderr, "%f  ", droptol[i] );
+        }
+        //fprintf( stderr, "\n" );
+    }
+}
+
+
+int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol )
+{
+    int i, pj;
+    int fillin;
+    real val;
+
+    fillin = 0;
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+    default(none) private(i, pj, val) reduction(+: fillin)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            val = A->val[pj];
+
+            if ( FABS(val) > droptol[i] )
+            {
+                ++fillin;
+            }
+        }
+    }
+
+    return fillin + A->n;
+}
+
+
+#if defined(HAVE_SUPERLU_MT)
+real SuperLU_Factorize( const sparse_matrix * const A,
+                        sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, pj, count, *Ltop, *Utop, r;
+    sparse_matrix *A_t;
+    SuperMatrix A_S, AC_S, L_S, U_S;
+    NCformat *A_S_store;
+    SCPformat *L_S_store;
+    NCPformat *U_S_store;
+    superlumt_options_t superlumt_options;
+    pxgstrf_shared_t pxgstrf_shared;
+    pdgstrf_threadarg_t *pdgstrf_threadarg;
+    int_t nprocs;
+    fact_t fact;
+    trans_t trans;
+    yes_no_t refact, usepr;
+    real u, drop_tol;
+    real *a, *at;
+    int_t *asub, *atsub, *xa, *xat;
+    int_t *perm_c; /* column permutation vector */
+    int_t *perm_r; /* row permutations from partial pivoting */
+    void *work;
+    int_t info, lwork;
+    int_t permc_spec, panel_size, relax;
+    Gstat_t Gstat;
+    flops_t flopcnt;
+
+    /* Default parameters to control factorization. */
+#ifdef _OPENMP
+    //TODO: set as global parameter and use
+    #pragma omp parallel \
+    default(none) shared(nprocs)
+    {
+        #pragma omp master
+        {
+            /* SuperLU_MT spawns threads internally, so set and pass parameter */
+            nprocs = omp_get_num_threads();
+        }
+    }
+#else
+    nprocs = 1;
+#endif
+
+    //    fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */
+    fact = DOFACT; /* factor from scratch */
+    trans = NOTRANS;
+    refact = NO; /* first time factorization */
+    //TODO: add to control file and use the value there to set these
+    panel_size = sp_ienv(1); /* # consec. cols treated as unit task */
+    relax = sp_ienv(2); /* # cols grouped as relaxed supernode */
+    u = 1.0; /* diagonal pivoting threshold */
+    usepr = NO;
+    drop_tol = 0.0;
+    work = NULL;
+    lwork = 0;
+
+#if defined(DEBUG)
+    fprintf( stderr, "nprocs = %d\n", nprocs );
+    fprintf( stderr, "Panel size = %d\n", panel_size );
+    fprintf( stderr, "Relax = %d\n", relax );
+#endif
+
+    if ( !(perm_r = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for perm_r[].");
+    }
+    if ( !(perm_c = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for perm_c[].");
+    }
+    if ( !(superlumt_options.etree = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for etree[].");
+    }
+    if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for colcnt_h[].");
+    }
+    if ( !(superlumt_options.part_super_h = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for part_super__h[].");
+    }
+    if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL )
+            || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL )
+            || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL )
+            || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL )
+            || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) )
+    {
+        fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* Set up the sparse matrix data structure for A. */
+    Transpose( A, A_t );
+
+    count = 0;
+    for ( i = 0; i < A->n; ++i )
+    {
+        xa[i] = count;
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            a[count] = A->entries[pj].val;
+            asub[count] = A->entries[pj].j;
+            ++count;
+        }
+        for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
+        {
+            a[count] = A_t->entries[pj].val;
+            asub[count] = A_t->entries[pj].j;
+            ++count;
+        }
+    }
+    xa[i] = count;
+
+    dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa,
+                         &at, &atsub, &xat );
+
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6d", asub[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6.1f", a[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i <= A->n; ++i )
+        fprintf( stderr, "%6d", xa[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6d", atsub[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6.1f", at[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i <= A->n; ++i )
+        fprintf( stderr, "%6d", xat[i] );
+    fprintf( stderr, "\n" );
+
+    A_S.Stype = SLU_NC; /* column-wise, no supernode */
+    A_S.Dtype = SLU_D; /* double-precision */
+    A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */
+    A_S.nrow = A->n;
+    A_S.ncol = A->n;
+    A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
+    A_S_store = (NCformat *) A_S.Store;
+    A_S_store->nnz = 2 * A->start[A->n] - A->n;
+    A_S_store->nzval = at;
+    A_S_store->rowind = atsub;
+    A_S_store->colptr = xat;
+
+    /* ------------------------------------------------------------
+       Allocate storage and initialize statistics variables.
+       ------------------------------------------------------------*/
+    StatAlloc( A->n, nprocs, panel_size, relax, &Gstat );
+    StatInit( A->n, nprocs, &Gstat );
+
+    /* ------------------------------------------------------------
+       Get column permutation vector perm_c[], according to permc_spec:
+       permc_spec = 0: natural ordering
+       permc_spec = 1: minimum degree ordering on structure of A'*A
+       permc_spec = 2: minimum degree ordering on structure of A'+A
+       permc_spec = 3: approximate minimum degree for unsymmetric matrices
+       ------------------------------------------------------------*/
+    permc_spec = 0;
+    get_perm_c( permc_spec, &A_S, perm_c );
+
+    /* ------------------------------------------------------------
+       Initialize the option structure superlumt_options using the
+       user-input parameters;
+       Apply perm_c to the columns of original A to form AC.
+       ------------------------------------------------------------*/
+    pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax,
+                  u, usepr, drop_tol, perm_c, perm_r,
+                  work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat );
+
+    for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] );
+    fprintf( stderr, "\n" );
+
+    /* ------------------------------------------------------------
+       Compute the LU factorization of A.
+       The following routine will create nprocs threads.
+       ------------------------------------------------------------*/
+    pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info );
+
+    fprintf( stderr, "INFO: %d\n", info );
+
+    flopcnt = 0;
+    for (i = 0; i < nprocs; ++i)
+    {
+        flopcnt += Gstat.procstat[i].fcops;
+    }
+    Gstat.ops[FACT] = flopcnt;
+
+#if defined(DEBUG)
+    printf("\n** Result of sparse LU **\n");
+    L_S_store = (SCPformat *) L_S.Store;
+    U_S_store = (NCPformat *) U_S.Store;
+    printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz );
+    printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz );
+    fflush( stdout );
+#endif
+
+    /* convert L and R from SuperLU formats to CSR */
+    memset( Ltop, 0, (A->n + 1) * sizeof(int) );
+    memset( Utop, 0, (A->n + 1) * sizeof(int) );
+    memset( L->start, 0, (A->n + 1) * sizeof(int) );
+    memset( U->start, 0, (A->n + 1) * sizeof(int) );
+
+    for ( i = 0; i < 2 * L_S_store->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < 2 * U_S_store->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] );
+    fprintf( stderr, "\n" );
+
+    printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper );
+    for ( i = 0; i < A->n; ++i )
+    {
+        fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] );
+        fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] );
+        //TODO: correct for SCPformat for L?
+        //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
+        //        for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj )
+        //        {
+        //            ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1];
+        //        }
+        fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] );
+        fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] );
+        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
+        {
+            ++Utop[U_S_store->rowind[pj] + 1];
+            fprintf( stderr, "Utop[%5d]     = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] );
+        }
+    }
+    for ( i = 1; i <= A->n; ++i )
+    {
+        //        Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
+        Utop[i] = U->start[i] = Utop[i] + Utop[i - 1];
+        //        fprintf( stderr, "Utop[%5d]     = %d\n", i, Utop[i] );
+        //        fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] );
+    }
+    for ( i = 0; i < A->n; ++i )
+    {
+        //        for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj )
+        //        {
+        //            r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj];
+        //            L->entries[Ltop[r]].j = r;
+        //            L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj];
+        //            ++Ltop[r];
+        //        }
+        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
+        {
+            r = U_S_store->rowind[pj];
+            U->entries[Utop[r]].j = i;
+            U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj];
+            ++Utop[r];
+        }
+    }
+
+    /* ------------------------------------------------------------
+       Deallocate storage after factorization.
+       ------------------------------------------------------------*/
+    pxgstrf_finalize( &superlumt_options, &AC_S );
+    Deallocate_Matrix( A_t );
+    sfree( xa, "SuperLU_Factorize::xa" );
+    sfree( asub, "SuperLU_Factorize::asub" );
+    sfree( a, "SuperLU_Factorize::a" );
+    SUPERLU_FREE( perm_r );
+    SUPERLU_FREE( perm_c );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval );
+    SUPERLU_FREE( A_S.Store );
+    if ( lwork == 0 )
+    {
+        Destroy_SuperNode_SCP(&L_S);
+        Destroy_CompCol_NCP(&U_S);
+    }
+    else if ( lwork > 0 )
+    {
+        SUPERLU_FREE(work);
+    }
+    StatFree(&Gstat);
+
+    sfree( Utop, "SuperLU_Factorize::Utop" );
+    sfree( Ltop, "SuperLU_Factorize::Ltop" );
+
+    //TODO: return iters
+    return 0.;
+}
+#endif
+
+
+/* Diagonal (Jacobi) preconditioner computation */
+real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv )
+{
+    unsigned int i;
+    real start;
+
+    start = Get_Time( );
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+    default(none) private(i)
+#endif
+    for ( i = 0; i < H->n; ++i )
+    {
+        if ( H->val[H->start[i + 1] - 1] != 0.0 )
+        {
+            Hdia_inv[i] = 1.0 / H->val[H->start[i + 1] - 1];
+        }
+        else
+        {
+            Hdia_inv[i] = 1.0;
+        }
+    }
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Incomplete Cholesky factorization with dual thresholding */
+real ICHOLT( const sparse_matrix * const A, const real * const droptol,
+             sparse_matrix * const L, sparse_matrix * const U )
+{
+    int *tmp_j;
+    real *tmp_val;
+    int i, j, pj, k1, k2, tmptop, Ltop;
+    real val, start;
+    unsigned int *Utop;
+
+    start = Get_Time( );
+
+    if ( ( Utop = (unsigned int*) malloc((A->n + 1) * sizeof(unsigned int)) ) == NULL ||
+            ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL ||
+            ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "[ICHOLT] Not enough memory for preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    // clear variables
+    Ltop = 0;
+    tmptop = 0;
+    memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) );
+    memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
+    memset( Utop, 0, A->n * sizeof(unsigned int) );
+
+    for ( i = 0; i < A->n; ++i )
+    {
+        L->start[i] = Ltop;
+        tmptop = 0;
+
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            j = A->j[pj];
+            val = A->val[pj];
+
+            if ( FABS(val) > droptol[i] )
+            {
+                k1 = 0;
+                k2 = L->start[j];
+                while ( k1 < tmptop && k2 < L->start[j + 1] )
+                {
+                    if ( tmp_j[k1] < L->j[k2] )
+                    {
+                        ++k1;
+                    }
+                    else if ( tmp_j[k1] > L->j[k2] )
+                    {
+                        ++k2;
+                    }
+                    else
+                    {
+                        val -= (tmp_val[k1++] * L->val[k2++]);
+                    }
+                }
+
+                // L matrix is lower triangular,
+                // so right before the start of next row comes jth diagonal
+                val /= L->val[L->start[j + 1] - 1];
+
+                tmp_j[tmptop] = j;
+                tmp_val[tmptop] = val;
+                ++tmptop;
+            }
+        }
+
+        // sanity check
+        if ( A->j[pj] != i )
+        {
+            fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i );
+            exit( NUMERIC_BREAKDOWN );
+        }
+
+        // compute the ith diagonal in L
+        val = A->val[pj];
+        for ( k1 = 0; k1 < tmptop; ++k1 )
+        {
+            val -= (tmp_val[k1] * tmp_val[k1]);
+        }
+
+#if defined(DEBUG)
+        if ( val < 0.0 )
+        {
+            fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i );
+            exit( NUMERIC_BREAKDOWN );
+
+        }
+#endif
+
+        tmp_j[tmptop] = i;
+        tmp_val[tmptop] = SQRT( val );
+
+        // apply the dropping rule once again
+        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
+        //for( k1 = 0; k1<= tmptop; ++k1 )
+        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
+        //fprintf( stderr, "\n" );
+        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
+        for ( k1 = 0; k1 < tmptop; ++k1 )
+        {
+            if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] )
+            {
+                L->j[Ltop] = tmp_j[k1];
+                L->val[Ltop] = tmp_val[k1];
+                U->start[tmp_j[k1] + 1]++;
+                ++Ltop;
+                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
+            }
+        }
+        // keep the diagonal in any case
+        L->j[Ltop] = tmp_j[k1];
+        L->val[Ltop] = tmp_val[k1];
+        ++Ltop;
+        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
+    }
+
+    L->start[i] = Ltop;
+    //    fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
+
+    /* U = L^T (Cholesky factorization) */
+    Transpose( L, U );
+    //    for ( i = 1; i <= U->n; ++i )
+    //    {
+    //        Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
+    //    }
+    //    for ( i = 0; i < L->n; ++i )
+    //    {
+    //        for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
+    //        {
+    //            j = L->j[pj];
+    //            U->j[Utop[j]] = i;
+    //            U->val[Utop[j]] = L->val[pj];
+    //            Utop[j]++;
+    //        }
+    //    }
+
+    //    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
+
+    sfree( tmp_val, "ICHOLT::tmp_val" );
+    sfree( tmp_j, "ICHOLT::tmp_j" );
+    sfree( Utop, "ICHOLT::Utop" );
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Fine-grained (parallel) incomplete Cholesky factorization
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp. */
+#if defined(TESTING)
+real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
+                sparse_matrix * const U_t, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD;
+    int *Utop;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
+            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL )
+    {
+        fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+    default(none) shared(D_inv, D) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
+        D[i] = 1. / D_inv[i];
+    }
+
+    memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) );
+    memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) );
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * transformation DAD, where D = D(1./SQRT(D(A))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(guided) \
+    default(none) shared(DAD, D_inv, D) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.;
+    }
+
+    /* initial guesses for U^T,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) );
+    memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) );
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
+#endif
+        for ( j = 0; j < A->start[A->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 0; k <= A->n; ++k )
+            {
+                if ( U_t->start[k] > j )
+                {
+                    x = U_t->start[k - 1];
+                    ei_x = U_t->start[k];
+                    break;
+                }
+            }
+            /* column bounds of current nonzero */
+            y = U_t->start[U_t->j[j]];
+            ei_y = U_t->start[U_t->j[j] + 1];
+
+            /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
+            while ( U_t->j[x] < U_t->j[j] &&
+                    U_t->j[y] < U_t->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U_t->j[x] == U_t->j[y] )
+                {
+                    sum += (U_t->val[x] * U_t->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U_t->j[x] < U_t->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            sum = DAD->val[j] - sum;
+
+            /* diagonal entries */
+            if ( (k - 1) == U_t->j[j] )
+            {
+                /* sanity check */
+                if ( sum < ZERO )
+                {
+                    fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n");
+#if defined(DEBUG_FOCUS)
+                    fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
+                             k - 1, A->entries[j].j, A->entries[j].val );
+                    fprintf( stderr, "sum = %10.3f\n", sum);
+#endif
+                    exit(NUMERIC_BREAKDOWN);
+                }
+
+                U_t->val[j] = SQRT( sum );
+            }
+            /* non-diagonal entries */
+            else
+            {
+                U_t->val[j] = sum / U_t->val[ei_y - 1];
+            }
+        }
+    }
+
+    /* apply inverse transformation D^{-1}U^{T},
+     * since DAD \approx U^{T}U, so
+     * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(guided) \
+    default(none) shared(D_inv) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            U_t->val[pj] *= D_inv[i];
+        }
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
+#endif
+
+    /* transpose U^{T} and copy into U */
+    Transpose( U_t, U );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
+#endif
+
+    Deallocate_Matrix( DAD );
+    sfree( D_inv, "ICHOL_PAR::D_inv" );
+    sfree( D, "ICHOL_PAR::D" );
+    sfree( Utop, "ICHOL_PAR::Utop" );
+
+    return Get_Timing_Info( start );
+}
+#endif
+
+
+/* Fine-grained (parallel) incomplete LU factorization
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp.
+ *
+ * A: symmetric, half-stored (lower triangular), CSR format
+ * sweeps: number of loops over non-zeros for computation
+ * L / U: factorized triangular matrices (A \approx LU), CSR format */
+real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
+              sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x, y, ei_x, ei_y;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
+            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+    default(none) shared(D, D_inv) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
+        D[i] = 1.0 / D_inv[i];
+        //        printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] );
+    }
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * transformation DAD, where D = D(1./SQRT(abs(D(A)))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+    default(none) shared(DAD, D) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.0;
+    }
+
+    /* initial guesses for L and U,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+    /* store U^T in CSR for row-wise access and tranpose later */
+    memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+
+    /* L has unit diagonal, by convention */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) default(none) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        L->val[L->start[i + 1] - 1] = 1.0;
+    }
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero in L */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
+            while ( L->j[x] < L->j[j] &&
+                    L->j[y] < L->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( L->j[x] == L->j[y] )
+                {
+                    sum += (L->val[x] * U->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( L->j[x] < L->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            if ( j != ei_x - 1 )
+            {
+                L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1];
+            }
+        }
+
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
+            while ( U->j[x] < U->j[j] &&
+                    U->j[y] < U->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U->j[x] == U->j[y] )
+                {
+                    sum += (L->val[y] * U->val[x]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U->j[x] < U->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            U->val[j] = DAD->val[j] - sum;
+        }
+    }
+
+    /* apply inverse transformation:
+     * since DAD \approx LU, then
+     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+    default(none) shared(DAD, D_inv) private(i, pj)
+#endif
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
+        {
+            L->val[pj] = D_inv[i] * L->val[pj];
+            /* currently storing U^T, so use row index instead of column index */
+            U->val[pj] = U->val[pj] * D_inv[i];
+        }
+    }
+
+    Transpose_I( U );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 );
+    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
+#endif
+
+    Deallocate_Matrix( DAD );
+    sfree( D_inv, "ILU_PAR::D_inv" );
+    sfree( D, "ILU_PAR::D_inv" );
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Fine-grained (parallel) incomplete LU factorization with thresholding
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp.
+ *
+ * A: symmetric, half-stored (lower triangular), CSR format
+ * droptol: row-wise tolerances used for dropping
+ * sweeps: number of loops over non-zeros for computation
+ * L / U: factorized triangular matrices (A \approx LU), CSR format */
+real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
+               const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD, *L_temp, *U_temp;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
+            Allocate_Matrix( &L_temp, A->n, A->m ) == FAILURE ||
+            Allocate_Matrix( &U_temp, A->n, A->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+    default(none) shared(D, D_inv) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
+        D[i] = 1.0 / D_inv[i];
+    }
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * transformation DAD, where D = D(1./SQRT(D(A))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+    default(none) shared(DAD, D) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.0;
+    }
+
+    /* initial guesses for L and U,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+    /* store U^T in CSR for row-wise access and tranpose later */
+    memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+
+    /* L has unit diagonal, by convention */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+    default(none) private(i) shared(L_temp)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        L_temp->val[L_temp->start[i + 1] - 1] = 1.0;
+    }
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero in L */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
+            while ( L_temp->j[x] < L_temp->j[j] &&
+                    L_temp->j[y] < L_temp->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( L_temp->j[x] == L_temp->j[y] )
+                {
+                    sum += (L_temp->val[x] * U_temp->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( L_temp->j[x] < L_temp->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            if ( j != ei_x - 1 )
+            {
+                L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1];
+            }
+        }
+
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
+            while ( U_temp->j[x] < U_temp->j[j] &&
+                    U_temp->j[y] < U_temp->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U_temp->j[x] == U_temp->j[y] )
+                {
+                    sum += (L_temp->val[y] * U_temp->val[x]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U_temp->j[x] < U_temp->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            U_temp->val[j] = DAD->val[j] - sum;
+        }
+    }
+
+    /* apply inverse transformation:
+     * since DAD \approx LU, then
+     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+    default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj)
+#endif
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
+        {
+            L_temp->val[pj] = D_inv[i] * L_temp->val[pj];
+            /* currently storing U^T, so use row index instead of column index */
+            U_temp->val[pj] = U_temp->val[pj] * D_inv[i];
+        }
+    }
+
+    /* apply the dropping rule */
+    Ltop = 0;
+    Utop = 0;
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        L->start[i] = Ltop;
+        U->start[i] = Utop;
+
+        for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj )
+        {
+            if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) )
+            {
+                L->j[Ltop] = L_temp->j[pj];
+                L->val[Ltop] = L_temp->val[pj];
+                ++Ltop;
+            }
+        }
+
+        /* diagonal */
+        L->j[Ltop] = L_temp->j[pj];
+        L->val[Ltop] = L_temp->val[pj];
+        ++Ltop;
+
+        for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj )
+        {
+            if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) )
+            {
+                U->j[Utop] = U_temp->j[pj];
+                U->val[Utop] = U_temp->val[pj];
+                ++Utop;
+            }
+        }
+
+        /* diagonal */
+        U->j[Utop] = U_temp->j[pj];
+        U->val[Utop] = U_temp->val[pj];
+        ++Utop;
+    }
+
+    L->start[i] = Ltop;
+    U->start[i] = Utop;
+
+    Transpose_I( U );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d\n", L->start[L->n] );
+    fprintf( stderr, "nnz(U): %d\n", U->start[U->n] );
+#endif
+
+    Deallocate_Matrix( U_temp );
+    Deallocate_Matrix( L_temp );
+    Deallocate_Matrix( DAD );
+    sfree( D_inv, "ILUT_PAR::D_inv" );
+    sfree( D, "ILUT_PAR::D_inv" );
+
+    return Get_Timing_Info( start );
+}
+
+
+#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
+real Sparse_Approx_Inverse( const sparse_matrix * const A,
+                            const sparse_matrix * const A_spar_patt,
+                            sparse_matrix ** A_app_inv )
+{
+    int i, k, pj, j_temp, identity_pos;
+    int N, M, d_i, d_j;
+    lapack_int m, n, nrhs, lda, ldb, info;
+    int *pos_x, *pos_y;
+    real start;
+    real *e_j, *dense_matrix;
+    sparse_matrix *A_full = NULL, *A_spar_patt_full = NULL;
+    char *X, *Y;
+
+    start = Get_Time( );
+
+    // Get A and A_spar_patt full matrices
+    compute_full_sparse_matrix( A, &A_full );
+    compute_full_sparse_matrix( A_spar_patt, &A_spar_patt_full );
+
+    // A_app_inv will be the same as A_spar_patt_full except the val array
+    if ( Allocate_Matrix( A_app_inv, A_spar_patt_full->n, A_spar_patt_full->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for approximate inverse matrix. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    (*A_app_inv)->start[(*A_app_inv)->n] = A_spar_patt_full->start[A_spar_patt_full->n];
+
+#ifdef _OPENMP
+    #pragma omp parallel default(none) \
+    shared(A_full, A_spar_patt_full) private(i, k, pj, j_temp, identity_pos, \
+            N, M, d_i, d_j, m, n, nrhs, lda, ldb, info, X, Y, pos_x, pos_y, e_j, dense_matrix)
+#endif
+    {
+        X = (char *) smalloc( sizeof(char) * A->n, "Sparse_Approx_Inverse::X" );
+        Y = (char *) smalloc( sizeof(char) * A->n, "Sparse_Approx_Inverse::Y" );
+        pos_x = (int *) smalloc( sizeof(int) * A->n, "Sparse_Approx_Inverse::pos_x" );
+        pos_y = (int *) smalloc( sizeof(int) * A->n, "Sparse_Approx_Inverse::pos_y" );
+
+        for ( i = 0; i < A->n; ++i )
+        {
+            X[i] = 0;
+            Y[i] = 0;
+            pos_x[i] = 0;
+            pos_y[i] = 0;
+        }
+
+#ifdef _OPENMP
+        #pragma omp for schedule(static)
+#endif
+        for ( i = 0; i < A_spar_patt_full->n; ++i )
+        {
+            N = 0;
+            M = 0;
+
+            // find column indices of nonzeros (which will be the columns indices of the dense matrix)
+            for ( pj = A_spar_patt_full->start[i]; pj < A_spar_patt_full->start[i + 1]; ++pj )
+            {
+
+                j_temp = A_spar_patt_full->j[pj];
+
+                Y[j_temp] = 1;
+                pos_y[j_temp] = N;
+                ++N;
+
+                // for each of those indices
+                // search through the row of full A of that index
+                for ( k = A_full->start[j_temp]; k < A_full->start[j_temp + 1]; ++k )
+                {
+                    // and accumulate the nonzero column indices to serve as the row indices of the dense matrix
+                    X[A_full->j[k]] = 1;
+                }
+            }
+
+            // enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix
+            identity_pos = M;
+            for ( k = 0; k < A_full->n; k++)
+            {
+                if ( X[k] != 0 )
+                {
+                    pos_x[M] = k;
+                    if ( k == i )
+                    {
+                        identity_pos = M;
+                    }
+                    ++M;
+                }
+            }
+
+            // allocate memory for NxM dense matrix
+            dense_matrix = (real *) smalloc( sizeof(real) * N * M,
+                                             "Sparse_Approx_Inverse::dense_matrix" );
+
+            // fill in the entries of dense matrix
+            for ( d_i = 0; d_i < M; ++d_i)
+            {
+                // all rows are initialized to zero
+                for ( d_j = 0; d_j < N; ++d_j )
+                {
+                    dense_matrix[d_i * N + d_j] = 0.0;
+                }
+                // change the value if any of the column indices is seen
+                for ( d_j = A_full->start[pos_x[d_i]];
+                        d_j < A_full->start[pos_x[d_i + 1]]; ++d_j )
+                {
+                    if ( Y[A_full->j[d_j]] == 1 )
+                    {
+                        dense_matrix[d_i * N + pos_y[A_full->j[d_j]]] = A_full->val[d_j];
+                    }
+                }
+
+            }
+
+            /* create the right hand side of the linear equation
+               that is the full column of the identity matrix*/
+            e_j = (real *) smalloc( sizeof(real) * M,
+                                    "Sparse_Approx_Inverse::e_j" );
+
+            for ( k = 0; k < M; ++k )
+            {
+                e_j[k] = 0.0;
+            }
+            e_j[identity_pos] = 1.0;
+
+            /* Solve the overdetermined system AX = B through the least-squares problem:
+             * min ||B - AX||_2 */
+            m = M;
+            n = N;
+            nrhs = 1;
+            lda = N;
+            ldb = nrhs;
+            info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda,
+                                  e_j, ldb );
+
+            /* Check for the full rank */
+            if ( info > 0 )
+            {
+                fprintf( stderr, "The diagonal element %i of the triangular factor ", info );
+                fprintf( stderr, "of A is zero, so that A does not have full rank;\n" );
+                fprintf( stderr, "the least squares solution could not be computed.\n" );
+                exit( INVALID_INPUT );
+            }
+
+            /* Print least squares solution */
+            // print_matrix( "Least squares solution", n, nrhs, b, ldb );
+
+            // accumulate the resulting vector to build A_app_inv
+            (*A_app_inv)->start[i] = A_spar_patt_full->start[i];
+            for ( k = A_spar_patt_full->start[i]; k < A_spar_patt_full->start[i + 1]; ++k)
+            {
+                (*A_app_inv)->j[k] = A_spar_patt_full->j[k];
+                (*A_app_inv)->val[k] = e_j[k - A_spar_patt_full->start[i]];
+            }
+
+            //empty variables that will be used next iteration
+            sfree( dense_matrix, "Sparse_Approx_Inverse::dense_matrix" );
+            sfree( e_j, "Sparse_Approx_Inverse::e_j"  );
+            for ( k = 0; k < A->n; ++k )
+            {
+                X[k] = 0;
+                Y[k] = 0;
+                pos_x[k] = 0;
+                pos_y[k] = 0;
+            }
+        }
+
+        sfree( pos_y, "Sparse_Approx_Inverse::pos_y" );
+        sfree( pos_x, "Sparse_Approx_Inverse::pos_x" );
+        sfree( Y, "Sparse_Approx_Inverse::Y" );
+        sfree( X, "Sparse_Approx_Inverse::X" );
+    }
+
+    Deallocate_Matrix( A_full );
+    Deallocate_Matrix( A_spar_patt_full );
+
+    return Get_Timing_Info( start );
+}
+#endif
+
+
 /* sparse matrix-vector product Ax=b
  * where:
  *   A: lower triangular matrix, stored in CSR format
  *   x: vector
  *   b: vector (result) */
 static void Sparse_MatVec( const sparse_matrix * const A,
-        const real * const x, real * const b )
+                           const real * const x, real * const b )
 {
     int i, j, k, n, si, ei;
     real H;
@@ -137,6 +1872,31 @@ static void Sparse_MatVec( const sparse_matrix * const A,
 }
 
 
+/* sparse matrix-vector product Ax = b
+ * where:
+ *   A: matrix, stored in CSR format
+ *   x: vector
+ *   b: vector (result) */
+static void Sparse_MatVec_full( const sparse_matrix * const A,
+                                const real * const x, real * const b )
+{
+    int i, pj;
+
+    Vector_MakeZero( b, A->n );
+
+#ifdef _OPENMP
+    #pragma omp for schedule(static) default(none) private(i, j)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            b[i] += A->val[pj] * x[A->j[pj]];
+        }
+    }
+}
+
+
 /* Transpose A and copy into A^T
  *
  * A: stored in CSR
@@ -217,7 +1977,7 @@ void Transpose_I( sparse_matrix * const A )
  * N: dimensions of preconditioner and vectors (# rows in H)
  */
 static void diag_pre_app( const real * const Hdia_inv, const real * const y,
-        real * const x, const int N )
+                          real * const x, const int N )
 {
     unsigned int i;
 
@@ -243,7 +2003,7 @@ static void diag_pre_app( const real * const Hdia_inv, const real * const y,
  *   LU has non-zero diagonals
  *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
 void tri_solve( const sparse_matrix * const LU, const real * const y,
-        real * const x, const int N, const TRIANGULARITY tri )
+                real * const x, const int N, const TRIANGULARITY tri )
 {
     int i, pj, j, si, ei;
     real val;
@@ -301,8 +2061,8 @@ void tri_solve( const sparse_matrix * const LU, const real * const y,
  *   LU has non-zero diagonals
  *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
 void tri_solve_level_sched( const sparse_matrix * const LU,
-        const real * const y, real * const x, const int N,
-        const TRIANGULARITY tri, int find_levels )
+                            const real * const y, real * const x, const int N,
+                            const TRIANGULARITY tri, int find_levels )
 {
     int i, j, pj, local_row, local_level;
 
@@ -368,10 +2128,10 @@ void tri_solve_level_sched( const sparse_matrix * const LU,
                     ++level_rows_cnt[local_level];
                 }
 
-//#if defined(DEBUG)
+                //#if defined(DEBUG)
                 fprintf(stderr, "levels(L): %d\n", levels);
                 fprintf(stderr, "NNZ(L): %d\n", LU->start[N]);
-//#endif
+                //#endif
             }
             else
             {
@@ -388,10 +2148,10 @@ void tri_solve_level_sched( const sparse_matrix * const LU,
                     ++level_rows_cnt[local_level];
                 }
 
-//#if defined(DEBUG)
+                //#if defined(DEBUG)
                 fprintf(stderr, "levels(U): %d\n", levels);
                 fprintf(stderr, "NNZ(U): %d\n", LU->start[N]);
-//#endif
+                //#endif
             }
 
             for ( i = 1; i < levels + 1; ++i )
@@ -499,7 +2259,7 @@ static void compute_H_full( const sparse_matrix * const H )
             H_full->j[count] = H->j[pj];
             ++count;
         }
-        /* H^T: symmetric, upper triangular portion only stored; 
+        /* H^T: symmetric, upper triangular portion only stored;
          * skip diagonal from H^T, as included from H above */
         for ( pj = H_t->start[i] + 1; pj < H_t->start[i + 1]; ++pj )
         {
@@ -524,7 +2284,7 @@ static void compute_H_full( const sparse_matrix * const H )
  *
  * Reference:
  * Umit V. Catalyurek et al.
- * Graph Coloring Algorithms for Multi-core 
+ * Graph Coloring Algorithms for Multi-core
  *  and Massively Threaded Architectures
  * Parallel Computing, 2012
  */
@@ -691,15 +2451,15 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
         sfree( conflict_local, "graph_coloring::conflict_local" );
         sfree( fb_color, "graph_coloring::fb_color" );
 
-//#if defined(DEBUG)
-//#ifdef _OPENMP
-//    #pragma omp master
-//#endif
-//    {
-//        for ( i = 0; i < A->n; ++i )
-//            printf("Vertex: %5d, Color: %5d\n", i, color[i] );
-//    }
-//#endif
+        //#if defined(DEBUG)
+        //#ifdef _OPENMP
+        //    #pragma omp master
+        //#endif
+        //    {
+        //        for ( i = 0; i < A->n; ++i )
+        //            printf("Vertex: %5d, Color: %5d\n", i, color[i] );
+        //    }
+        //#endif
 
 #ifdef _OPENMP
         #pragma omp barrier
@@ -721,7 +2481,7 @@ void sort_colors( const unsigned int n, const TRIANGULARITY tri )
 
     /* sort vertices by color (ascending within a color)
      *  1) count colors
-     *  2) determine offsets of color ranges 
+     *  2) determine offsets of color ranges
      *  3) sort by color
      *
      *  note: color is 1-based */
@@ -756,7 +2516,7 @@ void sort_colors( const unsigned int n, const TRIANGULARITY tri )
  * tri: coloring to triangular factor to use (lower/upper)
  */
 static void permute_vector( real * const x, const unsigned int n, const int invert_map,
-       const TRIANGULARITY tri )
+                            const TRIANGULARITY tri )
 {
     unsigned int i;
 
@@ -958,7 +2718,7 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
 
         /* internal storage for graph coloring (global to facilitate simultaneous access to OpenMP threads) */
         if ( (color = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
-                (to_color =(unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
+                (to_color = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
                 (conflict = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
                 (conflict_cnt = (unsigned int*) malloc(sizeof(unsigned int) * (num_thread + 1))) == NULL ||
                 (recolor = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
@@ -978,7 +2738,7 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
 
     graph_coloring( H_full, LOWER );
     sort_colors( H_full->n, LOWER );
-    
+
     memcpy( H_p->start, H->start, sizeof(int) * (H->n + 1) );
     memcpy( H_p->j, H->j, sizeof(int) * (H->start[H->n]) );
     memcpy( H_p->val, H->val, sizeof(real) * (H->start[H->n]) );
@@ -1000,8 +2760,8 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
  * Note: Newmann series arises from series expansion of the inverse of
  * the coefficient matrix in the triangular system */
 void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
-        const real * const b, real * const x, const TRIANGULARITY tri, const
-        unsigned int maxiter )
+                  const real * const b, real * const x, const TRIANGULARITY tri, const
+                  unsigned int maxiter )
 {
     unsigned int i, k, si = 0, ei = 0, iter;
 
@@ -1108,7 +2868,7 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
  *   Matrices have non-zero diagonals
  *   Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */
 static void apply_preconditioner( const static_storage * const workspace, const control_params * const control,
-        const real * const y, real * const x, const int fresh_pre )
+                                  const real * const y, real * const x, const int fresh_pre )
 {
     int i, si;
 
@@ -1133,6 +2893,9 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 tri_solve( workspace->L, y, x, workspace->L->n, LOWER );
                 tri_solve( workspace->U, x, x, workspace->U->n, UPPER );
                 break;
+            case SAI_PC:
+                Sparse_MatVec_full( workspace->H_app_inv, y, x );
+                break;
             default:
                 fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                 exit( INVALID_INPUT );
@@ -1151,6 +2914,9 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 tri_solve_level_sched( workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
                 tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre );
                 break;
+            case SAI_PC:
+                Sparse_MatVec_full( workspace->H_app_inv, y, x );
+                break;
             default:
                 fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                 exit( INVALID_INPUT );
@@ -1161,6 +2927,7 @@ static void apply_preconditioner( const static_storage * const workspace, const
             switch ( control->cm_solver_pre_comp_type )
             {
             case DIAG_PC:
+            case SAI_PC:
                 fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                 exit( INVALID_INPUT );
                 break;
@@ -1170,14 +2937,14 @@ static void apply_preconditioner( const static_storage * const workspace, const
 #ifdef _OPENMP
                 #pragma omp single
 #endif
-                {
-                    memcpy( y_p, y, sizeof(real) * workspace->H->n );
-                }
+            {
+                memcpy( y_p, y, sizeof(real) * workspace->H->n );
+            }
 
-                permute_vector( y_p, workspace->H->n, FALSE, LOWER );
-                tri_solve_level_sched( workspace->L, y_p, x, workspace->L->n, LOWER, fresh_pre );
-                tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre );
-                permute_vector( x, workspace->H->n, TRUE, UPPER );
+            permute_vector( y_p, workspace->H->n, FALSE, LOWER );
+            tri_solve_level_sched( workspace->L, y_p, x, workspace->L->n, LOWER, fresh_pre );
+            tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre );
+            permute_vector( x, workspace->H->n, TRUE, UPPER );
             break;
             default:
                 fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
@@ -1189,6 +2956,7 @@ static void apply_preconditioner( const static_storage * const workspace, const
             switch ( control->cm_solver_pre_comp_type )
             {
             case DIAG_PC:
+            case SAI_PC:
                 fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                 exit( INVALID_INPUT );
                 break;
@@ -1198,61 +2966,61 @@ static void apply_preconditioner( const static_storage * const workspace, const
 #ifdef _OPENMP
                 #pragma omp single
 #endif
+            {
+                if ( Dinv_L == NULL )
                 {
-                    if ( Dinv_L == NULL )
+                    if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL )
                     {
-                        if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL )
-                        {
-                            fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-                            exit( INSUFFICIENT_MEMORY );
-                        }
+                        fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                        exit( INSUFFICIENT_MEMORY );
                     }
                 }
+            }
 
                 /* construct D^{-1}_L */
-                if ( fresh_pre == TRUE )
-                {
+            if ( fresh_pre == TRUE )
+            {
 #ifdef _OPENMP
-                    #pragma omp for schedule(static)
+                #pragma omp for schedule(static)
 #endif
-                    for ( i = 0; i < workspace->L->n; ++i )
-                    {
-                        si = workspace->L->start[i + 1] - 1;
-                        Dinv_L[i] = 1. / workspace->L->val[si];
-                    }
+                for ( i = 0; i < workspace->L->n; ++i )
+                {
+                    si = workspace->L->start[i + 1] - 1;
+                    Dinv_L[i] = 1. / workspace->L->val[si];
                 }
+            }
 
-                jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
+            jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
 
 #ifdef _OPENMP
-                #pragma omp single
+            #pragma omp single
 #endif
+            {
+                if ( Dinv_U == NULL )
                 {
-                    if ( Dinv_U == NULL )
+                    if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL )
                     {
-                        if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL )
-                        {
-                            fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-                            exit( INSUFFICIENT_MEMORY );
-                        }
+                        fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                        exit( INSUFFICIENT_MEMORY );
                     }
                 }
+            }
 
                 /* construct D^{-1}_U */
-                if ( fresh_pre == TRUE )
-                {
+            if ( fresh_pre == TRUE )
+            {
 #ifdef _OPENMP
-                    #pragma omp for schedule(static)
+                #pragma omp for schedule(static)
 #endif
-                    for ( i = 0; i < workspace->U->n; ++i )
-                    {
-                        si = workspace->U->start[i];
-                        Dinv_U[i] = 1. / workspace->U->val[si];
-                    }
+                for ( i = 0; i < workspace->U->n; ++i )
+                {
+                    si = workspace->U->start[i];
+                    Dinv_U[i] = 1. / workspace->U->val[si];
                 }
+            }
 
-                jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
-                break;
+            jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
+            break;
             default:
                 fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                 exit( INVALID_INPUT );
@@ -1269,149 +3037,50 @@ static void apply_preconditioner( const static_storage * const workspace, const
 }
 
 
-/* generalized minimual residual iterative solver for sparse linear systems */
+/* generalized minimual residual method with restarting for sparse linear systems */
 int GMRES( const static_storage * const workspace, const control_params * const control,
-        simulation_data * const data, const sparse_matrix * const H, const real * const b,
-        const real tol, real * const x, const int fresh_pre )
+           simulation_data * const data, const sparse_matrix * const H, const real * const b,
+           const real tol, real * const x, const int fresh_pre )
 {
     int i, j, k, itr, N, g_j, g_itr;
     real cc, tmp1, tmp2, temp, ret_temp, bnorm, time_start;
 
     N = H->n;
+    g_j = 0;
+    g_itr = 0;
 
 #ifdef _OPENMP
     #pragma omp parallel default(none) private(i, j, k, itr, bnorm, ret_temp) \
-        shared(N, cc, tmp1, tmp2, temp, time_start, g_itr, g_j, stderr)
+    shared(N, cc, tmp1, tmp2, temp, time_start, g_itr, g_j, stderr)
 #endif
     {
         j = 0;
         itr = 0;
 
-#ifdef _OPENMP
-        #pragma omp master
-#endif
-        {
-            time_start = Get_Time( );
-        }
+        time_start = Get_Time( );
         bnorm = Norm( b, N );
-#ifdef _OPENMP
-        #pragma omp master
-#endif
-        {
-            data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
-        }
-
-        if ( control->cm_solver_pre_comp_type == DIAG_PC )
-        {
-            /* apply preconditioner to residual */
-#ifdef _OPENMP
-            #pragma omp master
-#endif
-            {
-                time_start = Get_Time( );
-            }
-            apply_preconditioner( workspace, control, b, workspace->b_prc, fresh_pre );
-#ifdef _OPENMP
-            #pragma omp master
-#endif
-            {
-                data->timing.cm_solver_pre_app += Get_Timing_Info( time_start );
-            }
-        }
+        data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
 
         /* GMRES outer-loop */
         for ( itr = 0; itr < control->cm_solver_max_iters; ++itr )
         {
             /* calculate r0 */
-#ifdef _OPENMP
-            #pragma omp master
-#endif
-            {
-                time_start = Get_Time( );
-            }
+            time_start = Get_Time( );
             Sparse_MatVec( H, x, workspace->b_prm );
-#ifdef _OPENMP
-            #pragma omp master
-#endif
-            {
-                data->timing.cm_solver_spmv += Get_Timing_Info( time_start );
-            }
-
-            if ( control->cm_solver_pre_comp_type == DIAG_PC )
-            {
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    time_start = Get_Time( );
-                }
-                apply_preconditioner( workspace, control, workspace->b_prm, workspace->b_prm, FALSE );
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    data->timing.cm_solver_pre_app += Get_Timing_Info( time_start );
-                }
-            }
+            data->timing.cm_solver_spmv += Get_Timing_Info( time_start );
 
-            if ( control->cm_solver_pre_comp_type == DIAG_PC )
-            {
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    time_start = Get_Time( );
-                }
-                Vector_Sum( workspace->v[0], 1., workspace->b_prc, -1., workspace->b_prm, N );
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
-                }
-            }
-            else
-            {
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    time_start = Get_Time( );
-                }
-                Vector_Sum( workspace->v[0], 1., b, -1., workspace->b_prm, N );
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
-                }
-            }
+            time_start = Get_Time( );
+            Vector_Sum( workspace->b_prc, 1., b, -1., workspace->b_prm, N );
+            data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
 
-            if ( control->cm_solver_pre_comp_type != DIAG_PC )
-            {
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    time_start = Get_Time( );
-                }
-                apply_preconditioner( workspace, control, workspace->v[0], workspace->v[0],
-                        itr == 0 ? fresh_pre : FALSE );
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    data->timing.cm_solver_pre_app += Get_Timing_Info( time_start );
-                }
-            }
+            time_start = Get_Time( );
+            apply_preconditioner( workspace, control, workspace->b_prc, workspace->v[0],
+                                  itr == 0 ? fresh_pre : FALSE );
+            data->timing.cm_solver_pre_app += Get_Timing_Info( time_start );
 
-#ifdef _OPENMP
-            #pragma omp master
-#endif
-            {
-                time_start = Get_Time( );
-            }
+            time_start = Get_Time( );
             ret_temp = Norm( workspace->v[0], N );
+
 #ifdef _OPENMP
             #pragma omp single
 #endif
@@ -1419,91 +3088,54 @@ int GMRES( const static_storage * const workspace, const control_params * const
                 workspace->g[0] = ret_temp;
             }
 
-            Vector_Scale( workspace->v[0], 1. / workspace->g[0], workspace->v[0], N );
-#ifdef _OPENMP
-            #pragma omp master
-#endif
-            {
-                data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
-            }
+            Vector_Scale( workspace->v[0], 1. / ret_temp, workspace->v[0], N );
+            data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
 
             /* GMRES inner-loop */
             for ( j = 0; j < control->cm_solver_restart && FABS(workspace->g[j]) / bnorm > tol; j++ )
             {
                 /* matvec */
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    time_start = Get_Time( );
-                }
-                Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] );
-
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    data->timing.cm_solver_spmv += Get_Timing_Info( time_start );
-                }
-
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    time_start = Get_Time( );
-                }
-                apply_preconditioner( workspace, control, workspace->v[j + 1], workspace->v[j + 1], FALSE );
+                time_start = Get_Time( );
+                Sparse_MatVec( H, workspace->v[j], workspace->b_prc );
+                data->timing.cm_solver_spmv += Get_Timing_Info( time_start );
 
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    data->timing.cm_solver_pre_app += Get_Timing_Info( time_start );
-                }
+                time_start = Get_Time( );
+                apply_preconditioner( workspace, control, workspace->b_prc, workspace->v[j + 1], FALSE );
+                data->timing.cm_solver_pre_app += Get_Timing_Info( time_start );
 
 //                if ( control->cm_solver_pre_comp_type == DIAG_PC )
 //                {
-                    /* apply modified Gram-Schmidt to orthogonalize the new residual */
+                /* apply modified Gram-Schmidt to orthogonalize the new residual */
+                time_start = Get_Time( );
+                for ( i = 0; i <= j; i++ )
+                {
+                    ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N );
+
 #ifdef _OPENMP
-                    #pragma omp master
+                    #pragma omp single
 #endif
                     {
-                        time_start = Get_Time( );
+                        workspace->h[i][j] = ret_temp;
                     }
-                    for ( i = 0; i <= j; i++ )
-                    {
-                        ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N );
 
-#ifdef _OPENMP
-                        #pragma omp single
-#endif
-                        {
-                            workspace->h[i][j] = ret_temp;
-                        }
+                    Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N );
 
-                        Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N );
-
-                    }
-#ifdef _OPENMP
-                    #pragma omp master
-#endif
-                    {
-                        data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
-                    }
+                }
+                data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
 //                }
 //                else
 //                {
 //                    //TODO: investigate correctness of not explicitly orthogonalizing first few vectors
 //                    /* apply modified Gram-Schmidt to orthogonalize the new residual */
-//#ifdef _OPENMP
-//                    #pragma omp master
-//#endif
+//
+//
+//
 //                    {
 //                        time_start = Get_Time( );
 //                    }
-//#ifdef _OPENMP
+//
 //                    #pragma omp single
-//#endif
+//
 //                    {
 //                        for ( i = 0; i < j - 1; i++ )
 //                        {
@@ -1514,30 +3146,26 @@ int GMRES( const static_storage * const workspace, const control_params * const
 //                    for ( i = MAX(j - 1, 0); i <= j; i++ )
 //                    {
 //                        ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N );
-//#ifdef _OPENMP
+//
 //                        #pragma omp single
-//#endif
+//
 //                        {
 //                            workspace->h[i][j] = ret_temp;
 //                        }
 //
 //                        Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N );
 //                    }
-//#ifdef _OPENMP
-//                    #pragma omp master
-//#endif
+//
+//
+//
 //                    {
 //                        data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
 //                    }
 //                }
 
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    time_start = Get_Time( );
-                }
+                time_start = Get_Time( );
                 ret_temp = Norm( workspace->v[j + 1], N );
+
 #ifdef _OPENMP
                 #pragma omp single
 #endif
@@ -1546,41 +3174,35 @@ int GMRES( const static_storage * const workspace, const control_params * const
                 }
 
                 Vector_Scale( workspace->v[j + 1],
-                        1.0 / workspace->h[j + 1][j], workspace->v[j + 1], N );
-
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
-                }
+                              1.0 / workspace->h[j + 1][j], workspace->v[j + 1], N );
+                data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
 
-#ifdef _OPENMP
-                #pragma omp master
-#endif
-                {
-                    time_start = Get_Time( );
+                time_start = Get_Time( );
 //                    if ( control->cm_solver_pre_comp_type == NONE_PC ||
 //                            control->cm_solver_pre_comp_type == DIAG_PC )
 //                    {
-                        /* Givens rotations on the upper-Hessenberg matrix to make it U */
-                        for ( i = 0; i <= j; i++ )
+                /* Givens rotations on the upper-Hessenberg matrix to make it U */
+#ifdef _OPENMP
+                #pragma omp single
+#endif
+                {
+                    for ( i = 0; i <= j; i++ )
+                    {
+                        if ( i == j )
                         {
-                            if ( i == j )
-                            {
-                                cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) );
-                                workspace->hc[j] = workspace->h[j][j] / cc;
-                                workspace->hs[j] = workspace->h[j + 1][j] / cc;
-                            }
-
-                            tmp1 =  workspace->hc[i] * workspace->h[i][j] +
+                            cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) );
+                            workspace->hc[j] = workspace->h[j][j] / cc;
+                            workspace->hs[j] = workspace->h[j + 1][j] / cc;
+                        }
+
+                        tmp1 =  workspace->hc[i] * workspace->h[i][j] +
                                 workspace->hs[i] * workspace->h[i + 1][j];
-                            tmp2 = -workspace->hs[i] * workspace->h[i][j] +
-                                workspace->hc[i] * workspace->h[i + 1][j];
+                        tmp2 = -workspace->hs[i] * workspace->h[i][j] +
+                               workspace->hc[i] * workspace->h[i + 1][j];
 
-                            workspace->h[i][j] = tmp1;
-                            workspace->h[i + 1][j] = tmp2;
-                        }
+                        workspace->h[i][j] = tmp1;
+                        workspace->h[i + 1][j] = tmp2;
+                    }
 //                    }
 //                    else
 //                    {
@@ -1611,8 +3233,8 @@ int GMRES( const static_storage * const workspace, const control_params * const
                     workspace->g[j] = tmp1;
                     workspace->g[j + 1] = tmp2;
 
-                    data->timing.cm_solver_orthog += Get_Timing_Info( time_start );
                 }
+                data->timing.cm_solver_orthog += Get_Timing_Info( time_start );
 
 #ifdef _OPENMP
                 #pragma omp barrier
@@ -1620,12 +3242,11 @@ int GMRES( const static_storage * const workspace, const control_params * const
             }
 
             /* solve Hy = g: H is now upper-triangular, do back-substitution */
+            time_start = Get_Time( );
 #ifdef _OPENMP
-            #pragma omp master
+            #pragma omp single
 #endif
             {
-                time_start = Get_Time( );
-
                 for ( i = j - 1; i >= 0; i-- )
                 {
                     temp = workspace->g[i];
@@ -1636,13 +3257,11 @@ int GMRES( const static_storage * const workspace, const control_params * const
 
                     workspace->y[i] = temp / workspace->h[i][i];
                 }
-
-                data->timing.cm_solver_tri_solve += Get_Timing_Info( time_start );
-
-                /* update x = x_0 + Vy */
-                time_start = Get_Time( );
             }
+            data->timing.cm_solver_tri_solve += Get_Timing_Info( time_start );
 
+            /* update x = x_0 + Vy */
+            time_start = Get_Time( );
             Vector_MakeZero( workspace->p, N );
 
             for ( i = 0; i < j; i++ )
@@ -1651,13 +3270,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
             }
 
             Vector_Add( x, 1., workspace->p, N );
-
-#ifdef _OPENMP
-            #pragma omp master
-#endif
-            {
-                data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
-            }
+            data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
 
             /* stopping condition */
             if ( FABS(workspace->g[j]) / bnorm <= tol )
@@ -1667,11 +3280,11 @@ int GMRES( const static_storage * const workspace, const control_params * const
         }
 
 #ifdef _OPENMP
-        #pragma omp master
+        #pragma omp single
 #endif
         {
-            g_itr = itr;
             g_j = j;
+            g_itr = itr;
         }
     }
 
@@ -1686,9 +3299,9 @@ int GMRES( const static_storage * const workspace, const control_params * const
 
 
 int GMRES_HouseHolder( const static_storage * const workspace,
-        const control_params * const control, simulation_data * const data,
-        const sparse_matrix * const H, const real * const b, real tol,
-        real * const x, const int fresh_pre )
+                       const control_params * const control, simulation_data * const data,
+                       const sparse_matrix * const H, const real * const b, real tol,
+                       real * const x, const int fresh_pre )
 {
     int i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
@@ -1896,7 +3509,7 @@ int CG( const static_storage * const workspace, const control_params * const con
 
 #ifdef _OPENMP
     #pragma omp parallel default(none) private(i, tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new) \
-        shared(itr, N, d, r, p, z)
+    shared(itr, N, d, r, p, z)
 #endif
     {
         b_norm = Norm( b, N );
@@ -1948,8 +3561,8 @@ int CG( const static_storage * const workspace, const control_params * const con
 
 /* Steepest Descent */
 int SDM( const static_storage * const workspace, const control_params * const control,
-        const sparse_matrix * const H, const real * const b, const real tol,
-        real * const x, const int fresh_pre )
+         const sparse_matrix * const H, const real * const b, const real tol,
+         real * const x, const int fresh_pre )
 {
     int i, itr, N;
     real tmp, alpha, b_norm;
@@ -1959,13 +3572,13 @@ int SDM( const static_storage * const workspace, const control_params * const co
 
 #ifdef _OPENMP
     #pragma omp parallel default(none) private(i, tmp, alpha, b_norm, sig) \
-        shared(itr, N)
+    shared(itr, N)
 #endif
     {
         b_norm = Norm( b, N );
 
         Sparse_MatVec( H, x, workspace->q );
-        Vector_Sum( workspace->r , 1.0,  b, -1.0, workspace->q, N );
+        Vector_Sum( workspace->r, 1.0,  b, -1.0, workspace->q, N );
 
         apply_preconditioner( workspace, control, workspace->r, workspace->d, fresh_pre );
 
diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h
index 3946f60a..6caf7a1a 100644
--- a/sPuReMD/src/lin_alg.h
+++ b/sPuReMD/src/lin_alg.h
@@ -30,6 +30,41 @@ typedef enum
     UPPER = 1,
 } TRIANGULARITY;
 
+void Sort_Matrix_Rows( sparse_matrix * const );
+
+void Setup_Sparsity_Pattern( const sparse_matrix * const, 
+        const real, sparse_matrix ** );
+
+int Estimate_LU_Fill( const sparse_matrix * const, const real * const );
+
+void Calculate_Droptol( const sparse_matrix * const,
+        real * const, const real );
+
+#if defined(HAVE_SUPERLU_MT)
+real SuperLU_Factorize( const sparse_matrix * const,
+        sparse_matrix * const, sparse_matrix * const );
+#endif
+
+real diag_pre_comp( const sparse_matrix * const, real * const );
+
+real ICHOLT( const sparse_matrix * const, const real * const,
+        sparse_matrix * const, sparse_matrix * const );
+
+#if defined(TESTING)
+real ICHOL_PAR( const sparse_matrix * const, const unsigned int,
+        sparse_matrix * const, sparse_matrix * const );
+#endif
+
+real ILU_PAR( const sparse_matrix * const, const unsigned int,
+        sparse_matrix * const, sparse_matrix * const );
+
+real ILUT_PAR( const sparse_matrix * const, const real *,
+        const unsigned int, sparse_matrix * const, sparse_matrix * const );
+
+#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
+real Sparse_Approx_Inverse( const sparse_matrix * const, const sparse_matrix * const,
+        sparse_matrix ** );
+#endif
 
 void Transpose( const sparse_matrix * const, sparse_matrix * const );
 
diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h
index f75ad168..51f24d40 100644
--- a/sPuReMD/src/mytypes.h
+++ b/sPuReMD/src/mytypes.h
@@ -233,6 +233,7 @@ enum pre_comp
     ILU_PAR_PC = 3,
     ILUT_PAR_PC = 4,
     ILU_SUPERLU_MT_PC = 5,
+    SAI_PC = 6,
 };
 
 enum pre_app
@@ -606,6 +607,7 @@ typedef struct
     unsigned int cm_solver_pre_comp_refactor;
     real cm_solver_pre_comp_droptol;
     unsigned int cm_solver_pre_comp_sweeps;
+    real cm_solver_pre_comp_sai_thres;
     unsigned int cm_solver_pre_app_type;
     unsigned int cm_solver_pre_app_jacobi_iters;
 
@@ -889,6 +891,8 @@ typedef struct
     /* charge method storage */
     sparse_matrix *H;
     sparse_matrix *H_sp;
+    sparse_matrix *H_spar_patt;
+    sparse_matrix *H_app_inv;
     sparse_matrix *L;
     sparse_matrix *U;
     real *droptol;
diff --git a/sPuReMD/src/print_utils.c b/sPuReMD/src/print_utils.c
index 1ef0a527..383fdb89 100644
--- a/sPuReMD/src/print_utils.c
+++ b/sPuReMD/src/print_utils.c
@@ -406,7 +406,7 @@ void Print_Near_Neighbors( reax_system *system, control_params *control,
     FILE *fout;
     reax_list *near_nbrs = &((*lists)[NEAR_NBRS]);
 
-    sprintf( fname, "%s.near_nbrs", control->sim_name );
+    snprintf( fname, MAX_STR, "%s.near_nbrs", control->sim_name );
     fout = fopen( fname, "w" );
 
     for ( i = 0; i < system->N; ++i )
@@ -440,7 +440,7 @@ void Print_Near_Neighbors2( reax_system *system, control_params *control,
     FILE *fout;
     reax_list *near_nbrs = &((*lists)[NEAR_NBRS]);
 
-    sprintf( fname, "%s.near_nbrs_lgj", control->sim_name );
+    snprintf( fname, MAX_STR, "%s.near_nbrs_lgj", control->sim_name );
     fout = fopen( fname, "w" );
 
     for ( i = 0; i < system->N; ++i )
@@ -475,7 +475,7 @@ void Print_Far_Neighbors( reax_system *system, control_params *control,
     FILE *fout;
     reax_list *far_nbrs = &((*lists)[FAR_NBRS]);
 
-    sprintf( fname, "%s.far_nbrs", control->sim_name );
+    snprintf( fname, MAX_STR, "%s.far_nbrs", control->sim_name );
     fout = fopen( fname, "w" );
 
     for ( i = 0; i < system->N; ++i )
@@ -520,7 +520,7 @@ void Print_Far_Neighbors2( reax_system *system, control_params *control,
     FILE *fout;
     reax_list *far_nbrs = &((*lists)[FAR_NBRS]);
 
-    sprintf( fname, "%s.far_nbrs_lgj", control->sim_name );
+    snprintf( fname, MAX_STR, "%s.far_nbrs_lgj", control->sim_name );
     fout = fopen( fname, "w" );
     int num = 0;
     int temp[500];
@@ -553,7 +553,7 @@ void Print_Total_Force( reax_system *system, control_params *control,
     int i;
 #if !defined(TEST_FORCES)
     char temp[1000];
-    sprintf( temp, "%s.ftot", control->sim_name );
+    snprintf( temp, 1000, "%s.ftot", control->sim_name );
     out_control->ftot = fopen( temp, "w" );
 #endif
 
@@ -735,7 +735,7 @@ void Print_Linear_System( reax_system *system, control_params *control,
     sparse_matrix *H;
     FILE *out;
 
-    sprintf( fname, "%s.state%d.out", control->sim_name, step );
+    snprintf( fname, 100, "%s.state%d.out", control->sim_name, step );
     out = fopen( fname, "w" );
 
     for ( i = 0; i < system->N_cm; i++ )
@@ -747,13 +747,13 @@ void Print_Linear_System( reax_system *system, control_params *control,
                  workspace->t[0][i], workspace->b_t[i]  );
     fclose( out );
 
-    // sprintf( fname, "x2_%d", step );
+    // snprintf( fname, 100, "x2_%d", step );
     // out = fopen( fname, "w" );
     // for( i = 0; i < system->N; i++ )
     // fprintf( out, "%g\n", workspace->s_t[i+system->N] );
     // fclose( out );
 
-    sprintf( fname, "%s.H%d.out", control->sim_name, step );
+    snprintf( fname, 100, "%s.H%d.out", control->sim_name, step );
     out = fopen( fname, "w" );
     H = workspace->H;
 
@@ -776,7 +776,7 @@ void Print_Linear_System( reax_system *system, control_params *control,
 
     fclose( out );
 
-    sprintf( fname, "%s.H_sp%d.out", control->sim_name, step );
+    snprintf( fname, 100, "%s.H_sp%d.out", control->sim_name, step );
     out = fopen( fname, "w" );
     H = workspace->H_sp;
 
@@ -799,13 +799,13 @@ void Print_Linear_System( reax_system *system, control_params *control,
 
     fclose( out );
 
-    /*sprintf( fname, "%s.b_s%d", control->sim_name, step );
+    /*snprintf( fname, 100, "%s.b_s%d", control->sim_name, step );
       out = fopen( fname, "w" );
       for( i = 0; i < system->N; i++ )
       fprintf( out, "%12.7f\n", workspace->b_s[i] );
       fclose( out );
 
-      sprintf( fname, "%s.b_t%d", control->sim_name, step );
+      snprintf( fname, 100, "%s.b_t%d", control->sim_name, step );
       out = fopen( fname, "w" );
       for( i = 0; i < system->N; i++ )
       fprintf( out, "%12.7f\n", workspace->b_t[i] );
@@ -820,7 +820,7 @@ void Print_Charges( reax_system *system, control_params *control,
     char fname[100];
     FILE *fout;
 
-    sprintf( fname, "%s.q%d", control->sim_name, step );
+    snprintf( fname, 100, "%s.q%d", control->sim_name, step );
     fout = fopen( fname, "w" );
 
     for ( i = 0; i < system->N; ++i )
@@ -1002,7 +1002,7 @@ Print_XYZ_Serial( reax_system* system, static_storage *workspace )
     FILE *fout;
     int i;
 
-    sprintf( fname, "READ_PDB.0" );
+    snprintf( fname, 100, "READ_PDB.0" );
     fout = fopen( fname, "w" );
 
     for ( i = 0; i < system->N; i++ )
diff --git a/sPuReMD/src/restart.c b/sPuReMD/src/restart.c
index c15d51c5..0f7b1c5b 100644
--- a/sPuReMD/src/restart.c
+++ b/sPuReMD/src/restart.c
@@ -23,8 +23,9 @@
 #include "box.h"
 #include "vector.h"
 
+
 void Write_Binary_Restart( reax_system *system, control_params *control,
-                           simulation_data *data, static_storage *workspace )
+        simulation_data *data, static_storage *workspace )
 {
     int  i;
     char fname[MAX_STR];
@@ -33,7 +34,7 @@ void Write_Binary_Restart( reax_system *system, control_params *control,
     restart_header res_header;
     restart_atom res_data;
 
-    sprintf( fname, "%s.res%d", control->sim_name, data->step );
+    snprintf( fname, MAX_STR + 8, "%s.res%d", control->sim_name, data->step );
     fres = fopen( fname, "wb" );
 
     res_header.step = data->step;
@@ -139,14 +140,14 @@ void Read_Binary_Restart( char *fname, reax_system *system,
 
 
 void Write_ASCII_Restart( reax_system *system, control_params *control,
-                          simulation_data *data, static_storage *workspace )
+        simulation_data *data, static_storage *workspace )
 {
     int  i;
     char fname[MAX_STR];
     FILE *fres;
     reax_atom *p_atom;
 
-    sprintf( fname, "%s.res%d", control->sim_name, data->step );
+    snprintf( fname, MAX_STR + 8, "%s.res%d", control->sim_name, data->step );
     fres = fopen( fname, "w" );
 
     fprintf( fres, RESTART_HEADER,
diff --git a/sPuReMD/src/traj.c b/sPuReMD/src/traj.c
index bfdd2fcf..14e2a36b 100644
--- a/sPuReMD/src/traj.c
+++ b/sPuReMD/src/traj.c
@@ -31,13 +31,15 @@
 int Write_Custom_Header( reax_system *system, control_params *control,
         static_storage *workspace, output_controls *out_control )
 {
+#define SIZE1 (2048)
+#define SIZE2 (100)
     int i, header_len, control_block_len, frame_format_len;
     // char buffer[2048];
-    char control_block[2048];
-    char frame_format[2048];
-    char atom_format[100], bond_format[100], angle_format[100];
+    char control_block[SIZE1];
+    char frame_format[SIZE1];
+    char atom_format[SIZE2], bond_format[SIZE2], angle_format[SIZE2];
 
-    sprintf( control_block, CONTROL_BLOCK,
+    snprintf( control_block, SIZE1, CONTROL_BLOCK,
              system->N,
              control->restart,
              control->restart_from,
@@ -80,23 +82,23 @@ int Write_Custom_Header( reax_system *system, control_params *control,
     control_block_len = strlen( control_block );
 
 
-    sprintf( frame_format, "Frame Format: %d\n%s\n%s\n",
+    snprintf( frame_format, SIZE1, "Frame Format: %d\n%s\n%s\n",
              NUM_FRAME_GLOBALS, FRAME_GLOBALS_FORMAT, FRAME_GLOBAL_NAMES );
 
     atom_format[0] = OPT_NOATOM;
     switch ( out_control->atom_format )
     {
     case OPT_ATOM_BASIC:
-        sprintf( atom_format, "Atom_Basic: %s", ATOM_BASIC );
+        snprintf( atom_format, SIZE2, "Atom_Basic: %s", ATOM_BASIC );
         break;
     case OPT_ATOM_wF:
-        sprintf( atom_format, "Atom_wF: %s", ATOM_wF );
+        snprintf( atom_format, SIZE2, "Atom_wF: %s", ATOM_wF );
         break;
     case OPT_ATOM_wV:
-        sprintf( atom_format, "Atom_wV: %s", ATOM_wV );
+        snprintf( atom_format, SIZE2, "Atom_wV: %s", ATOM_wV );
         break;
     case OPT_ATOM_FULL:
-        sprintf( atom_format, "Atom_Full: %s", ATOM_FULL );
+        snprintf( atom_format, SIZE2, "Atom_Full: %s", ATOM_FULL );
         break;
     default:
         break;
@@ -106,18 +108,18 @@ int Write_Custom_Header( reax_system *system, control_params *control,
     bond_format[0] = OPT_NOBOND;
     if ( out_control->bond_info == OPT_BOND_BASIC )
     {
-        sprintf( bond_format, "Bond_Line: %s", BOND_BASIC );
+        snprintf( bond_format, SIZE2, "Bond_Line: %s", BOND_BASIC );
     }
     else if ( out_control->bond_info == OPT_BOND_FULL )
     {
-        sprintf( bond_format, "Bond_Line_Full: %s", BOND_FULL );
+        snprintf( bond_format, SIZE2, "Bond_Line_Full: %s", BOND_FULL );
     }
     strcat( frame_format, bond_format );
 
     angle_format[0] = OPT_NOANGLE;
     if ( out_control->angle_info == OPT_ANGLE_BASIC )
     {
-        sprintf( angle_format, "Angle_Line: %s", ANGLE_BASIC );
+        snprintf( angle_format, SIZE2, "Angle_Line: %s", ANGLE_BASIC );
     }
     strcat( frame_format, angle_format );
 
@@ -158,6 +160,9 @@ int Write_Custom_Header( reax_system *system, control_params *control,
 
     fflush( out_control->trj );
 
+#undef SIZE2
+#undef SIZE1
+
     return 0;
 }
 
@@ -166,12 +171,13 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
         simulation_data *data, static_storage *workspace,
         reax_list **lists, output_controls *out_control )
 {
+#define SIZE (2048)
     int i, j, pi, pk, pk_j;
     int write_atoms, write_bonds, write_angles;
     int frame_len, atom_line_len, bond_line_len, angle_line_len, rest_of_frame_len;
     int frame_globals_len, num_bonds, num_thb_intrs;
     real P;
-    char buffer[2048];
+    char buffer[SIZE];
     reax_list *bonds = (*lists) + BONDS;
     reax_list *thb_intrs =  (*lists) + THREE_BODIES;
     bond_data *bo_ij;
@@ -290,7 +296,7 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
     }
 
     /* calculate total frame length*/
-    sprintf( buffer, FRAME_GLOBALS,
+    snprintf( buffer, SIZE, FRAME_GLOBALS,
              data->step, data->time,
              data->E_Tot, data->E_Pot, E_CONV * data->E_Kin, data->therm.T,
              P, system->box.volume,
@@ -476,6 +482,8 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
 
     fflush( out_control->trj );
 
+#undef SIZE
+
     return 0;
 }
 
-- 
GitLab