From 53be74a64e3f033bfa0d86655951f24816aec56b Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnk@msu.edu> Date: Wed, 17 Jan 2018 11:34:35 -0500 Subject: [PATCH] sPuReMD: implement sparse approximate inverse preconditioning. --- .gitignore | 10 +- ar-lib | 270 ----- depcomp | 791 ------------- environ/param.gpu.water | 1 + sPuReMD/configure.ac | 39 +- sPuReMD/src/analyze.c | 27 +- sPuReMD/src/charges.c | 1484 ++---------------------- sPuReMD/src/control.c | 6 + sPuReMD/src/geo_tools.c | 4 +- sPuReMD/src/init_md.c | 16 +- sPuReMD/src/lin_alg.c | 2235 +++++++++++++++++++++++++++++++------ sPuReMD/src/lin_alg.h | 35 + sPuReMD/src/mytypes.h | 4 + sPuReMD/src/print_utils.c | 26 +- sPuReMD/src/restart.c | 9 +- sPuReMD/src/traj.c | 36 +- 16 files changed, 2167 insertions(+), 2826 deletions(-) delete mode 100755 ar-lib delete mode 100755 depcomp diff --git a/.gitignore b/.gitignore index 0e82e4b2..71b2700c 100644 --- a/.gitignore +++ b/.gitignore @@ -33,10 +33,12 @@ tags .~lock.* # Autotools -aclocal.m4 -autom4te.cache .deps .dirstamp +aclocal.m4 +autom4te.cache +ar-lib +compile confdefs.h config.log config.guess @@ -45,9 +47,13 @@ config.h.in config.status config.sub configure +depcomp +install-sh libtool +ltmain.sh Makefile Makefile.in +missing stamp-h1 test-driver *.lo diff --git a/ar-lib b/ar-lib deleted file mode 100755 index 463b9ec0..00000000 --- a/ar-lib +++ /dev/null @@ -1,270 +0,0 @@ -#! /bin/sh -# Wrapper for Microsoft lib.exe - -me=ar-lib -scriptversion=2012-03-01.08; # UTC - -# Copyright (C) 2010-2014 Free Software Foundation, Inc. -# Written by Peter Rosin <peda@lysator.liu.se>. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -# This file is maintained in Automake, please report -# bugs to <bug-automake@gnu.org> or send patches to -# <automake-patches@gnu.org>. - - -# func_error message -func_error () -{ - echo "$me: $1" 1>&2 - exit 1 -} - -file_conv= - -# func_file_conv build_file -# Convert a $build file to $host form and store it in $file -# Currently only supports Windows hosts. -func_file_conv () -{ - file=$1 - case $file in - / | /[!/]*) # absolute file, and not a UNC file - if test -z "$file_conv"; then - # lazily determine how to convert abs files - case `uname -s` in - MINGW*) - file_conv=mingw - ;; - CYGWIN*) - file_conv=cygwin - ;; - *) - file_conv=wine - ;; - esac - fi - case $file_conv in - mingw) - file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` - ;; - cygwin) - file=`cygpath -m "$file" || echo "$file"` - ;; - wine) - file=`winepath -w "$file" || echo "$file"` - ;; - esac - ;; - esac -} - -# func_at_file at_file operation archive -# Iterate over all members in AT_FILE performing OPERATION on ARCHIVE -# for each of them. -# When interpreting the content of the @FILE, do NOT use func_file_conv, -# since the user would need to supply preconverted file names to -# binutils ar, at least for MinGW. -func_at_file () -{ - operation=$2 - archive=$3 - at_file_contents=`cat "$1"` - eval set x "$at_file_contents" - shift - - for member - do - $AR -NOLOGO $operation:"$member" "$archive" || exit $? - done -} - -case $1 in - '') - func_error "no command. Try '$0 --help' for more information." - ;; - -h | --h*) - cat <<EOF -Usage: $me [--help] [--version] PROGRAM ACTION ARCHIVE [MEMBER...] - -Members may be specified in a file named with @FILE. -EOF - exit $? - ;; - -v | --v*) - echo "$me, version $scriptversion" - exit $? - ;; -esac - -if test $# -lt 3; then - func_error "you must specify a program, an action and an archive" -fi - -AR=$1 -shift -while : -do - if test $# -lt 2; then - func_error "you must specify a program, an action and an archive" - fi - case $1 in - -lib | -LIB \ - | -ltcg | -LTCG \ - | -machine* | -MACHINE* \ - | -subsystem* | -SUBSYSTEM* \ - | -verbose | -VERBOSE \ - | -wx* | -WX* ) - AR="$AR $1" - shift - ;; - *) - action=$1 - shift - break - ;; - esac -done -orig_archive=$1 -shift -func_file_conv "$orig_archive" -archive=$file - -# strip leading dash in $action -action=${action#-} - -delete= -extract= -list= -quick= -replace= -index= -create= - -while test -n "$action" -do - case $action in - d*) delete=yes ;; - x*) extract=yes ;; - t*) list=yes ;; - q*) quick=yes ;; - r*) replace=yes ;; - s*) index=yes ;; - S*) ;; # the index is always updated implicitly - c*) create=yes ;; - u*) ;; # TODO: don't ignore the update modifier - v*) ;; # TODO: don't ignore the verbose modifier - *) - func_error "unknown action specified" - ;; - esac - action=${action#?} -done - -case $delete$extract$list$quick$replace,$index in - yes,* | ,yes) - ;; - yesyes*) - func_error "more than one action specified" - ;; - *) - func_error "no action specified" - ;; -esac - -if test -n "$delete"; then - if test ! -f "$orig_archive"; then - func_error "archive not found" - fi - for member - do - case $1 in - @*) - func_at_file "${1#@}" -REMOVE "$archive" - ;; - *) - func_file_conv "$1" - $AR -NOLOGO -REMOVE:"$file" "$archive" || exit $? - ;; - esac - done - -elif test -n "$extract"; then - if test ! -f "$orig_archive"; then - func_error "archive not found" - fi - if test $# -gt 0; then - for member - do - case $1 in - @*) - func_at_file "${1#@}" -EXTRACT "$archive" - ;; - *) - func_file_conv "$1" - $AR -NOLOGO -EXTRACT:"$file" "$archive" || exit $? - ;; - esac - done - else - $AR -NOLOGO -LIST "$archive" | sed -e 's/\\/\\\\/g' | while read member - do - $AR -NOLOGO -EXTRACT:"$member" "$archive" || exit $? - done - fi - -elif test -n "$quick$replace"; then - if test ! -f "$orig_archive"; then - if test -z "$create"; then - echo "$me: creating $orig_archive" - fi - orig_archive= - else - orig_archive=$archive - fi - - for member - do - case $1 in - @*) - func_file_conv "${1#@}" - set x "$@" "@$file" - ;; - *) - func_file_conv "$1" - set x "$@" "$file" - ;; - esac - shift - shift - done - - if test -n "$orig_archive"; then - $AR -NOLOGO -OUT:"$archive" "$orig_archive" "$@" || exit $? - else - $AR -NOLOGO -OUT:"$archive" "$@" || exit $? - fi - -elif test -n "$list"; then - if test ! -f "$orig_archive"; then - func_error "archive not found" - fi - $AR -NOLOGO -LIST "$archive" || exit $? -fi diff --git a/depcomp b/depcomp deleted file mode 100755 index fc98710e..00000000 --- a/depcomp +++ /dev/null @@ -1,791 +0,0 @@ -#! /bin/sh -# depcomp - compile a program generating dependencies as side-effects - -scriptversion=2013-05-30.07; # UTC - -# Copyright (C) 1999-2014 Free Software Foundation, Inc. - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>. - -case $1 in - '') - echo "$0: No command. Try '$0 --help' for more information." 1>&2 - exit 1; - ;; - -h | --h*) - cat <<\EOF -Usage: depcomp [--help] [--version] PROGRAM [ARGS] - -Run PROGRAMS ARGS to compile a file, generating dependencies -as side-effects. - -Environment variables: - depmode Dependency tracking mode. - source Source file read by 'PROGRAMS ARGS'. - object Object file output by 'PROGRAMS ARGS'. - DEPDIR directory where to store dependencies. - depfile Dependency file to output. - tmpdepfile Temporary file to use when outputting dependencies. - libtool Whether libtool is used (yes/no). - -Report bugs to <bug-automake@gnu.org>. -EOF - exit $? - ;; - -v | --v*) - echo "depcomp $scriptversion" - exit $? - ;; -esac - -# Get the directory component of the given path, and save it in the -# global variables '$dir'. Note that this directory component will -# be either empty or ending with a '/' character. This is deliberate. -set_dir_from () -{ - case $1 in - */*) dir=`echo "$1" | sed -e 's|/[^/]*$|/|'`;; - *) dir=;; - esac -} - -# Get the suffix-stripped basename of the given path, and save it the -# global variable '$base'. -set_base_from () -{ - base=`echo "$1" | sed -e 's|^.*/||' -e 's/\.[^.]*$//'` -} - -# If no dependency file was actually created by the compiler invocation, -# we still have to create a dummy depfile, to avoid errors with the -# Makefile "include basename.Plo" scheme. -make_dummy_depfile () -{ - echo "#dummy" > "$depfile" -} - -# Factor out some common post-processing of the generated depfile. -# Requires the auxiliary global variable '$tmpdepfile' to be set. -aix_post_process_depfile () -{ - # If the compiler actually managed to produce a dependency file, - # post-process it. - if test -f "$tmpdepfile"; then - # Each line is of the form 'foo.o: dependency.h'. - # Do two passes, one to just change these to - # $object: dependency.h - # and one to simply output - # dependency.h: - # which is needed to avoid the deleted-header problem. - { sed -e "s,^.*\.[$lower]*:,$object:," < "$tmpdepfile" - sed -e "s,^.*\.[$lower]*:[$tab ]*,," -e 's,$,:,' < "$tmpdepfile" - } > "$depfile" - rm -f "$tmpdepfile" - else - make_dummy_depfile - fi -} - -# A tabulation character. -tab=' ' -# A newline character. -nl=' -' -# Character ranges might be problematic outside the C locale. -# These definitions help. -upper=ABCDEFGHIJKLMNOPQRSTUVWXYZ -lower=abcdefghijklmnopqrstuvwxyz -digits=0123456789 -alpha=${upper}${lower} - -if test -z "$depmode" || test -z "$source" || test -z "$object"; then - echo "depcomp: Variables source, object and depmode must be set" 1>&2 - exit 1 -fi - -# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po. -depfile=${depfile-`echo "$object" | - sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`} -tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`} - -rm -f "$tmpdepfile" - -# Avoid interferences from the environment. -gccflag= dashmflag= - -# Some modes work just like other modes, but use different flags. We -# parameterize here, but still list the modes in the big case below, -# to make depend.m4 easier to write. Note that we *cannot* use a case -# here, because this file can only contain one case statement. -if test "$depmode" = hp; then - # HP compiler uses -M and no extra arg. - gccflag=-M - depmode=gcc -fi - -if test "$depmode" = dashXmstdout; then - # This is just like dashmstdout with a different argument. - dashmflag=-xM - depmode=dashmstdout -fi - -cygpath_u="cygpath -u -f -" -if test "$depmode" = msvcmsys; then - # This is just like msvisualcpp but w/o cygpath translation. - # Just convert the backslash-escaped backslashes to single forward - # slashes to satisfy depend.m4 - cygpath_u='sed s,\\\\,/,g' - depmode=msvisualcpp -fi - -if test "$depmode" = msvc7msys; then - # This is just like msvc7 but w/o cygpath translation. - # Just convert the backslash-escaped backslashes to single forward - # slashes to satisfy depend.m4 - cygpath_u='sed s,\\\\,/,g' - depmode=msvc7 -fi - -if test "$depmode" = xlc; then - # IBM C/C++ Compilers xlc/xlC can output gcc-like dependency information. - gccflag=-qmakedep=gcc,-MF - depmode=gcc -fi - -case "$depmode" in -gcc3) -## gcc 3 implements dependency tracking that does exactly what -## we want. Yay! Note: for some reason libtool 1.4 doesn't like -## it if -MD -MP comes after the -MF stuff. Hmm. -## Unfortunately, FreeBSD c89 acceptance of flags depends upon -## the command line argument order; so add the flags where they -## appear in depend2.am. Note that the slowdown incurred here -## affects only configure: in makefiles, %FASTDEP% shortcuts this. - for arg - do - case $arg in - -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;; - *) set fnord "$@" "$arg" ;; - esac - shift # fnord - shift # $arg - done - "$@" - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - mv "$tmpdepfile" "$depfile" - ;; - -gcc) -## Note that this doesn't just cater to obsosete pre-3.x GCC compilers. -## but also to in-use compilers like IMB xlc/xlC and the HP C compiler. -## (see the conditional assignment to $gccflag above). -## There are various ways to get dependency output from gcc. Here's -## why we pick this rather obscure method: -## - Don't want to use -MD because we'd like the dependencies to end -## up in a subdir. Having to rename by hand is ugly. -## (We might end up doing this anyway to support other compilers.) -## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like -## -MM, not -M (despite what the docs say). Also, it might not be -## supported by the other compilers which use the 'gcc' depmode. -## - Using -M directly means running the compiler twice (even worse -## than renaming). - if test -z "$gccflag"; then - gccflag=-MD, - fi - "$@" -Wp,"$gccflag$tmpdepfile" - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - echo "$object : \\" > "$depfile" - # The second -e expression handles DOS-style file names with drive - # letters. - sed -e 's/^[^:]*: / /' \ - -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile" -## This next piece of magic avoids the "deleted header file" problem. -## The problem is that when a header file which appears in a .P file -## is deleted, the dependency causes make to die (because there is -## typically no way to rebuild the header). We avoid this by adding -## dummy dependencies for each header file. Too bad gcc doesn't do -## this for us directly. -## Some versions of gcc put a space before the ':'. On the theory -## that the space means something, we add a space to the output as -## well. hp depmode also adds that space, but also prefixes the VPATH -## to the object. Take care to not repeat it in the output. -## Some versions of the HPUX 10.20 sed can't process this invocation -## correctly. Breaking it into two sed invocations is a workaround. - tr ' ' "$nl" < "$tmpdepfile" \ - | sed -e 's/^\\$//' -e '/^$/d' -e "s|.*$object$||" -e '/:$/d' \ - | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -hp) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -sgi) - if test "$libtool" = yes; then - "$@" "-Wp,-MDupdate,$tmpdepfile" - else - "$@" -MDupdate "$tmpdepfile" - fi - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - - if test -f "$tmpdepfile"; then # yes, the sourcefile depend on other files - echo "$object : \\" > "$depfile" - # Clip off the initial element (the dependent). Don't try to be - # clever and replace this with sed code, as IRIX sed won't handle - # lines with more than a fixed number of characters (4096 in - # IRIX 6.2 sed, 8192 in IRIX 6.5). We also remove comment lines; - # the IRIX cc adds comments like '#:fec' to the end of the - # dependency line. - tr ' ' "$nl" < "$tmpdepfile" \ - | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' \ - | tr "$nl" ' ' >> "$depfile" - echo >> "$depfile" - # The second pass generates a dummy entry for each header file. - tr ' ' "$nl" < "$tmpdepfile" \ - | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \ - >> "$depfile" - else - make_dummy_depfile - fi - rm -f "$tmpdepfile" - ;; - -xlc) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -aix) - # The C for AIX Compiler uses -M and outputs the dependencies - # in a .u file. In older versions, this file always lives in the - # current directory. Also, the AIX compiler puts '$object:' at the - # start of each line; $object doesn't have directory information. - # Version 6 uses the directory in both cases. - set_dir_from "$object" - set_base_from "$object" - if test "$libtool" = yes; then - tmpdepfile1=$dir$base.u - tmpdepfile2=$base.u - tmpdepfile3=$dir.libs/$base.u - "$@" -Wc,-M - else - tmpdepfile1=$dir$base.u - tmpdepfile2=$dir$base.u - tmpdepfile3=$dir$base.u - "$@" -M - fi - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - do - test -f "$tmpdepfile" && break - done - aix_post_process_depfile - ;; - -tcc) - # tcc (Tiny C Compiler) understand '-MD -MF file' since version 0.9.26 - # FIXME: That version still under development at the moment of writing. - # Make that this statement remains true also for stable, released - # versions. - # It will wrap lines (doesn't matter whether long or short) with a - # trailing '\', as in: - # - # foo.o : \ - # foo.c \ - # foo.h \ - # - # It will put a trailing '\' even on the last line, and will use leading - # spaces rather than leading tabs (at least since its commit 0394caf7 - # "Emit spaces for -MD"). - "$@" -MD -MF "$tmpdepfile" - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - # Each non-empty line is of the form 'foo.o : \' or ' dep.h \'. - # We have to change lines of the first kind to '$object: \'. - sed -e "s|.*:|$object :|" < "$tmpdepfile" > "$depfile" - # And for each line of the second kind, we have to emit a 'dep.h:' - # dummy dependency, to avoid the deleted-header problem. - sed -n -e 's|^ *\(.*\) *\\$|\1:|p' < "$tmpdepfile" >> "$depfile" - rm -f "$tmpdepfile" - ;; - -## The order of this option in the case statement is important, since the -## shell code in configure will try each of these formats in the order -## listed in this file. A plain '-MD' option would be understood by many -## compilers, so we must ensure this comes after the gcc and icc options. -pgcc) - # Portland's C compiler understands '-MD'. - # Will always output deps to 'file.d' where file is the root name of the - # source file under compilation, even if file resides in a subdirectory. - # The object file name does not affect the name of the '.d' file. - # pgcc 10.2 will output - # foo.o: sub/foo.c sub/foo.h - # and will wrap long lines using '\' : - # foo.o: sub/foo.c ... \ - # sub/foo.h ... \ - # ... - set_dir_from "$object" - # Use the source, not the object, to determine the base name, since - # that's sadly what pgcc will do too. - set_base_from "$source" - tmpdepfile=$base.d - - # For projects that build the same source file twice into different object - # files, the pgcc approach of using the *source* file root name can cause - # problems in parallel builds. Use a locking strategy to avoid stomping on - # the same $tmpdepfile. - lockdir=$base.d-lock - trap " - echo '$0: caught signal, cleaning up...' >&2 - rmdir '$lockdir' - exit 1 - " 1 2 13 15 - numtries=100 - i=$numtries - while test $i -gt 0; do - # mkdir is a portable test-and-set. - if mkdir "$lockdir" 2>/dev/null; then - # This process acquired the lock. - "$@" -MD - stat=$? - # Release the lock. - rmdir "$lockdir" - break - else - # If the lock is being held by a different process, wait - # until the winning process is done or we timeout. - while test -d "$lockdir" && test $i -gt 0; do - sleep 1 - i=`expr $i - 1` - done - fi - i=`expr $i - 1` - done - trap - 1 2 13 15 - if test $i -le 0; then - echo "$0: failed to acquire lock after $numtries attempts" >&2 - echo "$0: check lockdir '$lockdir'" >&2 - exit 1 - fi - - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - # Each line is of the form `foo.o: dependent.h', - # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'. - # Do two passes, one to just change these to - # `$object: dependent.h' and one to simply `dependent.h:'. - sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile" - # Some versions of the HPUX 10.20 sed can't process this invocation - # correctly. Breaking it into two sed invocations is a workaround. - sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" \ - | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -hp2) - # The "hp" stanza above does not work with aCC (C++) and HP's ia64 - # compilers, which have integrated preprocessors. The correct option - # to use with these is +Maked; it writes dependencies to a file named - # 'foo.d', which lands next to the object file, wherever that - # happens to be. - # Much of this is similar to the tru64 case; see comments there. - set_dir_from "$object" - set_base_from "$object" - if test "$libtool" = yes; then - tmpdepfile1=$dir$base.d - tmpdepfile2=$dir.libs/$base.d - "$@" -Wc,+Maked - else - tmpdepfile1=$dir$base.d - tmpdepfile2=$dir$base.d - "$@" +Maked - fi - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile1" "$tmpdepfile2" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" - do - test -f "$tmpdepfile" && break - done - if test -f "$tmpdepfile"; then - sed -e "s,^.*\.[$lower]*:,$object:," "$tmpdepfile" > "$depfile" - # Add 'dependent.h:' lines. - sed -ne '2,${ - s/^ *// - s/ \\*$// - s/$/:/ - p - }' "$tmpdepfile" >> "$depfile" - else - make_dummy_depfile - fi - rm -f "$tmpdepfile" "$tmpdepfile2" - ;; - -tru64) - # The Tru64 compiler uses -MD to generate dependencies as a side - # effect. 'cc -MD -o foo.o ...' puts the dependencies into 'foo.o.d'. - # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put - # dependencies in 'foo.d' instead, so we check for that too. - # Subdirectories are respected. - set_dir_from "$object" - set_base_from "$object" - - if test "$libtool" = yes; then - # Libtool generates 2 separate objects for the 2 libraries. These - # two compilations output dependencies in $dir.libs/$base.o.d and - # in $dir$base.o.d. We have to check for both files, because - # one of the two compilations can be disabled. We should prefer - # $dir$base.o.d over $dir.libs/$base.o.d because the latter is - # automatically cleaned when .libs/ is deleted, while ignoring - # the former would cause a distcleancheck panic. - tmpdepfile1=$dir$base.o.d # libtool 1.5 - tmpdepfile2=$dir.libs/$base.o.d # Likewise. - tmpdepfile3=$dir.libs/$base.d # Compaq CCC V6.2-504 - "$@" -Wc,-MD - else - tmpdepfile1=$dir$base.d - tmpdepfile2=$dir$base.d - tmpdepfile3=$dir$base.d - "$@" -MD - fi - - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - do - test -f "$tmpdepfile" && break - done - # Same post-processing that is required for AIX mode. - aix_post_process_depfile - ;; - -msvc7) - if test "$libtool" = yes; then - showIncludes=-Wc,-showIncludes - else - showIncludes=-showIncludes - fi - "$@" $showIncludes > "$tmpdepfile" - stat=$? - grep -v '^Note: including file: ' "$tmpdepfile" - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - echo "$object : \\" > "$depfile" - # The first sed program below extracts the file names and escapes - # backslashes for cygpath. The second sed program outputs the file - # name when reading, but also accumulates all include files in the - # hold buffer in order to output them again at the end. This only - # works with sed implementations that can handle large buffers. - sed < "$tmpdepfile" -n ' -/^Note: including file: *\(.*\)/ { - s//\1/ - s/\\/\\\\/g - p -}' | $cygpath_u | sort -u | sed -n ' -s/ /\\ /g -s/\(.*\)/'"$tab"'\1 \\/p -s/.\(.*\) \\/\1:/ -H -$ { - s/.*/'"$tab"'/ - G - p -}' >> "$depfile" - echo >> "$depfile" # make sure the fragment doesn't end with a backslash - rm -f "$tmpdepfile" - ;; - -msvc7msys) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -#nosideeffect) - # This comment above is used by automake to tell side-effect - # dependency tracking mechanisms from slower ones. - -dashmstdout) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout, regardless of -o. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - - # Remove '-o $object'. - IFS=" " - for arg - do - case $arg in - -o) - shift - ;; - $object) - shift - ;; - *) - set fnord "$@" "$arg" - shift # fnord - shift # $arg - ;; - esac - done - - test -z "$dashmflag" && dashmflag=-M - # Require at least two characters before searching for ':' - # in the target name. This is to cope with DOS-style filenames: - # a dependency such as 'c:/foo/bar' could be seen as target 'c' otherwise. - "$@" $dashmflag | - sed "s|^[$tab ]*[^:$tab ][^:][^:]*:[$tab ]*|$object: |" > "$tmpdepfile" - rm -f "$depfile" - cat < "$tmpdepfile" > "$depfile" - # Some versions of the HPUX 10.20 sed can't process this sed invocation - # correctly. Breaking it into two sed invocations is a workaround. - tr ' ' "$nl" < "$tmpdepfile" \ - | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \ - | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -dashXmstdout) - # This case only exists to satisfy depend.m4. It is never actually - # run, as this mode is specially recognized in the preamble. - exit 1 - ;; - -makedepend) - "$@" || exit $? - # Remove any Libtool call - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - # X makedepend - shift - cleared=no eat=no - for arg - do - case $cleared in - no) - set ""; shift - cleared=yes ;; - esac - if test $eat = yes; then - eat=no - continue - fi - case "$arg" in - -D*|-I*) - set fnord "$@" "$arg"; shift ;; - # Strip any option that makedepend may not understand. Remove - # the object too, otherwise makedepend will parse it as a source file. - -arch) - eat=yes ;; - -*|$object) - ;; - *) - set fnord "$@" "$arg"; shift ;; - esac - done - obj_suffix=`echo "$object" | sed 's/^.*\././'` - touch "$tmpdepfile" - ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@" - rm -f "$depfile" - # makedepend may prepend the VPATH from the source file name to the object. - # No need to regex-escape $object, excess matching of '.' is harmless. - sed "s|^.*\($object *:\)|\1|" "$tmpdepfile" > "$depfile" - # Some versions of the HPUX 10.20 sed can't process the last invocation - # correctly. Breaking it into two sed invocations is a workaround. - sed '1,2d' "$tmpdepfile" \ - | tr ' ' "$nl" \ - | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \ - | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" "$tmpdepfile".bak - ;; - -cpp) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - - # Remove '-o $object'. - IFS=" " - for arg - do - case $arg in - -o) - shift - ;; - $object) - shift - ;; - *) - set fnord "$@" "$arg" - shift # fnord - shift # $arg - ;; - esac - done - - "$@" -E \ - | sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ - -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ - | sed '$ s: \\$::' > "$tmpdepfile" - rm -f "$depfile" - echo "$object : \\" > "$depfile" - cat < "$tmpdepfile" >> "$depfile" - sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -msvisualcpp) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - - IFS=" " - for arg - do - case "$arg" in - -o) - shift - ;; - $object) - shift - ;; - "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI") - set fnord "$@" - shift - shift - ;; - *) - set fnord "$@" "$arg" - shift - shift - ;; - esac - done - "$@" -E 2>/dev/null | - sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::\1:p' | $cygpath_u | sort -u > "$tmpdepfile" - rm -f "$depfile" - echo "$object : \\" > "$depfile" - sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::'"$tab"'\1 \\:p' >> "$depfile" - echo "$tab" >> "$depfile" - sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::\1\::p' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -msvcmsys) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -none) - exec "$@" - ;; - -*) - echo "Unknown depmode $depmode" 1>&2 - exit 1 - ;; -esac - -exit 0 - -# Local Variables: -# mode: shell-script -# sh-indentation: 2 -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" -# time-stamp-end: "; # UTC" -# End: diff --git a/environ/param.gpu.water b/environ/param.gpu.water index eed406e7..2982676b 100644 --- a/environ/param.gpu.water +++ b/environ/param.gpu.water @@ -25,6 +25,7 @@ cm_solver_pre_comp_type 1 ! method used to compute precond cm_solver_pre_comp_refactor 1000 ! number of steps before recomputing preconditioner cm_solver_pre_comp_droptol 0.0 ! threshold tolerance for dropping values in preconditioner computation, if applicable cm_solver_pre_comp_sweeps 3 ! number of sweeps used to compute preconditioner (ILU_PAR) +cm_solver_pre_comp_sai_thres 0.1 ! ratio of charge matrix NNZ's used to compute preconditioner (SAI) cm_solver_pre_app_type 1 ! method used to apply preconditioner cm_solver_pre_app_jacobi_iters 50 ! number of Jacobi iterations used for applying precondition, if applicable diff --git a/sPuReMD/configure.ac b/sPuReMD/configure.ac index 1a59944f..ce330dc6 100644 --- a/sPuReMD/configure.ac +++ b/sPuReMD/configure.ac @@ -65,10 +65,10 @@ if test "x${BUILD_OPENMP}" = "xyes"; then AC_OPENMP if test "x${OPENMP_CFLAGS}" = "x"; then AC_MSG_WARN([ - ----------------------------------------------- - Unable to find OpenMP support on this system. - Building a single-threaded version. - -----------------------------------------------]) + ----------------------------------------------- + Unable to find OpenMP support on this system. + Building a single-threaded version. + -----------------------------------------------]) else # bug due to recent Intel compiler change (?) if test "x${ax_cv_c_compiler_vendor}" = "xintel"; then @@ -130,6 +130,37 @@ then CFLAGS="${CFLAGS} ${GPROF_FLAGS}" fi +# Check for LAPACKE +AC_CHECK_HEADERS([mkl.h], [MKL_FOUND_HEADERS="yes"]) +if test "x${MKL_FOUND_HEADERS}" = "xyes" +then + AC_SEARCH_LIBS([LAPACKE_dgels], [mkl_intel_ilp64], + [MKL_FOUND_LIBS="yes"], [MKL_FOUND_LIBS="no"], + [-lmkl_sequential -lmkl_core -lpthread -lm -ldl]) + AS_IF([test "x${MKL_FOUND_LIBS}" != "xyes"], + [AC_MSG_ERROR([Unable to find MKL LAPACKE library.])]) + LIBS="${LIBS} -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl" + AC_DEFINE([HAVE_LAPACKE_MKL], [1], [Define to 1 if you have MKL LAPACKE support enabled.]) +else + AC_CHECK_HEADERS([lapacke.h], [LAPACKE_FOUND_HEADERS="yes"]) + if test "x${LAPACKE_FOUND_HEADERS}" = "xyes" + then + AC_SEARCH_LIBS([LAPACKE_dgels], [lapacke], + [LAPACKE_FOUND_LIBS="yes"], [LAPACKE_FOUND_LIBS="no"], + [-llapack]) + AS_IF([test "x${LAPACKE_FOUND_LIBS}" != "xyes"], + [AC_MSG_ERROR([Unable to find LAPACKE library.])]) + LIBS="${LIBS} -llapack" + AC_DEFINE([HAVE_LAPACKE], [1], [Define to 1 if you have LAPACKE support enabled.]) + else + AC_MSG_WARN([ + ----------------------------------------------- + Unable to find LAPACKE on this system. + Disabling support for dependent methods. + -----------------------------------------------]) + fi +fi + # Tests using Google C++ testing framework (gtest) AC_LANG_PUSH([C++]) AC_PROG_CXX([icpc g++ clang++ CC]) diff --git a/sPuReMD/src/analyze.c b/sPuReMD/src/analyze.c index 4e2cbe2c..9fcd54f7 100644 --- a/sPuReMD/src/analyze.c +++ b/sPuReMD/src/analyze.c @@ -191,7 +191,8 @@ void Get_Molecule( int atom, molecule *m, int *mark, reax_system *system, } -void Print_Molecule( reax_system *system, molecule *m, int mode, char *s ) +void Print_Molecule( reax_system *system, molecule *m, int mode, char *s, + size_t size ) { int j, atom; @@ -202,7 +203,7 @@ void Print_Molecule( reax_system *system, molecule *m, int mode, char *s ) /* print molecule summary */ for ( j = 0; j < MAX_ATOM_TYPES; ++j ) if ( m->mtypes[j] ) - sprintf( s, "%s%s%d", s, system->reaxprm.sbp[j].name, m->mtypes[j] ); + snprintf( s, size, "%s%s%d", s, system->reaxprm.sbp[j].name, m->mtypes[j] ); } else if ( mode == 2 ) { @@ -210,7 +211,7 @@ void Print_Molecule( reax_system *system, molecule *m, int mode, char *s ) for ( j = 0; j < m->atom_count; ++j ) { atom = m->atom_list[j]; - sprintf( s, "%s%s(%d)", + snprintf( s, size, "%s%s(%d)", s, system->reaxprm.sbp[ system->atoms[atom].type ].name, atom ); } } @@ -288,7 +289,8 @@ void Analyze_Molecules( reax_system *system, control_params *control, fprintf( fout, "old molecules: " ); for ( i = 0; i < num_old; ++i ) { - Print_Molecule( system, &old_molecules[i], 1, &s[0] ); + Print_Molecule( system, &old_molecules[i], 1, &s[0], + MAX_MOLECULE_SIZE * 10 ); fprintf( fout, "%s\t", s ); } fprintf( fout, "\n" ); @@ -296,7 +298,8 @@ void Analyze_Molecules( reax_system *system, control_params *control, fprintf( fout, "new molecules: " ); for ( i = 0; i < num_new; ++i ) { - Print_Molecule( system, &new_molecules[i], 1, &s[0] ); + Print_Molecule( system, &new_molecules[i], 1, &s[0], + MAX_MOLECULE_SIZE * 10 ); fprintf( fout, "%s\t", s ); } fprintf( fout, "\n" ); @@ -528,15 +531,15 @@ void Visit_Bonds( int atom, int *mark, int *type, reax_system *system, void Analyze_Fragments( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - reax_list **lists, FILE *fout, int ignore ) + simulation_data *data, static_storage *workspace, + reax_list **lists, FILE *fout, int ignore ) { - int atom, i, flag; - int *mark = workspace->mark; - int num_fragments, num_fragment_types; + int atom, i, flag; + int *mark = workspace->mark; + int num_fragments, num_fragment_types; char fragment[MAX_ATOM_TYPES]; char fragments[MAX_FRAGMENT_TYPES][MAX_ATOM_TYPES]; - int fragment_count[MAX_FRAGMENT_TYPES]; + int fragment_count[MAX_FRAGMENT_TYPES]; molecule m; reax_list *new_bonds = (*lists) + BONDS; //reax_list *old_bonds = (*lists) + OLD_BONDS; @@ -554,7 +557,7 @@ void Analyze_Fragments( reax_system *system, control_params *control, memset( m.mtypes, 0, MAX_ATOM_TYPES * sizeof(int) ); Visit_Bonds( atom, mark, m.mtypes, system, control, new_bonds, ignore ); ++num_fragments; - Print_Molecule( system, &m, 1, fragment ); + Print_Molecule( system, &m, 1, fragment, MAX_FRAGMENT_TYPES ); /* check if a similar fragment already exists */ flag = 0; diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c index 5003d28a..0080f73c 100644 --- a/sPuReMD/src/charges.c +++ b/sPuReMD/src/charges.c @@ -32,1388 +32,6 @@ #endif -typedef struct -{ - unsigned int j; - real val; -} sparse_matrix_entry; - - -#if defined(TEST_MAT) -static sparse_matrix * create_test_mat( void ) -{ - unsigned int i, n; - sparse_matrix *H_test; - - if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE ) - { - fprintf( stderr, "not enough memory for test matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - - //3x3, SPD, store lower half - i = 0; - n = 0; - H_test->start[n] = i; - H_test->j[i] = 0; - H_test->val[i] = 4.; - ++i; - ++n; - H_test->start[n] = i; - H_test->j[i] = 0; - H_test->val[i] = 12.; - ++i; - H_test->j[i] = 1; - H_test->val[i] = 37.; - ++i; - ++n; - H_test->start[n] = i; - H_test->j[i] = 0; - H_test->val[i] = -16.; - ++i; - H_test->j[i] = 1; - H_test->val[i] = -43.; - ++i; - H_test->j[i] = 2; - H_test->val[i] = 98.; - ++i; - ++n; - H_test->start[n] = i; - - return H_test; -} -#endif - - -/* Routine used with qsort for sorting nonzeros within a sparse matrix row - * - * v1/v2: pointers to column indices of nonzeros within a row (unsigned int) - */ -static int compare_matrix_entry(const void *v1, const void *v2) -{ - /* larger element has larger column index */ - return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j; -} - - -/* Routine used for sorting nonzeros within a sparse matrix row; - * internally, a combination of qsort and manual sorting is utilized - * (parallel calls to qsort when multithreading, rows mapped to threads) - * - * A: sparse matrix for which to sort nonzeros within a row, stored in CSR format - */ -static void Sort_Matrix_Rows( sparse_matrix * const A ) -{ - unsigned int i, j, si, ei; - sparse_matrix_entry *temp; - -#ifdef _OPENMP -// #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr) -#endif - { - if ( ( temp = (sparse_matrix_entry *) malloc( A->n * sizeof(sparse_matrix_entry)) ) == NULL ) - { - fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" ); - exit( INSUFFICIENT_MEMORY ); - } - - /* sort each row of A using column indices */ -#ifdef _OPENMP -// #pragma omp for schedule(guided) -#endif - for ( i = 0; i < A->n; ++i ) - { - si = A->start[i]; - ei = A->start[i + 1]; - - for ( j = 0; j < (ei - si); ++j ) - { - (temp + j)->j = A->j[si + j]; - (temp + j)->val = A->val[si + j]; - } - - /* polymorphic sort in standard C library using column indices */ - qsort( temp, ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry ); - - for ( j = 0; j < (ei - si); ++j ) - { - A->j[si + j] = (temp + j)->j; - A->val[si + j] = (temp + j)->val; - } - } - - sfree( temp, "Sort_Matrix_Rows::temp" ); - } -} - - -static void Calculate_Droptol( const sparse_matrix * const A, - real * const droptol, const real dtol ) -{ - int i, j, k; - real val; -#ifdef _OPENMP - static real *droptol_local; - unsigned int tid; -#endif - -#ifdef _OPENMP - #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr) -#endif - { -#ifdef _OPENMP - tid = omp_get_thread_num(); - - #pragma omp master - { - /* keep b_local for program duration to avoid allocate/free - * overhead per Sparse_MatVec call*/ - if ( droptol_local == NULL ) - { - if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL ) - { - fprintf( stderr, "Not enough space for droptol. Terminating...\n" ); - exit( INSUFFICIENT_MEMORY ); - } - } - } - - #pragma omp barrier -#endif - - /* init droptol to 0 */ - for ( i = 0; i < A->n; ++i ) - { -#ifdef _OPENMP - droptol_local[tid * A->n + i] = 0.0; -#else - droptol[i] = 0.0; -#endif - } - -#ifdef _OPENMP - #pragma omp barrier -#endif - - /* calculate sqaure of the norm of each row */ -#ifdef _OPENMP - #pragma omp for schedule(static) -#endif - for ( i = 0; i < A->n; ++i ) - { - for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k ) - { - j = A->j[k]; - val = A->val[k]; - -#ifdef _OPENMP - droptol_local[tid * A->n + i] += val * val; - droptol_local[tid * A->n + j] += val * val; -#else - droptol[i] += val * val; - droptol[j] += val * val; -#endif - } - - // diagonal entry - val = A->val[k]; -#ifdef _OPENMP - droptol_local[tid * A->n + i] += val * val; -#else - droptol[i] += val * val; -#endif - } - -#ifdef _OPENMP - #pragma omp barrier - - #pragma omp for schedule(static) - for ( i = 0; i < A->n; ++i ) - { - droptol[i] = 0.0; - for ( k = 0; k < omp_get_num_threads(); ++k ) - { - droptol[i] += droptol_local[k * A->n + i]; - } - } - - #pragma omp barrier -#endif - - /* calculate local droptol for each row */ - //fprintf( stderr, "droptol: " ); -#ifdef _OPENMP - #pragma omp for schedule(static) -#endif - for ( i = 0; i < A->n; ++i ) - { - //fprintf( stderr, "%f-->", droptol[i] ); - droptol[i] = SQRT( droptol[i] ) * dtol; - //fprintf( stderr, "%f ", droptol[i] ); - } - //fprintf( stderr, "\n" ); - } -} - - -static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol ) -{ - int i, pj; - int fillin; - real val; - - fillin = 0; - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) private(i, pj, val) reduction(+: fillin) -#endif - for ( i = 0; i < A->n; ++i ) - { - for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) - { - val = A->val[pj]; - - if ( FABS(val) > droptol[i] ) - { - ++fillin; - } - } - } - - return fillin + A->n; -} - - -#if defined(HAVE_SUPERLU_MT) -static real SuperLU_Factorize( const sparse_matrix * const A, - sparse_matrix * const L, sparse_matrix * const U ) -{ - unsigned int i, pj, count, *Ltop, *Utop, r; - sparse_matrix *A_t; - SuperMatrix A_S, AC_S, L_S, U_S; - NCformat *A_S_store; - SCPformat *L_S_store; - NCPformat *U_S_store; - superlumt_options_t superlumt_options; - pxgstrf_shared_t pxgstrf_shared; - pdgstrf_threadarg_t *pdgstrf_threadarg; - int_t nprocs; - fact_t fact; - trans_t trans; - yes_no_t refact, usepr; - real u, drop_tol; - real *a, *at; - int_t *asub, *atsub, *xa, *xat; - int_t *perm_c; /* column permutation vector */ - int_t *perm_r; /* row permutations from partial pivoting */ - void *work; - int_t info, lwork; - int_t permc_spec, panel_size, relax; - Gstat_t Gstat; - flops_t flopcnt; - - /* Default parameters to control factorization. */ -#ifdef _OPENMP - //TODO: set as global parameter and use - #pragma omp parallel \ - default(none) shared(nprocs) - { - #pragma omp master - { - /* SuperLU_MT spawns threads internally, so set and pass parameter */ - nprocs = omp_get_num_threads(); - } - } -#else - nprocs = 1; -#endif - -// fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */ - fact = DOFACT; /* factor from scratch */ - trans = NOTRANS; - refact = NO; /* first time factorization */ - //TODO: add to control file and use the value there to set these - panel_size = sp_ienv(1); /* # consec. cols treated as unit task */ - relax = sp_ienv(2); /* # cols grouped as relaxed supernode */ - u = 1.0; /* diagonal pivoting threshold */ - usepr = NO; - drop_tol = 0.0; - work = NULL; - lwork = 0; - -//#if defined(DEBUG) - fprintf( stderr, "nprocs = %d\n", nprocs ); - fprintf( stderr, "Panel size = %d\n", panel_size ); - fprintf( stderr, "Relax = %d\n", relax ); -//#endif - - if ( !(perm_r = intMalloc(A->n)) ) - { - SUPERLU_ABORT("Malloc fails for perm_r[]."); - } - if ( !(perm_c = intMalloc(A->n)) ) - { - SUPERLU_ABORT("Malloc fails for perm_c[]."); - } - if ( !(superlumt_options.etree = intMalloc(A->n)) ) - { - SUPERLU_ABORT("Malloc fails for etree[]."); - } - if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) ) - { - SUPERLU_ABORT("Malloc fails for colcnt_h[]."); - } - if ( !(superlumt_options.part_super_h = intMalloc(A->n)) ) - { - SUPERLU_ABORT("Malloc fails for part_super__h[]."); - } - if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL ) - || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL ) - || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL ) - || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) - || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) ) - { - fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" ); - exit( INSUFFICIENT_MEMORY ); - } - if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE ) - { - fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - - /* Set up the sparse matrix data structure for A. */ - Transpose( A, A_t ); - - count = 0; - for ( i = 0; i < A->n; ++i ) - { - xa[i] = count; - for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) - { - a[count] = A->entries[pj].val; - asub[count] = A->entries[pj].j; - ++count; - } - for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj ) - { - a[count] = A_t->entries[pj].val; - asub[count] = A_t->entries[pj].j; - ++count; - } - } - xa[i] = count; - - dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa, - &at, &atsub, &xat ); - - for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) - fprintf( stderr, "%6d", asub[i] ); - fprintf( stderr, "\n" ); - for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) - fprintf( stderr, "%6.1f", a[i] ); - fprintf( stderr, "\n" ); - for ( i = 0; i <= A->n; ++i ) - fprintf( stderr, "%6d", xa[i] ); - fprintf( stderr, "\n" ); - for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) - fprintf( stderr, "%6d", atsub[i] ); - fprintf( stderr, "\n" ); - for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) - fprintf( stderr, "%6.1f", at[i] ); - fprintf( stderr, "\n" ); - for ( i = 0; i <= A->n; ++i ) - fprintf( stderr, "%6d", xat[i] ); - fprintf( stderr, "\n" ); - - A_S.Stype = SLU_NC; /* column-wise, no supernode */ - A_S.Dtype = SLU_D; /* double-precision */ - A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */ - A_S.nrow = A->n; - A_S.ncol = A->n; - A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); - A_S_store = (NCformat *) A_S.Store; - A_S_store->nnz = 2 * A->start[A->n] - A->n; - A_S_store->nzval = at; - A_S_store->rowind = atsub; - A_S_store->colptr = xat; - - /* ------------------------------------------------------------ - Allocate storage and initialize statistics variables. - ------------------------------------------------------------*/ - StatAlloc( A->n, nprocs, panel_size, relax, &Gstat ); - StatInit( A->n, nprocs, &Gstat ); - - /* ------------------------------------------------------------ - Get column permutation vector perm_c[], according to permc_spec: - permc_spec = 0: natural ordering - permc_spec = 1: minimum degree ordering on structure of A'*A - permc_spec = 2: minimum degree ordering on structure of A'+A - permc_spec = 3: approximate minimum degree for unsymmetric matrices - ------------------------------------------------------------*/ - permc_spec = 0; - get_perm_c( permc_spec, &A_S, perm_c ); - - /* ------------------------------------------------------------ - Initialize the option structure superlumt_options using the - user-input parameters; - Apply perm_c to the columns of original A to form AC. - ------------------------------------------------------------*/ - pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax, - u, usepr, drop_tol, perm_c, perm_r, - work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat ); - - for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i ) - fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] ); - fprintf( stderr, "\n" ); - - /* ------------------------------------------------------------ - Compute the LU factorization of A. - The following routine will create nprocs threads. - ------------------------------------------------------------*/ - pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info ); - - fprintf( stderr, "INFO: %d\n", info ); - - flopcnt = 0; - for (i = 0; i < nprocs; ++i) - { - flopcnt += Gstat.procstat[i].fcops; - } - Gstat.ops[FACT] = flopcnt; - -//#if defined(DEBUG) - printf("\n** Result of sparse LU **\n"); - L_S_store = (SCPformat *) L_S.Store; - U_S_store = (NCPformat *) U_S.Store; - printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz ); - printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz ); - fflush( stdout ); -//#endif - - /* convert L and R from SuperLU formats to CSR */ - memset( Ltop, 0, (A->n + 1) * sizeof(int) ); - memset( Utop, 0, (A->n + 1) * sizeof(int) ); - memset( L->start, 0, (A->n + 1) * sizeof(int) ); - memset( U->start, 0, (A->n + 1) * sizeof(int) ); - - for ( i = 0; i < 2 * L_S_store->nnz; ++i ) - fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] ); - fprintf( stderr, "\n" ); - for ( i = 0; i < 2 * U_S_store->nnz; ++i ) - fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] ); - fprintf( stderr, "\n" ); - - printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper ); - for ( i = 0; i < A->n; ++i ) - { - fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] ); - fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] ); - //TODO: correct for SCPformat for L? - //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj ) -// for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj ) -// { -// ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1]; -// } - fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] ); - fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] ); - for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) - { - ++Utop[U_S_store->rowind[pj] + 1]; - fprintf( stderr, "Utop[%5d] = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] ); - } - } - for ( i = 1; i <= A->n; ++i ) - { -// Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1]; - Utop[i] = U->start[i] = Utop[i] + Utop[i - 1]; -// fprintf( stderr, "Utop[%5d] = %d\n", i, Utop[i] ); -// fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] ); - } - for ( i = 0; i < A->n; ++i ) - { -// for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj ) -// { -// r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj]; -// L->entries[Ltop[r]].j = r; -// L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj]; -// ++Ltop[r]; -// } - for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) - { - r = U_S_store->rowind[pj]; - U->entries[Utop[r]].j = i; - U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj]; - ++Utop[r]; - } - } - - /* ------------------------------------------------------------ - Deallocate storage after factorization. - ------------------------------------------------------------*/ - pxgstrf_finalize( &superlumt_options, &AC_S ); - Deallocate_Matrix( A_t ); - sfree( xa, "SuperLU_Factorize::xa" ); - sfree( asub, "SuperLU_Factorize::asub" ); - sfree( a, "SuperLU_Factorize::a" ); - SUPERLU_FREE( perm_r ); - SUPERLU_FREE( perm_c ); - SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind ); - SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr ); - SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval ); - SUPERLU_FREE( A_S.Store ); - if ( lwork == 0 ) - { - Destroy_SuperNode_SCP(&L_S); - Destroy_CompCol_NCP(&U_S); - } - else if ( lwork > 0 ) - { - SUPERLU_FREE(work); - } - StatFree(&Gstat); - - sfree( Utop, "SuperLU_Factorize::Utop" ); - sfree( Ltop, "SuperLU_Factorize::Ltop" ); - - //TODO: return iters - return 0.; -} -#endif - - -/* Diagonal (Jacobi) preconditioner computation */ -static real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv ) -{ - unsigned int i; - real start; - - start = Get_Time( ); - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) private(i) -#endif - for ( i = 0; i < H->n; ++i ) - { - if ( H->val[H->start[i + 1] - 1] != 0.0 ) - { - Hdia_inv[i] = 1.0 / H->val[H->start[i + 1] - 1]; - } - else - { - Hdia_inv[i] = 1.0; - } - } - - return Get_Timing_Info( start ); -} - - -/* Incomplete Cholesky factorization with dual thresholding */ -static real ICHOLT( const sparse_matrix * const A, const real * const droptol, - sparse_matrix * const L, sparse_matrix * const U ) -{ - int *tmp_j; - real *tmp_val; - int i, j, pj, k1, k2, tmptop, Ltop; - real val, start; - unsigned int *Utop; - - start = Get_Time( ); - - if ( ( Utop = (unsigned int*) malloc((A->n + 1) * sizeof(unsigned int)) ) == NULL || - ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL || - ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL ) - { - fprintf( stderr, "[ICHOLT] Not enough memory for preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - - // clear variables - Ltop = 0; - tmptop = 0; - memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) ); - memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) ); - memset( Utop, 0, A->n * sizeof(unsigned int) ); - - for ( i = 0; i < A->n; ++i ) - { - L->start[i] = Ltop; - tmptop = 0; - - for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) - { - j = A->j[pj]; - val = A->val[pj]; - - if ( FABS(val) > droptol[i] ) - { - k1 = 0; - k2 = L->start[j]; - while ( k1 < tmptop && k2 < L->start[j + 1] ) - { - if ( tmp_j[k1] < L->j[k2] ) - { - ++k1; - } - else if ( tmp_j[k1] > L->j[k2] ) - { - ++k2; - } - else - { - val -= (tmp_val[k1++] * L->val[k2++]); - } - } - - // L matrix is lower triangular, - // so right before the start of next row comes jth diagonal - val /= L->val[L->start[j + 1] - 1]; - - tmp_j[tmptop] = j; - tmp_val[tmptop] = val; - ++tmptop; - } - } - - // sanity check - if ( A->j[pj] != i ) - { - fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i ); - exit( NUMERIC_BREAKDOWN ); - } - - // compute the ith diagonal in L - val = A->val[pj]; - for ( k1 = 0; k1 < tmptop; ++k1 ) - { - val -= (tmp_val[k1] * tmp_val[k1]); - } - -#if defined(DEBUG) - if ( val < 0.0 ) - { - fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i ); - exit( NUMERIC_BREAKDOWN ); - - } -#endif - - tmp_j[tmptop] = i; - tmp_val[tmptop] = SQRT( val ); - - // apply the dropping rule once again - //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop ); - //for( k1 = 0; k1<= tmptop; ++k1 ) - // fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val ); - //fprintf( stderr, "\n" ); - //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); - for ( k1 = 0; k1 < tmptop; ++k1 ) - { - if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] ) - { - L->j[Ltop] = tmp_j[k1]; - L->val[Ltop] = tmp_val[k1]; - U->start[tmp_j[k1] + 1]++; - ++Ltop; - //fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val ); - } - } - // keep the diagonal in any case - L->j[Ltop] = tmp_j[k1]; - L->val[Ltop] = tmp_val[k1]; - ++Ltop; - //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val ); - } - - L->start[i] = Ltop; -// fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); - - /* U = L^T (Cholesky factorization) */ - Transpose( L, U ); -// for ( i = 1; i <= U->n; ++i ) -// { -// Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1; -// } -// for ( i = 0; i < L->n; ++i ) -// { -// for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj ) -// { -// j = L->j[pj]; -// U->j[Utop[j]] = i; -// U->val[Utop[j]] = L->val[pj]; -// Utop[j]++; -// } -// } - -// fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); - - sfree( tmp_val, "ICHOLT::tmp_val" ); - sfree( tmp_j, "ICHOLT::tmp_j" ); - sfree( Utop, "ICHOLT::Utop" ); - - return Get_Timing_Info( start ); -} - - -/* Fine-grained (parallel) incomplete Cholesky factorization - * - * Reference: - * Edmond Chow and Aftab Patel - * Fine-Grained Parallel Incomplete LU Factorization - * SIAM J. Sci. Comp. */ -#if defined(TESTING) -static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, - sparse_matrix * const U_t, sparse_matrix * const U ) -{ - unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y; - real *D, *D_inv, sum, start; - sparse_matrix *DAD; - int *Utop; - - start = Get_Time( ); - - if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE || - ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || - ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL || - ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL ) - { - fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(D_inv, D) private(i) -#endif - for ( i = 0; i < A->n; ++i ) - { - D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] ); - D[i] = 1. / D_inv[i]; - } - - memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) ); - memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) ); - - /* to get convergence, A must have unit diagonal, so apply - * transformation DAD, where D = D(1./SQRT(D(A))) */ - memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); -#ifdef _OPENMP - #pragma omp parallel for schedule(guided) \ - default(none) shared(DAD, D_inv, D) private(i, pj) -#endif - for ( i = 0; i < A->n; ++i ) - { - /* non-diagonals */ - for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) - { - DAD->j[pj] = A->j[pj]; - DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]]; - } - /* diagonal */ - DAD->j[pj] = A->j[pj]; - DAD->val[pj] = 1.; - } - - /* initial guesses for U^T, - * assume: A and DAD symmetric and stored lower triangular */ - memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) ); - memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) ); - memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) ); - - for ( i = 0; i < sweeps; ++i ) - { - /* for each nonzero */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y) -#endif - for ( j = 0; j < A->start[A->n]; ++j ) - { - sum = ZERO; - - /* determine row bounds of current nonzero */ - x = 0; - ei_x = 0; - for ( k = 0; k <= A->n; ++k ) - { - if ( U_t->start[k] > j ) - { - x = U_t->start[k - 1]; - ei_x = U_t->start[k]; - break; - } - } - /* column bounds of current nonzero */ - y = U_t->start[U_t->j[j]]; - ei_y = U_t->start[U_t->j[j] + 1]; - - /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */ - while ( U_t->j[x] < U_t->j[j] && - U_t->j[y] < U_t->j[j] && - x < ei_x && y < ei_y ) - { - if ( U_t->j[x] == U_t->j[y] ) - { - sum += (U_t->val[x] * U_t->val[y]); - ++x; - ++y; - } - else if ( U_t->j[x] < U_t->j[y] ) - { - ++x; - } - else - { - ++y; - } - } - - sum = DAD->val[j] - sum; - - /* diagonal entries */ - if ( (k - 1) == U_t->j[j] ) - { - /* sanity check */ - if ( sum < ZERO ) - { - fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n"); -#if defined(DEBUG_FOCUS) - fprintf( stderr, "A(%5d,%5d) = %10.3f\n", - k - 1, A->entries[j].j, A->entries[j].val ); - fprintf( stderr, "sum = %10.3f\n", sum); -#endif - exit(NUMERIC_BREAKDOWN); - } - - U_t->val[j] = SQRT( sum ); - } - /* non-diagonal entries */ - else - { - U_t->val[j] = sum / U_t->val[ei_y - 1]; - } - } - } - - /* apply inverse transformation D^{-1}U^{T}, - * since DAD \approx U^{T}U, so - * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */ -#ifdef _OPENMP - #pragma omp parallel for schedule(guided) \ - default(none) shared(D_inv) private(i, pj) -#endif - for ( i = 0; i < A->n; ++i ) - { - for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) - { - U_t->val[pj] *= D_inv[i]; - } - } - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 ); -#endif - - /* transpose U^{T} and copy into U */ - Transpose( U_t, U ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); -#endif - - Deallocate_Matrix( DAD ); - sfree( D_inv, "ICHOL_PAR::D_inv" ); - sfree( D, "ICHOL_PAR::D" ); - sfree( Utop, "ICHOL_PAR::Utop" ); - - return Get_Timing_Info( start ); -} -#endif - - -/* Fine-grained (parallel) incomplete LU factorization - * - * Reference: - * Edmond Chow and Aftab Patel - * Fine-Grained Parallel Incomplete LU Factorization - * SIAM J. Sci. Comp. - * - * A: symmetric, half-stored (lower triangular), CSR format - * sweeps: number of loops over non-zeros for computation - * L / U: factorized triangular matrices (A \approx LU), CSR format */ -static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, - sparse_matrix * const L, sparse_matrix * const U ) -{ - unsigned int i, j, k, pj, x, y, ei_x, ei_y; - real *D, *D_inv, sum, start; - sparse_matrix *DAD; - - start = Get_Time( ); - - if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE || - ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || - ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ) - { - fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(D, D_inv) private(i) -#endif - for ( i = 0; i < A->n; ++i ) - { - D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) ); - D[i] = 1.0 / D_inv[i]; -// printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] ); - } - - /* to get convergence, A must have unit diagonal, so apply - * transformation DAD, where D = D(1./SQRT(abs(D(A)))) */ - memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, D) private(i, pj) -#endif - for ( i = 0; i < A->n; ++i ) - { - /* non-diagonals */ - for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) - { - DAD->j[pj] = A->j[pj]; - DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]]; - } - /* diagonal */ - DAD->j[pj] = A->j[pj]; - DAD->val[pj] = 1.0; - } - - /* initial guesses for L and U, - * assume: A and DAD symmetric and stored lower triangular */ - memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) ); - memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); - memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); - /* store U^T in CSR for row-wise access and tranpose later */ - memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) ); - memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); - memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); - - /* L has unit diagonal, by convention */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) default(none) private(i) -#endif - for ( i = 0; i < A->n; ++i ) - { - L->val[L->start[i + 1] - 1] = 1.0; - } - - for ( i = 0; i < sweeps; ++i ) - { - /* for each nonzero in L */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) -#endif - for ( j = 0; j < DAD->start[DAD->n]; ++j ) - { - sum = ZERO; - - /* determine row bounds of current nonzero */ - x = 0; - ei_x = 0; - for ( k = 1; k <= DAD->n; ++k ) - { - if ( DAD->start[k] > j ) - { - x = DAD->start[k - 1]; - ei_x = DAD->start[k]; - break; - } - } - /* determine column bounds of current nonzero */ - y = DAD->start[DAD->j[j]]; - ei_y = DAD->start[DAD->j[j] + 1]; - - /* sparse dot product: - * dot( L(i,1:j-1), U(1:j-1,j) ) */ - while ( L->j[x] < L->j[j] && - L->j[y] < L->j[j] && - x < ei_x && y < ei_y ) - { - if ( L->j[x] == L->j[y] ) - { - sum += (L->val[x] * U->val[y]); - ++x; - ++y; - } - else if ( L->j[x] < L->j[y] ) - { - ++x; - } - else - { - ++y; - } - } - - if ( j != ei_x - 1 ) - { - L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1]; - } - } - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) -#endif - for ( j = 0; j < DAD->start[DAD->n]; ++j ) - { - sum = ZERO; - - /* determine row bounds of current nonzero */ - x = 0; - ei_x = 0; - for ( k = 1; k <= DAD->n; ++k ) - { - if ( DAD->start[k] > j ) - { - x = DAD->start[k - 1]; - ei_x = DAD->start[k]; - break; - } - } - /* determine column bounds of current nonzero */ - y = DAD->start[DAD->j[j]]; - ei_y = DAD->start[DAD->j[j] + 1]; - - /* sparse dot product: - * dot( L(i,1:i-1), U(1:i-1,j) ) */ - while ( U->j[x] < U->j[j] && - U->j[y] < U->j[j] && - x < ei_x && y < ei_y ) - { - if ( U->j[x] == U->j[y] ) - { - sum += (L->val[y] * U->val[x]); - ++x; - ++y; - } - else if ( U->j[x] < U->j[y] ) - { - ++x; - } - else - { - ++y; - } - } - - U->val[j] = DAD->val[j] - sum; - } - } - - /* apply inverse transformation: - * since DAD \approx LU, then - * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, D_inv) private(i, pj) -#endif - for ( i = 0; i < DAD->n; ++i ) - { - for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj ) - { - L->val[pj] = D_inv[i] * L->val[pj]; - /* currently storing U^T, so use row index instead of column index */ - U->val[pj] = U->val[pj] * D_inv[i]; - } - } - - Transpose_I( U ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 ); - fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); -#endif - - Deallocate_Matrix( DAD ); - sfree( D_inv, "ILU_PAR::D_inv" ); - sfree( D, "ILU_PAR::D_inv" ); - - return Get_Timing_Info( start ); -} - - -/* Fine-grained (parallel) incomplete LU factorization with thresholding - * - * Reference: - * Edmond Chow and Aftab Patel - * Fine-Grained Parallel Incomplete LU Factorization - * SIAM J. Sci. Comp. - * - * A: symmetric, half-stored (lower triangular), CSR format - * droptol: row-wise tolerances used for dropping - * sweeps: number of loops over non-zeros for computation - * L / U: factorized triangular matrices (A \approx LU), CSR format */ -static real ILUT_PAR( const sparse_matrix * const A, const real * droptol, - const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U ) -{ - unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop; - real *D, *D_inv, sum, start; - sparse_matrix *DAD, *L_temp, *U_temp; - - start = Get_Time( ); - - if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE || - Allocate_Matrix( &L_temp, A->n, A->m ) == FAILURE || - Allocate_Matrix( &U_temp, A->n, A->m ) == FAILURE ) - { - fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - - if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || - ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ) - { - fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(D, D_inv) private(i) -#endif - for ( i = 0; i < A->n; ++i ) - { - D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) ); - D[i] = 1.0 / D_inv[i]; - } - - /* to get convergence, A must have unit diagonal, so apply - * transformation DAD, where D = D(1./SQRT(D(A))) */ - memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, D) private(i, pj) -#endif - for ( i = 0; i < A->n; ++i ) - { - /* non-diagonals */ - for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) - { - DAD->j[pj] = A->j[pj]; - DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]]; - } - /* diagonal */ - DAD->j[pj] = A->j[pj]; - DAD->val[pj] = 1.0; - } - - /* initial guesses for L and U, - * assume: A and DAD symmetric and stored lower triangular */ - memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) ); - memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); - memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); - /* store U^T in CSR for row-wise access and tranpose later */ - memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) ); - memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); - memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); - - /* L has unit diagonal, by convention */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) private(i) shared(L_temp) -#endif - for ( i = 0; i < A->n; ++i ) - { - L_temp->val[L_temp->start[i + 1] - 1] = 1.0; - } - - for ( i = 0; i < sweeps; ++i ) - { - /* for each nonzero in L */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) -#endif - for ( j = 0; j < DAD->start[DAD->n]; ++j ) - { - sum = ZERO; - - /* determine row bounds of current nonzero */ - x = 0; - ei_x = 0; - for ( k = 1; k <= DAD->n; ++k ) - { - if ( DAD->start[k] > j ) - { - x = DAD->start[k - 1]; - ei_x = DAD->start[k]; - break; - } - } - /* determine column bounds of current nonzero */ - y = DAD->start[DAD->j[j]]; - ei_y = DAD->start[DAD->j[j] + 1]; - - /* sparse dot product: - * dot( L(i,1:j-1), U(1:j-1,j) ) */ - while ( L_temp->j[x] < L_temp->j[j] && - L_temp->j[y] < L_temp->j[j] && - x < ei_x && y < ei_y ) - { - if ( L_temp->j[x] == L_temp->j[y] ) - { - sum += (L_temp->val[x] * U_temp->val[y]); - ++x; - ++y; - } - else if ( L_temp->j[x] < L_temp->j[y] ) - { - ++x; - } - else - { - ++y; - } - } - - if ( j != ei_x - 1 ) - { - L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1]; - } - } - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) -#endif - for ( j = 0; j < DAD->start[DAD->n]; ++j ) - { - sum = ZERO; - - /* determine row bounds of current nonzero */ - x = 0; - ei_x = 0; - for ( k = 1; k <= DAD->n; ++k ) - { - if ( DAD->start[k] > j ) - { - x = DAD->start[k - 1]; - ei_x = DAD->start[k]; - break; - } - } - /* determine column bounds of current nonzero */ - y = DAD->start[DAD->j[j]]; - ei_y = DAD->start[DAD->j[j] + 1]; - - /* sparse dot product: - * dot( L(i,1:i-1), U(1:i-1,j) ) */ - while ( U_temp->j[x] < U_temp->j[j] && - U_temp->j[y] < U_temp->j[j] && - x < ei_x && y < ei_y ) - { - if ( U_temp->j[x] == U_temp->j[y] ) - { - sum += (L_temp->val[y] * U_temp->val[x]); - ++x; - ++y; - } - else if ( U_temp->j[x] < U_temp->j[y] ) - { - ++x; - } - else - { - ++y; - } - } - - U_temp->val[j] = DAD->val[j] - sum; - } - } - - /* apply inverse transformation: - * since DAD \approx LU, then - * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj) -#endif - for ( i = 0; i < DAD->n; ++i ) - { - for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj ) - { - L_temp->val[pj] = D_inv[i] * L_temp->val[pj]; - /* currently storing U^T, so use row index instead of column index */ - U_temp->val[pj] = U_temp->val[pj] * D_inv[i]; - } - } - - /* apply the dropping rule */ - Ltop = 0; - Utop = 0; - for ( i = 0; i < DAD->n; ++i ) - { - L->start[i] = Ltop; - U->start[i] = Utop; - - for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj ) - { - if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) ) - { - L->j[Ltop] = L_temp->j[pj]; - L->val[Ltop] = L_temp->val[pj]; - ++Ltop; - } - } - - /* diagonal */ - L->j[Ltop] = L_temp->j[pj]; - L->val[Ltop] = L_temp->val[pj]; - ++Ltop; - - for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj ) - { - if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) ) - { - U->j[Utop] = U_temp->j[pj]; - U->val[Utop] = U_temp->val[pj]; - ++Utop; - } - } - - /* diagonal */ - U->j[Utop] = U_temp->j[pj]; - U->val[Utop] = U_temp->val[pj]; - ++Utop; - } - - L->start[i] = Ltop; - U->start[i] = Utop; - - Transpose_I( U ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "nnz(L): %d\n", L->start[L->n] ); - fprintf( stderr, "nnz(U): %d\n", U->start[U->n] ); -#endif - - Deallocate_Matrix( U_temp ); - Deallocate_Matrix( L_temp ); - Deallocate_Matrix( DAD ); - sfree( D_inv, "ILUT_PAR::D_inv" ); - sfree( D, "ILUT_PAR::D_inv" ); - - return Get_Timing_Info( start ); -} - - static void Extrapolate_Charges_QEq( const reax_system * const system, const control_params * const control, simulation_data * const data, static_storage * const workspace ) @@ -1584,6 +202,17 @@ static void Compute_Preconditioner_QEq( const reax_system * const system, #endif break; + case SAI_PC: +#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL) + data->timing.cm_solver_pre_comp += + Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt, + &workspace->H_app_inv ); +#else + fprintf( stderr, "LAPACKE support disabled. Re-compile before enabling. Terminating...\n" ); + exit( INVALID_INPUT ); +#endif + break; + default: fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -1591,18 +220,22 @@ static void Compute_Preconditioner_QEq( const reax_system * const system, } #if defined(DEBUG) +#define SIZE (1000) + char fname[SIZE]; + if ( control->cm_solver_pre_comp_type != NONE_PC && control->cm_solver_pre_comp_type != DIAG_PC ) { fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) ); #if defined(DEBUG_FOCUS) - sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); + snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->L, fname, NULL ); - sprintf( fname, "%s.U%d.out", control->sim_name, data->step ); + snprintf( fname, SIZE + 10, "%s.U%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->U, fname, NULL ); #endif } +#undef SIZE #endif } @@ -1831,22 +464,26 @@ static void Compute_Preconditioner_QEq( const reax_system * const system, // } // //#if defined(DEBUG) +//#define SIZE (1000) +// char fname[SIZE]; +// // if ( control->cm_solver_pre_comp_type != DIAG_PC ) // { // fprintf( stderr, "condest = %f\n", condest(workspace->L) ); // //#if defined(DEBUG_FOCUS) -// sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); +// snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step ); // Print_Sparse_Matrix2( workspace->L, fname, NULL ); -// sprintf( fname, "%s.U%d.out", control->sim_name, data->step ); +// snprintf( fname, SIZE + 10, "%s.U%d.out", control->sim_name, data->step ); // Print_Sparse_Matrix2( workspace->U, fname, NULL ); // // fprintf( stderr, "icholt-" ); -// sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); +// snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step ); // Print_Sparse_Matrix2( workspace->L, fname, NULL ); // Print_Sparse_Matrix( U ); //#endif // } +//#undef SIZE //#endif //} @@ -1928,6 +565,17 @@ static void Compute_Preconditioner_EE( const reax_system * const system, #endif break; + case SAI_PC: +#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL) + data->timing.cm_solver_pre_comp += + Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt, + &workspace->H_app_inv ); +#else + fprintf( stderr, "LAPACKE support disabled. Re-compile before enabling. Terminating...\n" ); + exit( INVALID_INPUT ); +#endif + break; + default: fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -1937,18 +585,22 @@ static void Compute_Preconditioner_EE( const reax_system * const system, Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0; #if defined(DEBUG) +#define SIZE (1000) + char fname[SIZE]; + if ( control->cm_solver_pre_comp_type != NONE_PC && control->cm_solver_pre_comp_type != DIAG_PC ) { fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) ); #if defined(DEBUG_FOCUS) - sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); + snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->L, fname, NULL ); - sprintf( fname, "%s.U%d.out", control->sim_name, data->step ); + snprintf( fname, SIZE + 10, "%s.U%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->U, fname, NULL ); #endif } +#undef SIZE #endif } @@ -2031,6 +683,17 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system, #endif break; + case SAI_PC: +#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL) + data->timing.cm_solver_pre_comp += + Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt, + &workspace->H_app_inv ); +#else + fprintf( stderr, "LAPACKE support disabled. Re-compile before enabling. Terminating...\n" ); + exit( INVALID_INPUT ); +#endif + break; + default: fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -2041,24 +704,27 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system, Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0; #if defined(DEBUG) +#define SIZE (1000) + char fname[SIZE]; + if ( control->cm_solver_pre_comp_type != NONE_PC || control->cm_solver_pre_comp_type != DIAG_PC ) { fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) ); #if defined(DEBUG_FOCUS) - sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); + snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->L, fname, NULL ); - sprintf( fname, "%s.U%d.out", control->sim_name, data->step ); + snprintf( fname, SIZE + 10, "%s.U%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->U, fname, NULL ); #endif } +#undef SIZE #endif } -/* Setup routines before computing the preconditioner for QEq - */ + static void Setup_Preconditioner_QEq( const reax_system * const system, const control_params * const control, simulation_data * const data, static_storage * const workspace, @@ -2194,6 +860,11 @@ static void Setup_Preconditioner_QEq( const reax_system * const system, } break; + case SAI_PC: + Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres, + &workspace->H_spar_patt ); + break; + default: fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -2341,6 +1012,11 @@ static void Setup_Preconditioner_EE( const reax_system * const system, } break; + case SAI_PC: + Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres, + &workspace->H_spar_patt ); + break; + default: fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -2490,6 +1166,11 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system, } break; + case SAI_PC: + Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres, + &workspace->H_spar_patt ); + break; + default: fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -2509,7 +1190,8 @@ static void Calculate_Charges_QEq( const reax_system * const system, int i; real u, s_sum, t_sum; - s_sum = t_sum = 0.; + s_sum = 0.0; + t_sum = 0.0; for ( i = 0; i < system->N_cm; ++i ) { s_sum += workspace->s[0][i]; @@ -2787,17 +1469,18 @@ void Compute_Charges( reax_system * const system, control_params * const control const reax_list * const far_nbrs, const output_controls * const out_control ) { #if defined(DEBUG_FOCUS) - char fname[200]; +#define SIZE (200) + char fname[SIZE]; FILE * fp; if ( data->step >= 100 ) { - sprintf( fname, "s_%d_%s.out", data->step, control->sim_name ); + snprintf( fname, SIZE + 11, "s_%d_%s.out", data->step, control->sim_name ); fp = fopen( fname, "w" ); Vector_Print( fp, NULL, workspace->s[0], system->N_cm ); fclose( fp ); - sprintf( fname, "t_%d_%s.out", data->step, control->sim_name ); + snprintf( fname, SIZE + 11, "t_%d_%s.out", data->step, control->sim_name ); fp = fopen( fname, "w" ); Vector_Print( fp, NULL, workspace->t[0], system->N_cm ); fclose( fp ); @@ -2827,19 +1510,20 @@ void Compute_Charges( reax_system * const system, control_params * const control #if defined(DEBUG_FOCUS) if ( data->step >= 100 ) { - sprintf( fname, "H_%d_%s.out", data->step, control->sim_name ); + snprintf( fname, SIZE + 11, "H_%d_%s.out", data->step, control->sim_name ); Print_Sparse_Matrix2( workspace->H, fname, NULL ); // Print_Sparse_Matrix_Binary( workspace->H, fname ); - sprintf( fname, "b_s_%d_%s.out", data->step, control->sim_name ); + snprintf( fname, SIZE + 11, "b_s_%d_%s.out", data->step, control->sim_name ); fp = fopen( fname, "w" ); Vector_Print( fp, NULL, workspace->b_s, system->N_cm ); fclose( fp ); - sprintf( fname, "b_t_%d_%s.out", data->step, control->sim_name ); + snprintf( fname, SIZE + 11, "b_t_%d_%s.out", data->step, control->sim_name ); fp = fopen( fname, "w" ); Vector_Print( fp, NULL, workspace->b_t, system->N_cm ); fclose( fp ); } +#undef SIZE #endif } diff --git a/sPuReMD/src/control.c b/sPuReMD/src/control.c index ead90358..4f5616ab 100644 --- a/sPuReMD/src/control.c +++ b/sPuReMD/src/control.c @@ -79,6 +79,7 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control, control->cm_domain_sparsity = 1.0; control->cm_solver_pre_comp_type = ICHOLT_PC; control->cm_solver_pre_comp_sweeps = 3; + control->cm_solver_pre_comp_sai_thres = 0.1; control->cm_solver_pre_comp_refactor = 100; control->cm_solver_pre_comp_droptol = 0.01; control->cm_solver_pre_app_type = TRI_SOLVE_PA; @@ -305,6 +306,11 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control, ival = atoi( tmp[1] ); control->cm_solver_pre_comp_sweeps = ival; } + else if ( strcmp(tmp[0], "cm_solver_pre_comp_sai_thres") == 0 ) + { + val = atof( tmp[1] ); + control->cm_solver_pre_comp_sai_thres = val; + } else if ( strcmp(tmp[0], "cm_solver_pre_app_type") == 0 ) { ival = atoi( tmp[1] ); diff --git a/sPuReMD/src/geo_tools.c b/sPuReMD/src/geo_tools.c index 48eb0dbb..fcde21be 100644 --- a/sPuReMD/src/geo_tools.c +++ b/sPuReMD/src/geo_tools.c @@ -482,7 +482,7 @@ char Write_PDB( reax_system* system, reax_list* bonds, simulation_data *data, (system->box.box_norms[2] * system->box.box_norms[1]) ); /*open pdb and write header*/ - sprintf( fname, "%s-%d.pdb", control->sim_name, data->step ); + snprintf( fname, MAX_STR + 9, "%s-%d.pdb", control->sim_name, data->step ); pdb = fopen(fname, "w"); fprintf( pdb, PDB_CRYST1_FORMAT_O, "CRYST1", @@ -498,7 +498,7 @@ char Write_PDB( reax_system* system, reax_list* bonds, simulation_data *data, p_atom = &(system->atoms[i]); strncpy(name, p_atom->name, 8); Trim_Spaces(name); - sprintf( line, PDB_ATOM_FORMAT_O, + snprintf( line, MAX_STR, PDB_ATOM_FORMAT_O, "ATOM ", workspace->orig_id[i], p_atom->name, ' ', "REX", ' ', 1, ' ', p_atom->x[0], p_atom->x[1], p_atom->x[2], 1.0, 0.0, "0", name, " " ); diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c index d4175d49..c5728825 100644 --- a/sPuReMD/src/init_md.c +++ b/sPuReMD/src/init_md.c @@ -321,6 +321,8 @@ void Init_Workspace( reax_system *system, control_params *control, workspace->H = NULL; workspace->H_sp = NULL; workspace->L = NULL; + workspace->H_spar_patt = NULL; + workspace->H_app_inv = NULL; workspace->U = NULL; workspace->Hdia_inv = NULL; if ( control->cm_solver_pre_comp_type == ICHOLT_PC || @@ -638,7 +640,8 @@ void Init_Lists( reax_system *system, control_params *control, void Init_Out_Controls( reax_system *system, control_params *control, static_storage *workspace, output_controls *out_control ) { - char temp[1000]; +#define TEMP_SIZE (1000) + char temp[TEMP_SIZE]; /* Init trajectory file */ if ( out_control->write_steps > 0 ) @@ -697,11 +700,11 @@ void Init_Out_Controls( reax_system *system, control_params *control, /* Init molecular analysis file */ if ( control->molec_anal ) { - sprintf( temp, "%s.mol", control->sim_name ); + snprintf( temp, TEMP_SIZE + 4, "%s.mol", control->sim_name ); out_control->mol = fopen( temp, "w" ); if ( control->num_ignored ) { - sprintf( temp, "%s.ign", control->sim_name ); + snprintf( temp, TEMP_SIZE + 4, "%s.ign", control->sim_name ); out_control->ign = fopen( temp, "w" ); } } @@ -858,6 +861,8 @@ void Init_Out_Controls( reax_system *system, control_params *control, fprintf( stderr, "FILE OPEN ERROR. TERMINATING..." ); exit( CANNOT_OPEN_FILE ); }*/ + +#undef TEMP_SIZE } @@ -1001,6 +1006,11 @@ void Finalize_Workspace( reax_system *system, control_params *control, Deallocate_Matrix( workspace->L ); Deallocate_Matrix( workspace->U ); } + if ( control->cm_solver_pre_comp_type == SAI_PC ) + { + Deallocate_Matrix( workspace->H_spar_patt ); + Deallocate_Matrix( workspace->H_app_inv ); + } for ( i = 0; i < 5; ++i ) { diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c index 2094d4f4..c6f56ed4 100644 --- a/sPuReMD/src/lin_alg.c +++ b/sPuReMD/src/lin_alg.c @@ -20,11 +20,26 @@ ----------------------------------------------------------------------*/ #include "lin_alg.h" - #include "allocate.h" #include "tool_box.h" #include "vector.h" +#include "print_utils.h" + +/* Intel MKL */ +#if defined(HAVE_LAPACKE_MKL) +#include "mkl.h" +/* reference LAPACK */ +#elif defined(HAVE_LAPACKE) +#include "lapacke.h" +#endif + +typedef struct +{ + unsigned int j; + real val; +} sparse_matrix_entry; + /* global to make OpenMP shared (Sparse_MatVec) */ #ifdef _OPENMP @@ -61,13 +76,1733 @@ sparse_matrix *H_p; real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL; +#if defined(TEST_MAT) +static sparse_matrix * create_test_mat( void ) +{ + unsigned int i, n; + sparse_matrix *H_test; + + if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE ) + { + fprintf( stderr, "not enough memory for test matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + //3x3, SPD, store lower half + i = 0; + n = 0; + H_test->start[n] = i; + H_test->j[i] = 0; + H_test->val[i] = 4.; + ++i; + ++n; + H_test->start[n] = i; + H_test->j[i] = 0; + H_test->val[i] = 12.; + ++i; + H_test->j[i] = 1; + H_test->val[i] = 37.; + ++i; + ++n; + H_test->start[n] = i; + H_test->j[i] = 0; + H_test->val[i] = -16.; + ++i; + H_test->j[i] = 1; + H_test->val[i] = -43.; + ++i; + H_test->j[i] = 2; + H_test->val[i] = 98.; + ++i; + ++n; + H_test->start[n] = i; + + return H_test; +} +#endif + + +/* Routine used with qsort for sorting nonzeros within a sparse matrix row + * + * v1/v2: pointers to column indices of nonzeros within a row (unsigned int) + */ +static int compare_matrix_entry(const void *v1, const void *v2) +{ + /* larger element has larger column index */ + return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j; +} + + +/* Routine used for sorting nonzeros within a sparse matrix row; + * internally, a combination of qsort and manual sorting is utilized + * (parallel calls to qsort when multithreading, rows mapped to threads) + * + * A: sparse matrix for which to sort nonzeros within a row, stored in CSR format + */ +void Sort_Matrix_Rows( sparse_matrix * const A ) +{ + unsigned int i, j, si, ei; + sparse_matrix_entry *temp; + +#ifdef _OPENMP + // #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr) +#endif + { + if ( ( temp = (sparse_matrix_entry *) malloc( A->n * sizeof(sparse_matrix_entry)) ) == NULL ) + { + fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + /* sort each row of A using column indices */ +#ifdef _OPENMP + // #pragma omp for schedule(guided) +#endif + for ( i = 0; i < A->n; ++i ) + { + si = A->start[i]; + ei = A->start[i + 1]; + + for ( j = 0; j < (ei - si); ++j ) + { + (temp + j)->j = A->j[si + j]; + (temp + j)->val = A->val[si + j]; + } + + /* polymorphic sort in standard C library using column indices */ + qsort( temp, ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry ); + + for ( j = 0; j < (ei - si); ++j ) + { + A->j[si + j] = (temp + j)->j; + A->val[si + j] = (temp + j)->val; + } + } + + sfree( temp, "Sort_Matrix_Rows::temp" ); + } +} + + +/* Convert a symmetric, half-sored sparse matrix into + * a full-stored sparse matrix + * + * A: symmetric sparse matrix, lower half stored in CSR + * A_full: resultant full sparse matrix in CSR + * If A_full is NULL, allocate space, otherwise do not + * + * Assumptions: + * A has non-zero diagonals + * Each row of A has at least one non-zero (i.e., no rows with all zeros) */ +static void compute_full_sparse_matrix( const sparse_matrix * const A, + sparse_matrix ** A_full ) +{ + int count, i, pj; + sparse_matrix *A_t; + + if ( *A_full == NULL ) + { + if ( Allocate_Matrix( A_full, A->n, 2 * A->m - A->n ) == FAILURE ) + { + fprintf( stderr, "not enough memory for full A. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } + + + if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE ) + { + fprintf( stderr, "not enough memory for full A. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + /* Set up the sparse matrix data structure for A. */ + Transpose( A, A_t ); + + count = 0; + for ( i = 0; i < A->n; ++i ) + { + + if ((*A_full)->start == NULL) + { + } + (*A_full)->start[i] = count; + + /* A: symmetric, lower triangular portion only stored */ + for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + { + (*A_full)->val[count] = A->val[pj]; + (*A_full)->j[count] = A->j[pj]; + ++count; + } + /* A^T: symmetric, upper triangular portion only stored; + * skip diagonal from A^T, as included from A above */ + for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj ) + { + (*A_full)->val[count] = A_t->val[pj]; + (*A_full)->j[count] = A_t->j[pj]; + ++count; + } + } + (*A_full)->start[i] = count; + + Deallocate_Matrix( A_t ); +} + + +/* Setup routines for sparse approximate inverse preconditioner + * + * A: symmetric sparse matrix, lower half stored in CSR + * filter: + * A_spar_patt: + * + * Assumptions: + * A has non-zero diagonals + * Each row of A has at least one non-zero (i.e., no rows with all zeros) */ +void Setup_Sparsity_Pattern( const sparse_matrix * const A, + const real filter, sparse_matrix ** A_spar_patt ) +{ + int i, pj, size; + real min, max, threshold, val; + + min = 0.0; + max = 0.0; + + if ( *A_spar_patt == NULL ) + { + if ( Allocate_Matrix( A_spar_patt, A->n, A->m ) == FAILURE ) + { + fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } + else if ( ((*A_spar_patt)->m) < (A->m) ) + { + Deallocate_Matrix( *A_spar_patt ); + if ( Allocate_Matrix( A_spar_patt, A->n, A->m ) == FAILURE ) + { + fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } + + // find min and max element of the matrix + for ( i = 0; i < A->n; ++i ) + { + for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + { + val = A->val[pj]; + if ( pj == 0 ) + { + min = val; + max = val; + } + else + { + if ( min > val ) + { + min = val; + } + if ( max < val ) + { + max = val; + } + } + } + } + + threshold = min + (max - min) * (1.0 - filter); + // calculate the nnz of the sparsity pattern + // for ( size = 0, i = 0; i < A->n; ++i ) + // { + // for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + // { + // if ( threshold <= A->val[pj] ) + // size++; + // } + // } + // + // if ( Allocate_Matrix( A_spar_patt, A->n, size ) == NULL ) + // { + // fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" ); + // exit( INSUFFICIENT_MEMORY ); + // } + + //(*A_spar_patt)->start[(*A_spar_patt)->n] = size; + + // fill the sparsity pattern + for ( size = 0, i = 0; i < A->n; ++i ) + { + (*A_spar_patt)->start[i] = size; + + for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + { + if ( ( A->val[pj] >= threshold ) || ( A->j[pj] == i ) ) + { + (*A_spar_patt)->val[size] = A->val[pj]; + (*A_spar_patt)->j[size] = A->j[pj]; + size++; + } + } + } + (*A_spar_patt)->start[A->n] = size; +} + +void Calculate_Droptol( const sparse_matrix * const A, + real * const droptol, const real dtol ) +{ + int i, j, k; + real val; +#ifdef _OPENMP + static real *droptol_local; + unsigned int tid; +#endif + +#ifdef _OPENMP + #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr) +#endif + { +#ifdef _OPENMP + tid = omp_get_thread_num(); + + #pragma omp master + { + /* keep b_local for program duration to avoid allocate/free + * overhead per Sparse_MatVec call*/ + if ( droptol_local == NULL ) + { + if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL ) + { + fprintf( stderr, "Not enough space for droptol. Terminating...\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } + } + + #pragma omp barrier +#endif + + /* init droptol to 0 */ + for ( i = 0; i < A->n; ++i ) + { +#ifdef _OPENMP + droptol_local[tid * A->n + i] = 0.0; +#else + droptol[i] = 0.0; +#endif + } + +#ifdef _OPENMP + #pragma omp barrier +#endif + + /* calculate sqaure of the norm of each row */ +#ifdef _OPENMP + #pragma omp for schedule(static) +#endif + for ( i = 0; i < A->n; ++i ) + { + for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k ) + { + j = A->j[k]; + val = A->val[k]; + +#ifdef _OPENMP + droptol_local[tid * A->n + i] += val * val; + droptol_local[tid * A->n + j] += val * val; +#else + droptol[i] += val * val; + droptol[j] += val * val; +#endif + } + + // diagonal entry + val = A->val[k]; +#ifdef _OPENMP + droptol_local[tid * A->n + i] += val * val; +#else + droptol[i] += val * val; +#endif + } + +#ifdef _OPENMP + #pragma omp barrier + + #pragma omp for schedule(static) + for ( i = 0; i < A->n; ++i ) + { + droptol[i] = 0.0; + for ( k = 0; k < omp_get_num_threads(); ++k ) + { + droptol[i] += droptol_local[k * A->n + i]; + } + } + + #pragma omp barrier +#endif + + /* calculate local droptol for each row */ + //fprintf( stderr, "droptol: " ); +#ifdef _OPENMP + #pragma omp for schedule(static) +#endif + for ( i = 0; i < A->n; ++i ) + { + //fprintf( stderr, "%f-->", droptol[i] ); + droptol[i] = SQRT( droptol[i] ) * dtol; + //fprintf( stderr, "%f ", droptol[i] ); + } + //fprintf( stderr, "\n" ); + } +} + + +int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol ) +{ + int i, pj; + int fillin; + real val; + + fillin = 0; + +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) private(i, pj, val) reduction(+: fillin) +#endif + for ( i = 0; i < A->n; ++i ) + { + for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) + { + val = A->val[pj]; + + if ( FABS(val) > droptol[i] ) + { + ++fillin; + } + } + } + + return fillin + A->n; +} + + +#if defined(HAVE_SUPERLU_MT) +real SuperLU_Factorize( const sparse_matrix * const A, + sparse_matrix * const L, sparse_matrix * const U ) +{ + unsigned int i, pj, count, *Ltop, *Utop, r; + sparse_matrix *A_t; + SuperMatrix A_S, AC_S, L_S, U_S; + NCformat *A_S_store; + SCPformat *L_S_store; + NCPformat *U_S_store; + superlumt_options_t superlumt_options; + pxgstrf_shared_t pxgstrf_shared; + pdgstrf_threadarg_t *pdgstrf_threadarg; + int_t nprocs; + fact_t fact; + trans_t trans; + yes_no_t refact, usepr; + real u, drop_tol; + real *a, *at; + int_t *asub, *atsub, *xa, *xat; + int_t *perm_c; /* column permutation vector */ + int_t *perm_r; /* row permutations from partial pivoting */ + void *work; + int_t info, lwork; + int_t permc_spec, panel_size, relax; + Gstat_t Gstat; + flops_t flopcnt; + + /* Default parameters to control factorization. */ +#ifdef _OPENMP + //TODO: set as global parameter and use + #pragma omp parallel \ + default(none) shared(nprocs) + { + #pragma omp master + { + /* SuperLU_MT spawns threads internally, so set and pass parameter */ + nprocs = omp_get_num_threads(); + } + } +#else + nprocs = 1; +#endif + + // fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */ + fact = DOFACT; /* factor from scratch */ + trans = NOTRANS; + refact = NO; /* first time factorization */ + //TODO: add to control file and use the value there to set these + panel_size = sp_ienv(1); /* # consec. cols treated as unit task */ + relax = sp_ienv(2); /* # cols grouped as relaxed supernode */ + u = 1.0; /* diagonal pivoting threshold */ + usepr = NO; + drop_tol = 0.0; + work = NULL; + lwork = 0; + +#if defined(DEBUG) + fprintf( stderr, "nprocs = %d\n", nprocs ); + fprintf( stderr, "Panel size = %d\n", panel_size ); + fprintf( stderr, "Relax = %d\n", relax ); +#endif + + if ( !(perm_r = intMalloc(A->n)) ) + { + SUPERLU_ABORT("Malloc fails for perm_r[]."); + } + if ( !(perm_c = intMalloc(A->n)) ) + { + SUPERLU_ABORT("Malloc fails for perm_c[]."); + } + if ( !(superlumt_options.etree = intMalloc(A->n)) ) + { + SUPERLU_ABORT("Malloc fails for etree[]."); + } + if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) ) + { + SUPERLU_ABORT("Malloc fails for colcnt_h[]."); + } + if ( !(superlumt_options.part_super_h = intMalloc(A->n)) ) + { + SUPERLU_ABORT("Malloc fails for part_super__h[]."); + } + if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL ) + || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL ) + || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL ) + || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) + || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) ) + { + fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" ); + exit( INSUFFICIENT_MEMORY ); + } + if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE ) + { + fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + /* Set up the sparse matrix data structure for A. */ + Transpose( A, A_t ); + + count = 0; + for ( i = 0; i < A->n; ++i ) + { + xa[i] = count; + for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + { + a[count] = A->entries[pj].val; + asub[count] = A->entries[pj].j; + ++count; + } + for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj ) + { + a[count] = A_t->entries[pj].val; + asub[count] = A_t->entries[pj].j; + ++count; + } + } + xa[i] = count; + + dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa, + &at, &atsub, &xat ); + + for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) + fprintf( stderr, "%6d", asub[i] ); + fprintf( stderr, "\n" ); + for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) + fprintf( stderr, "%6.1f", a[i] ); + fprintf( stderr, "\n" ); + for ( i = 0; i <= A->n; ++i ) + fprintf( stderr, "%6d", xa[i] ); + fprintf( stderr, "\n" ); + for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) + fprintf( stderr, "%6d", atsub[i] ); + fprintf( stderr, "\n" ); + for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) + fprintf( stderr, "%6.1f", at[i] ); + fprintf( stderr, "\n" ); + for ( i = 0; i <= A->n; ++i ) + fprintf( stderr, "%6d", xat[i] ); + fprintf( stderr, "\n" ); + + A_S.Stype = SLU_NC; /* column-wise, no supernode */ + A_S.Dtype = SLU_D; /* double-precision */ + A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */ + A_S.nrow = A->n; + A_S.ncol = A->n; + A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); + A_S_store = (NCformat *) A_S.Store; + A_S_store->nnz = 2 * A->start[A->n] - A->n; + A_S_store->nzval = at; + A_S_store->rowind = atsub; + A_S_store->colptr = xat; + + /* ------------------------------------------------------------ + Allocate storage and initialize statistics variables. + ------------------------------------------------------------*/ + StatAlloc( A->n, nprocs, panel_size, relax, &Gstat ); + StatInit( A->n, nprocs, &Gstat ); + + /* ------------------------------------------------------------ + Get column permutation vector perm_c[], according to permc_spec: + permc_spec = 0: natural ordering + permc_spec = 1: minimum degree ordering on structure of A'*A + permc_spec = 2: minimum degree ordering on structure of A'+A + permc_spec = 3: approximate minimum degree for unsymmetric matrices + ------------------------------------------------------------*/ + permc_spec = 0; + get_perm_c( permc_spec, &A_S, perm_c ); + + /* ------------------------------------------------------------ + Initialize the option structure superlumt_options using the + user-input parameters; + Apply perm_c to the columns of original A to form AC. + ------------------------------------------------------------*/ + pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax, + u, usepr, drop_tol, perm_c, perm_r, + work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat ); + + for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i ) + fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] ); + fprintf( stderr, "\n" ); + + /* ------------------------------------------------------------ + Compute the LU factorization of A. + The following routine will create nprocs threads. + ------------------------------------------------------------*/ + pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info ); + + fprintf( stderr, "INFO: %d\n", info ); + + flopcnt = 0; + for (i = 0; i < nprocs; ++i) + { + flopcnt += Gstat.procstat[i].fcops; + } + Gstat.ops[FACT] = flopcnt; + +#if defined(DEBUG) + printf("\n** Result of sparse LU **\n"); + L_S_store = (SCPformat *) L_S.Store; + U_S_store = (NCPformat *) U_S.Store; + printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz ); + printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz ); + fflush( stdout ); +#endif + + /* convert L and R from SuperLU formats to CSR */ + memset( Ltop, 0, (A->n + 1) * sizeof(int) ); + memset( Utop, 0, (A->n + 1) * sizeof(int) ); + memset( L->start, 0, (A->n + 1) * sizeof(int) ); + memset( U->start, 0, (A->n + 1) * sizeof(int) ); + + for ( i = 0; i < 2 * L_S_store->nnz; ++i ) + fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] ); + fprintf( stderr, "\n" ); + for ( i = 0; i < 2 * U_S_store->nnz; ++i ) + fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] ); + fprintf( stderr, "\n" ); + + printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper ); + for ( i = 0; i < A->n; ++i ) + { + fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] ); + fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] ); + //TODO: correct for SCPformat for L? + //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj ) + // for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj ) + // { + // ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1]; + // } + fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] ); + fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] ); + for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) + { + ++Utop[U_S_store->rowind[pj] + 1]; + fprintf( stderr, "Utop[%5d] = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] ); + } + } + for ( i = 1; i <= A->n; ++i ) + { + // Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1]; + Utop[i] = U->start[i] = Utop[i] + Utop[i - 1]; + // fprintf( stderr, "Utop[%5d] = %d\n", i, Utop[i] ); + // fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] ); + } + for ( i = 0; i < A->n; ++i ) + { + // for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj ) + // { + // r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj]; + // L->entries[Ltop[r]].j = r; + // L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj]; + // ++Ltop[r]; + // } + for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) + { + r = U_S_store->rowind[pj]; + U->entries[Utop[r]].j = i; + U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj]; + ++Utop[r]; + } + } + + /* ------------------------------------------------------------ + Deallocate storage after factorization. + ------------------------------------------------------------*/ + pxgstrf_finalize( &superlumt_options, &AC_S ); + Deallocate_Matrix( A_t ); + sfree( xa, "SuperLU_Factorize::xa" ); + sfree( asub, "SuperLU_Factorize::asub" ); + sfree( a, "SuperLU_Factorize::a" ); + SUPERLU_FREE( perm_r ); + SUPERLU_FREE( perm_c ); + SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind ); + SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr ); + SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval ); + SUPERLU_FREE( A_S.Store ); + if ( lwork == 0 ) + { + Destroy_SuperNode_SCP(&L_S); + Destroy_CompCol_NCP(&U_S); + } + else if ( lwork > 0 ) + { + SUPERLU_FREE(work); + } + StatFree(&Gstat); + + sfree( Utop, "SuperLU_Factorize::Utop" ); + sfree( Ltop, "SuperLU_Factorize::Ltop" ); + + //TODO: return iters + return 0.; +} +#endif + + +/* Diagonal (Jacobi) preconditioner computation */ +real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv ) +{ + unsigned int i; + real start; + + start = Get_Time( ); + +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) private(i) +#endif + for ( i = 0; i < H->n; ++i ) + { + if ( H->val[H->start[i + 1] - 1] != 0.0 ) + { + Hdia_inv[i] = 1.0 / H->val[H->start[i + 1] - 1]; + } + else + { + Hdia_inv[i] = 1.0; + } + } + + return Get_Timing_Info( start ); +} + + +/* Incomplete Cholesky factorization with dual thresholding */ +real ICHOLT( const sparse_matrix * const A, const real * const droptol, + sparse_matrix * const L, sparse_matrix * const U ) +{ + int *tmp_j; + real *tmp_val; + int i, j, pj, k1, k2, tmptop, Ltop; + real val, start; + unsigned int *Utop; + + start = Get_Time( ); + + if ( ( Utop = (unsigned int*) malloc((A->n + 1) * sizeof(unsigned int)) ) == NULL || + ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL || + ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL ) + { + fprintf( stderr, "[ICHOLT] Not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + // clear variables + Ltop = 0; + tmptop = 0; + memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) ); + memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) ); + memset( Utop, 0, A->n * sizeof(unsigned int) ); + + for ( i = 0; i < A->n; ++i ) + { + L->start[i] = Ltop; + tmptop = 0; + + for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) + { + j = A->j[pj]; + val = A->val[pj]; + + if ( FABS(val) > droptol[i] ) + { + k1 = 0; + k2 = L->start[j]; + while ( k1 < tmptop && k2 < L->start[j + 1] ) + { + if ( tmp_j[k1] < L->j[k2] ) + { + ++k1; + } + else if ( tmp_j[k1] > L->j[k2] ) + { + ++k2; + } + else + { + val -= (tmp_val[k1++] * L->val[k2++]); + } + } + + // L matrix is lower triangular, + // so right before the start of next row comes jth diagonal + val /= L->val[L->start[j + 1] - 1]; + + tmp_j[tmptop] = j; + tmp_val[tmptop] = val; + ++tmptop; + } + } + + // sanity check + if ( A->j[pj] != i ) + { + fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i ); + exit( NUMERIC_BREAKDOWN ); + } + + // compute the ith diagonal in L + val = A->val[pj]; + for ( k1 = 0; k1 < tmptop; ++k1 ) + { + val -= (tmp_val[k1] * tmp_val[k1]); + } + +#if defined(DEBUG) + if ( val < 0.0 ) + { + fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i ); + exit( NUMERIC_BREAKDOWN ); + + } +#endif + + tmp_j[tmptop] = i; + tmp_val[tmptop] = SQRT( val ); + + // apply the dropping rule once again + //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop ); + //for( k1 = 0; k1<= tmptop; ++k1 ) + // fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val ); + //fprintf( stderr, "\n" ); + //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); + for ( k1 = 0; k1 < tmptop; ++k1 ) + { + if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] ) + { + L->j[Ltop] = tmp_j[k1]; + L->val[Ltop] = tmp_val[k1]; + U->start[tmp_j[k1] + 1]++; + ++Ltop; + //fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val ); + } + } + // keep the diagonal in any case + L->j[Ltop] = tmp_j[k1]; + L->val[Ltop] = tmp_val[k1]; + ++Ltop; + //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val ); + } + + L->start[i] = Ltop; + // fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); + + /* U = L^T (Cholesky factorization) */ + Transpose( L, U ); + // for ( i = 1; i <= U->n; ++i ) + // { + // Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1; + // } + // for ( i = 0; i < L->n; ++i ) + // { + // for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj ) + // { + // j = L->j[pj]; + // U->j[Utop[j]] = i; + // U->val[Utop[j]] = L->val[pj]; + // Utop[j]++; + // } + // } + + // fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); + + sfree( tmp_val, "ICHOLT::tmp_val" ); + sfree( tmp_j, "ICHOLT::tmp_j" ); + sfree( Utop, "ICHOLT::Utop" ); + + return Get_Timing_Info( start ); +} + + +/* Fine-grained (parallel) incomplete Cholesky factorization + * + * Reference: + * Edmond Chow and Aftab Patel + * Fine-Grained Parallel Incomplete LU Factorization + * SIAM J. Sci. Comp. */ +#if defined(TESTING) +real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, + sparse_matrix * const U_t, sparse_matrix * const U ) +{ + unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y; + real *D, *D_inv, sum, start; + sparse_matrix *DAD; + int *Utop; + + start = Get_Time( ); + + if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE || + ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || + ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL || + ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL ) + { + fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) shared(D_inv, D) private(i) +#endif + for ( i = 0; i < A->n; ++i ) + { + D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] ); + D[i] = 1. / D_inv[i]; + } + + memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) ); + memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) ); + + /* to get convergence, A must have unit diagonal, so apply + * transformation DAD, where D = D(1./SQRT(D(A))) */ + memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); +#ifdef _OPENMP + #pragma omp parallel for schedule(guided) \ + default(none) shared(DAD, D_inv, D) private(i, pj) +#endif + for ( i = 0; i < A->n; ++i ) + { + /* non-diagonals */ + for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) + { + DAD->j[pj] = A->j[pj]; + DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]]; + } + /* diagonal */ + DAD->j[pj] = A->j[pj]; + DAD->val[pj] = 1.; + } + + /* initial guesses for U^T, + * assume: A and DAD symmetric and stored lower triangular */ + memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) ); + memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) ); + memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) ); + + for ( i = 0; i < sweeps; ++i ) + { + /* for each nonzero */ +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y) +#endif + for ( j = 0; j < A->start[A->n]; ++j ) + { + sum = ZERO; + + /* determine row bounds of current nonzero */ + x = 0; + ei_x = 0; + for ( k = 0; k <= A->n; ++k ) + { + if ( U_t->start[k] > j ) + { + x = U_t->start[k - 1]; + ei_x = U_t->start[k]; + break; + } + } + /* column bounds of current nonzero */ + y = U_t->start[U_t->j[j]]; + ei_y = U_t->start[U_t->j[j] + 1]; + + /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */ + while ( U_t->j[x] < U_t->j[j] && + U_t->j[y] < U_t->j[j] && + x < ei_x && y < ei_y ) + { + if ( U_t->j[x] == U_t->j[y] ) + { + sum += (U_t->val[x] * U_t->val[y]); + ++x; + ++y; + } + else if ( U_t->j[x] < U_t->j[y] ) + { + ++x; + } + else + { + ++y; + } + } + + sum = DAD->val[j] - sum; + + /* diagonal entries */ + if ( (k - 1) == U_t->j[j] ) + { + /* sanity check */ + if ( sum < ZERO ) + { + fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n"); +#if defined(DEBUG_FOCUS) + fprintf( stderr, "A(%5d,%5d) = %10.3f\n", + k - 1, A->entries[j].j, A->entries[j].val ); + fprintf( stderr, "sum = %10.3f\n", sum); +#endif + exit(NUMERIC_BREAKDOWN); + } + + U_t->val[j] = SQRT( sum ); + } + /* non-diagonal entries */ + else + { + U_t->val[j] = sum / U_t->val[ei_y - 1]; + } + } + } + + /* apply inverse transformation D^{-1}U^{T}, + * since DAD \approx U^{T}U, so + * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */ +#ifdef _OPENMP + #pragma omp parallel for schedule(guided) \ + default(none) shared(D_inv) private(i, pj) +#endif + for ( i = 0; i < A->n; ++i ) + { + for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + { + U_t->val[pj] *= D_inv[i]; + } + } + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 ); +#endif + + /* transpose U^{T} and copy into U */ + Transpose( U_t, U ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); +#endif + + Deallocate_Matrix( DAD ); + sfree( D_inv, "ICHOL_PAR::D_inv" ); + sfree( D, "ICHOL_PAR::D" ); + sfree( Utop, "ICHOL_PAR::Utop" ); + + return Get_Timing_Info( start ); +} +#endif + + +/* Fine-grained (parallel) incomplete LU factorization + * + * Reference: + * Edmond Chow and Aftab Patel + * Fine-Grained Parallel Incomplete LU Factorization + * SIAM J. Sci. Comp. + * + * A: symmetric, half-stored (lower triangular), CSR format + * sweeps: number of loops over non-zeros for computation + * L / U: factorized triangular matrices (A \approx LU), CSR format */ +real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, + sparse_matrix * const L, sparse_matrix * const U ) +{ + unsigned int i, j, k, pj, x, y, ei_x, ei_y; + real *D, *D_inv, sum, start; + sparse_matrix *DAD; + + start = Get_Time( ); + + if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE || + ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || + ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ) + { + fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) shared(D, D_inv) private(i) +#endif + for ( i = 0; i < A->n; ++i ) + { + D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) ); + D[i] = 1.0 / D_inv[i]; + // printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] ); + } + + /* to get convergence, A must have unit diagonal, so apply + * transformation DAD, where D = D(1./SQRT(abs(D(A)))) */ + memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) shared(DAD, D) private(i, pj) +#endif + for ( i = 0; i < A->n; ++i ) + { + /* non-diagonals */ + for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) + { + DAD->j[pj] = A->j[pj]; + DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]]; + } + /* diagonal */ + DAD->j[pj] = A->j[pj]; + DAD->val[pj] = 1.0; + } + + /* initial guesses for L and U, + * assume: A and DAD symmetric and stored lower triangular */ + memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) ); + memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); + memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); + /* store U^T in CSR for row-wise access and tranpose later */ + memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) ); + memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); + memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); + + /* L has unit diagonal, by convention */ +#ifdef _OPENMP + #pragma omp parallel for schedule(static) default(none) private(i) +#endif + for ( i = 0; i < A->n; ++i ) + { + L->val[L->start[i + 1] - 1] = 1.0; + } + + for ( i = 0; i < sweeps; ++i ) + { + /* for each nonzero in L */ +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) +#endif + for ( j = 0; j < DAD->start[DAD->n]; ++j ) + { + sum = ZERO; + + /* determine row bounds of current nonzero */ + x = 0; + ei_x = 0; + for ( k = 1; k <= DAD->n; ++k ) + { + if ( DAD->start[k] > j ) + { + x = DAD->start[k - 1]; + ei_x = DAD->start[k]; + break; + } + } + /* determine column bounds of current nonzero */ + y = DAD->start[DAD->j[j]]; + ei_y = DAD->start[DAD->j[j] + 1]; + + /* sparse dot product: + * dot( L(i,1:j-1), U(1:j-1,j) ) */ + while ( L->j[x] < L->j[j] && + L->j[y] < L->j[j] && + x < ei_x && y < ei_y ) + { + if ( L->j[x] == L->j[y] ) + { + sum += (L->val[x] * U->val[y]); + ++x; + ++y; + } + else if ( L->j[x] < L->j[y] ) + { + ++x; + } + else + { + ++y; + } + } + + if ( j != ei_x - 1 ) + { + L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1]; + } + } + +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) +#endif + for ( j = 0; j < DAD->start[DAD->n]; ++j ) + { + sum = ZERO; + + /* determine row bounds of current nonzero */ + x = 0; + ei_x = 0; + for ( k = 1; k <= DAD->n; ++k ) + { + if ( DAD->start[k] > j ) + { + x = DAD->start[k - 1]; + ei_x = DAD->start[k]; + break; + } + } + /* determine column bounds of current nonzero */ + y = DAD->start[DAD->j[j]]; + ei_y = DAD->start[DAD->j[j] + 1]; + + /* sparse dot product: + * dot( L(i,1:i-1), U(1:i-1,j) ) */ + while ( U->j[x] < U->j[j] && + U->j[y] < U->j[j] && + x < ei_x && y < ei_y ) + { + if ( U->j[x] == U->j[y] ) + { + sum += (L->val[y] * U->val[x]); + ++x; + ++y; + } + else if ( U->j[x] < U->j[y] ) + { + ++x; + } + else + { + ++y; + } + } + + U->val[j] = DAD->val[j] - sum; + } + } + + /* apply inverse transformation: + * since DAD \approx LU, then + * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */ +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) shared(DAD, D_inv) private(i, pj) +#endif + for ( i = 0; i < DAD->n; ++i ) + { + for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj ) + { + L->val[pj] = D_inv[i] * L->val[pj]; + /* currently storing U^T, so use row index instead of column index */ + U->val[pj] = U->val[pj] * D_inv[i]; + } + } + + Transpose_I( U ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 ); + fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); +#endif + + Deallocate_Matrix( DAD ); + sfree( D_inv, "ILU_PAR::D_inv" ); + sfree( D, "ILU_PAR::D_inv" ); + + return Get_Timing_Info( start ); +} + + +/* Fine-grained (parallel) incomplete LU factorization with thresholding + * + * Reference: + * Edmond Chow and Aftab Patel + * Fine-Grained Parallel Incomplete LU Factorization + * SIAM J. Sci. Comp. + * + * A: symmetric, half-stored (lower triangular), CSR format + * droptol: row-wise tolerances used for dropping + * sweeps: number of loops over non-zeros for computation + * L / U: factorized triangular matrices (A \approx LU), CSR format */ +real ILUT_PAR( const sparse_matrix * const A, const real * droptol, + const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U ) +{ + unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop; + real *D, *D_inv, sum, start; + sparse_matrix *DAD, *L_temp, *U_temp; + + start = Get_Time( ); + + if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE || + Allocate_Matrix( &L_temp, A->n, A->m ) == FAILURE || + Allocate_Matrix( &U_temp, A->n, A->m ) == FAILURE ) + { + fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || + ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ) + { + fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) shared(D, D_inv) private(i) +#endif + for ( i = 0; i < A->n; ++i ) + { + D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) ); + D[i] = 1.0 / D_inv[i]; + } + + /* to get convergence, A must have unit diagonal, so apply + * transformation DAD, where D = D(1./SQRT(D(A))) */ + memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) shared(DAD, D) private(i, pj) +#endif + for ( i = 0; i < A->n; ++i ) + { + /* non-diagonals */ + for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) + { + DAD->j[pj] = A->j[pj]; + DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]]; + } + /* diagonal */ + DAD->j[pj] = A->j[pj]; + DAD->val[pj] = 1.0; + } + + /* initial guesses for L and U, + * assume: A and DAD symmetric and stored lower triangular */ + memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) ); + memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); + memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); + /* store U^T in CSR for row-wise access and tranpose later */ + memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) ); + memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); + memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); + + /* L has unit diagonal, by convention */ +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) private(i) shared(L_temp) +#endif + for ( i = 0; i < A->n; ++i ) + { + L_temp->val[L_temp->start[i + 1] - 1] = 1.0; + } + + for ( i = 0; i < sweeps; ++i ) + { + /* for each nonzero in L */ +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) +#endif + for ( j = 0; j < DAD->start[DAD->n]; ++j ) + { + sum = ZERO; + + /* determine row bounds of current nonzero */ + x = 0; + ei_x = 0; + for ( k = 1; k <= DAD->n; ++k ) + { + if ( DAD->start[k] > j ) + { + x = DAD->start[k - 1]; + ei_x = DAD->start[k]; + break; + } + } + /* determine column bounds of current nonzero */ + y = DAD->start[DAD->j[j]]; + ei_y = DAD->start[DAD->j[j] + 1]; + + /* sparse dot product: + * dot( L(i,1:j-1), U(1:j-1,j) ) */ + while ( L_temp->j[x] < L_temp->j[j] && + L_temp->j[y] < L_temp->j[j] && + x < ei_x && y < ei_y ) + { + if ( L_temp->j[x] == L_temp->j[y] ) + { + sum += (L_temp->val[x] * U_temp->val[y]); + ++x; + ++y; + } + else if ( L_temp->j[x] < L_temp->j[y] ) + { + ++x; + } + else + { + ++y; + } + } + + if ( j != ei_x - 1 ) + { + L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1]; + } + } + +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) +#endif + for ( j = 0; j < DAD->start[DAD->n]; ++j ) + { + sum = ZERO; + + /* determine row bounds of current nonzero */ + x = 0; + ei_x = 0; + for ( k = 1; k <= DAD->n; ++k ) + { + if ( DAD->start[k] > j ) + { + x = DAD->start[k - 1]; + ei_x = DAD->start[k]; + break; + } + } + /* determine column bounds of current nonzero */ + y = DAD->start[DAD->j[j]]; + ei_y = DAD->start[DAD->j[j] + 1]; + + /* sparse dot product: + * dot( L(i,1:i-1), U(1:i-1,j) ) */ + while ( U_temp->j[x] < U_temp->j[j] && + U_temp->j[y] < U_temp->j[j] && + x < ei_x && y < ei_y ) + { + if ( U_temp->j[x] == U_temp->j[y] ) + { + sum += (L_temp->val[y] * U_temp->val[x]); + ++x; + ++y; + } + else if ( U_temp->j[x] < U_temp->j[y] ) + { + ++x; + } + else + { + ++y; + } + } + + U_temp->val[j] = DAD->val[j] - sum; + } + } + + /* apply inverse transformation: + * since DAD \approx LU, then + * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */ +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj) +#endif + for ( i = 0; i < DAD->n; ++i ) + { + for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj ) + { + L_temp->val[pj] = D_inv[i] * L_temp->val[pj]; + /* currently storing U^T, so use row index instead of column index */ + U_temp->val[pj] = U_temp->val[pj] * D_inv[i]; + } + } + + /* apply the dropping rule */ + Ltop = 0; + Utop = 0; + for ( i = 0; i < DAD->n; ++i ) + { + L->start[i] = Ltop; + U->start[i] = Utop; + + for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj ) + { + if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) ) + { + L->j[Ltop] = L_temp->j[pj]; + L->val[Ltop] = L_temp->val[pj]; + ++Ltop; + } + } + + /* diagonal */ + L->j[Ltop] = L_temp->j[pj]; + L->val[Ltop] = L_temp->val[pj]; + ++Ltop; + + for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj ) + { + if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) ) + { + U->j[Utop] = U_temp->j[pj]; + U->val[Utop] = U_temp->val[pj]; + ++Utop; + } + } + + /* diagonal */ + U->j[Utop] = U_temp->j[pj]; + U->val[Utop] = U_temp->val[pj]; + ++Utop; + } + + L->start[i] = Ltop; + U->start[i] = Utop; + + Transpose_I( U ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "nnz(L): %d\n", L->start[L->n] ); + fprintf( stderr, "nnz(U): %d\n", U->start[U->n] ); +#endif + + Deallocate_Matrix( U_temp ); + Deallocate_Matrix( L_temp ); + Deallocate_Matrix( DAD ); + sfree( D_inv, "ILUT_PAR::D_inv" ); + sfree( D, "ILUT_PAR::D_inv" ); + + return Get_Timing_Info( start ); +} + + +#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL) +real Sparse_Approx_Inverse( const sparse_matrix * const A, + const sparse_matrix * const A_spar_patt, + sparse_matrix ** A_app_inv ) +{ + int i, k, pj, j_temp, identity_pos; + int N, M, d_i, d_j; + lapack_int m, n, nrhs, lda, ldb, info; + int *pos_x, *pos_y; + real start; + real *e_j, *dense_matrix; + sparse_matrix *A_full = NULL, *A_spar_patt_full = NULL; + char *X, *Y; + + start = Get_Time( ); + + // Get A and A_spar_patt full matrices + compute_full_sparse_matrix( A, &A_full ); + compute_full_sparse_matrix( A_spar_patt, &A_spar_patt_full ); + + // A_app_inv will be the same as A_spar_patt_full except the val array + if ( Allocate_Matrix( A_app_inv, A_spar_patt_full->n, A_spar_patt_full->m ) == FAILURE ) + { + fprintf( stderr, "not enough memory for approximate inverse matrix. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + (*A_app_inv)->start[(*A_app_inv)->n] = A_spar_patt_full->start[A_spar_patt_full->n]; + +#ifdef _OPENMP + #pragma omp parallel default(none) \ + shared(A_full, A_spar_patt_full) private(i, k, pj, j_temp, identity_pos, \ + N, M, d_i, d_j, m, n, nrhs, lda, ldb, info, X, Y, pos_x, pos_y, e_j, dense_matrix) +#endif + { + X = (char *) smalloc( sizeof(char) * A->n, "Sparse_Approx_Inverse::X" ); + Y = (char *) smalloc( sizeof(char) * A->n, "Sparse_Approx_Inverse::Y" ); + pos_x = (int *) smalloc( sizeof(int) * A->n, "Sparse_Approx_Inverse::pos_x" ); + pos_y = (int *) smalloc( sizeof(int) * A->n, "Sparse_Approx_Inverse::pos_y" ); + + for ( i = 0; i < A->n; ++i ) + { + X[i] = 0; + Y[i] = 0; + pos_x[i] = 0; + pos_y[i] = 0; + } + +#ifdef _OPENMP + #pragma omp for schedule(static) +#endif + for ( i = 0; i < A_spar_patt_full->n; ++i ) + { + N = 0; + M = 0; + + // find column indices of nonzeros (which will be the columns indices of the dense matrix) + for ( pj = A_spar_patt_full->start[i]; pj < A_spar_patt_full->start[i + 1]; ++pj ) + { + + j_temp = A_spar_patt_full->j[pj]; + + Y[j_temp] = 1; + pos_y[j_temp] = N; + ++N; + + // for each of those indices + // search through the row of full A of that index + for ( k = A_full->start[j_temp]; k < A_full->start[j_temp + 1]; ++k ) + { + // and accumulate the nonzero column indices to serve as the row indices of the dense matrix + X[A_full->j[k]] = 1; + } + } + + // enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix + identity_pos = M; + for ( k = 0; k < A_full->n; k++) + { + if ( X[k] != 0 ) + { + pos_x[M] = k; + if ( k == i ) + { + identity_pos = M; + } + ++M; + } + } + + // allocate memory for NxM dense matrix + dense_matrix = (real *) smalloc( sizeof(real) * N * M, + "Sparse_Approx_Inverse::dense_matrix" ); + + // fill in the entries of dense matrix + for ( d_i = 0; d_i < M; ++d_i) + { + // all rows are initialized to zero + for ( d_j = 0; d_j < N; ++d_j ) + { + dense_matrix[d_i * N + d_j] = 0.0; + } + // change the value if any of the column indices is seen + for ( d_j = A_full->start[pos_x[d_i]]; + d_j < A_full->start[pos_x[d_i + 1]]; ++d_j ) + { + if ( Y[A_full->j[d_j]] == 1 ) + { + dense_matrix[d_i * N + pos_y[A_full->j[d_j]]] = A_full->val[d_j]; + } + } + + } + + /* create the right hand side of the linear equation + that is the full column of the identity matrix*/ + e_j = (real *) smalloc( sizeof(real) * M, + "Sparse_Approx_Inverse::e_j" ); + + for ( k = 0; k < M; ++k ) + { + e_j[k] = 0.0; + } + e_j[identity_pos] = 1.0; + + /* Solve the overdetermined system AX = B through the least-squares problem: + * min ||B - AX||_2 */ + m = M; + n = N; + nrhs = 1; + lda = N; + ldb = nrhs; + info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda, + e_j, ldb ); + + /* Check for the full rank */ + if ( info > 0 ) + { + fprintf( stderr, "The diagonal element %i of the triangular factor ", info ); + fprintf( stderr, "of A is zero, so that A does not have full rank;\n" ); + fprintf( stderr, "the least squares solution could not be computed.\n" ); + exit( INVALID_INPUT ); + } + + /* Print least squares solution */ + // print_matrix( "Least squares solution", n, nrhs, b, ldb ); + + // accumulate the resulting vector to build A_app_inv + (*A_app_inv)->start[i] = A_spar_patt_full->start[i]; + for ( k = A_spar_patt_full->start[i]; k < A_spar_patt_full->start[i + 1]; ++k) + { + (*A_app_inv)->j[k] = A_spar_patt_full->j[k]; + (*A_app_inv)->val[k] = e_j[k - A_spar_patt_full->start[i]]; + } + + //empty variables that will be used next iteration + sfree( dense_matrix, "Sparse_Approx_Inverse::dense_matrix" ); + sfree( e_j, "Sparse_Approx_Inverse::e_j" ); + for ( k = 0; k < A->n; ++k ) + { + X[k] = 0; + Y[k] = 0; + pos_x[k] = 0; + pos_y[k] = 0; + } + } + + sfree( pos_y, "Sparse_Approx_Inverse::pos_y" ); + sfree( pos_x, "Sparse_Approx_Inverse::pos_x" ); + sfree( Y, "Sparse_Approx_Inverse::Y" ); + sfree( X, "Sparse_Approx_Inverse::X" ); + } + + Deallocate_Matrix( A_full ); + Deallocate_Matrix( A_spar_patt_full ); + + return Get_Timing_Info( start ); +} +#endif + + /* sparse matrix-vector product Ax=b * where: * A: lower triangular matrix, stored in CSR format * x: vector * b: vector (result) */ static void Sparse_MatVec( const sparse_matrix * const A, - const real * const x, real * const b ) + const real * const x, real * const b ) { int i, j, k, n, si, ei; real H; @@ -137,6 +1872,31 @@ static void Sparse_MatVec( const sparse_matrix * const A, } +/* sparse matrix-vector product Ax = b + * where: + * A: matrix, stored in CSR format + * x: vector + * b: vector (result) */ +static void Sparse_MatVec_full( const sparse_matrix * const A, + const real * const x, real * const b ) +{ + int i, pj; + + Vector_MakeZero( b, A->n ); + +#ifdef _OPENMP + #pragma omp for schedule(static) default(none) private(i, j) +#endif + for ( i = 0; i < A->n; ++i ) + { + for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + { + b[i] += A->val[pj] * x[A->j[pj]]; + } + } +} + + /* Transpose A and copy into A^T * * A: stored in CSR @@ -217,7 +1977,7 @@ void Transpose_I( sparse_matrix * const A ) * N: dimensions of preconditioner and vectors (# rows in H) */ static void diag_pre_app( const real * const Hdia_inv, const real * const y, - real * const x, const int N ) + real * const x, const int N ) { unsigned int i; @@ -243,7 +2003,7 @@ static void diag_pre_app( const real * const Hdia_inv, const real * const y, * LU has non-zero diagonals * Each row of LU has at least one non-zero (i.e., no rows with all zeros) */ void tri_solve( const sparse_matrix * const LU, const real * const y, - real * const x, const int N, const TRIANGULARITY tri ) + real * const x, const int N, const TRIANGULARITY tri ) { int i, pj, j, si, ei; real val; @@ -301,8 +2061,8 @@ void tri_solve( const sparse_matrix * const LU, const real * const y, * LU has non-zero diagonals * Each row of LU has at least one non-zero (i.e., no rows with all zeros) */ void tri_solve_level_sched( const sparse_matrix * const LU, - const real * const y, real * const x, const int N, - const TRIANGULARITY tri, int find_levels ) + const real * const y, real * const x, const int N, + const TRIANGULARITY tri, int find_levels ) { int i, j, pj, local_row, local_level; @@ -368,10 +2128,10 @@ void tri_solve_level_sched( const sparse_matrix * const LU, ++level_rows_cnt[local_level]; } -//#if defined(DEBUG) + //#if defined(DEBUG) fprintf(stderr, "levels(L): %d\n", levels); fprintf(stderr, "NNZ(L): %d\n", LU->start[N]); -//#endif + //#endif } else { @@ -388,10 +2148,10 @@ void tri_solve_level_sched( const sparse_matrix * const LU, ++level_rows_cnt[local_level]; } -//#if defined(DEBUG) + //#if defined(DEBUG) fprintf(stderr, "levels(U): %d\n", levels); fprintf(stderr, "NNZ(U): %d\n", LU->start[N]); -//#endif + //#endif } for ( i = 1; i < levels + 1; ++i ) @@ -499,7 +2259,7 @@ static void compute_H_full( const sparse_matrix * const H ) H_full->j[count] = H->j[pj]; ++count; } - /* H^T: symmetric, upper triangular portion only stored; + /* H^T: symmetric, upper triangular portion only stored; * skip diagonal from H^T, as included from H above */ for ( pj = H_t->start[i] + 1; pj < H_t->start[i + 1]; ++pj ) { @@ -524,7 +2284,7 @@ static void compute_H_full( const sparse_matrix * const H ) * * Reference: * Umit V. Catalyurek et al. - * Graph Coloring Algorithms for Multi-core + * Graph Coloring Algorithms for Multi-core * and Massively Threaded Architectures * Parallel Computing, 2012 */ @@ -691,15 +2451,15 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) sfree( conflict_local, "graph_coloring::conflict_local" ); sfree( fb_color, "graph_coloring::fb_color" ); -//#if defined(DEBUG) -//#ifdef _OPENMP -// #pragma omp master -//#endif -// { -// for ( i = 0; i < A->n; ++i ) -// printf("Vertex: %5d, Color: %5d\n", i, color[i] ); -// } -//#endif + //#if defined(DEBUG) + //#ifdef _OPENMP + // #pragma omp master + //#endif + // { + // for ( i = 0; i < A->n; ++i ) + // printf("Vertex: %5d, Color: %5d\n", i, color[i] ); + // } + //#endif #ifdef _OPENMP #pragma omp barrier @@ -721,7 +2481,7 @@ void sort_colors( const unsigned int n, const TRIANGULARITY tri ) /* sort vertices by color (ascending within a color) * 1) count colors - * 2) determine offsets of color ranges + * 2) determine offsets of color ranges * 3) sort by color * * note: color is 1-based */ @@ -756,7 +2516,7 @@ void sort_colors( const unsigned int n, const TRIANGULARITY tri ) * tri: coloring to triangular factor to use (lower/upper) */ static void permute_vector( real * const x, const unsigned int n, const int invert_map, - const TRIANGULARITY tri ) + const TRIANGULARITY tri ) { unsigned int i; @@ -958,7 +2718,7 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H ) /* internal storage for graph coloring (global to facilitate simultaneous access to OpenMP threads) */ if ( (color = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL || - (to_color =(unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL || + (to_color = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL || (conflict = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL || (conflict_cnt = (unsigned int*) malloc(sizeof(unsigned int) * (num_thread + 1))) == NULL || (recolor = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL || @@ -978,7 +2738,7 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H ) graph_coloring( H_full, LOWER ); sort_colors( H_full->n, LOWER ); - + memcpy( H_p->start, H->start, sizeof(int) * (H->n + 1) ); memcpy( H_p->j, H->j, sizeof(int) * (H->start[H->n]) ); memcpy( H_p->val, H->val, sizeof(real) * (H->start[H->n]) ); @@ -1000,8 +2760,8 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H ) * Note: Newmann series arises from series expansion of the inverse of * the coefficient matrix in the triangular system */ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, - const real * const b, real * const x, const TRIANGULARITY tri, const - unsigned int maxiter ) + const real * const b, real * const x, const TRIANGULARITY tri, const + unsigned int maxiter ) { unsigned int i, k, si = 0, ei = 0, iter; @@ -1108,7 +2868,7 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, * Matrices have non-zero diagonals * Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */ static void apply_preconditioner( const static_storage * const workspace, const control_params * const control, - const real * const y, real * const x, const int fresh_pre ) + const real * const y, real * const x, const int fresh_pre ) { int i, si; @@ -1133,6 +2893,9 @@ static void apply_preconditioner( const static_storage * const workspace, const tri_solve( workspace->L, y, x, workspace->L->n, LOWER ); tri_solve( workspace->U, x, x, workspace->U->n, UPPER ); break; + case SAI_PC: + Sparse_MatVec_full( workspace->H_app_inv, y, x ); + break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -1151,6 +2914,9 @@ static void apply_preconditioner( const static_storage * const workspace, const tri_solve_level_sched( workspace->L, y, x, workspace->L->n, LOWER, fresh_pre ); tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre ); break; + case SAI_PC: + Sparse_MatVec_full( workspace->H_app_inv, y, x ); + break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -1161,6 +2927,7 @@ static void apply_preconditioner( const static_storage * const workspace, const switch ( control->cm_solver_pre_comp_type ) { case DIAG_PC: + case SAI_PC: fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); exit( INVALID_INPUT ); break; @@ -1170,14 +2937,14 @@ static void apply_preconditioner( const static_storage * const workspace, const #ifdef _OPENMP #pragma omp single #endif - { - memcpy( y_p, y, sizeof(real) * workspace->H->n ); - } + { + memcpy( y_p, y, sizeof(real) * workspace->H->n ); + } - permute_vector( y_p, workspace->H->n, FALSE, LOWER ); - tri_solve_level_sched( workspace->L, y_p, x, workspace->L->n, LOWER, fresh_pre ); - tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre ); - permute_vector( x, workspace->H->n, TRUE, UPPER ); + permute_vector( y_p, workspace->H->n, FALSE, LOWER ); + tri_solve_level_sched( workspace->L, y_p, x, workspace->L->n, LOWER, fresh_pre ); + tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre ); + permute_vector( x, workspace->H->n, TRUE, UPPER ); break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); @@ -1189,6 +2956,7 @@ static void apply_preconditioner( const static_storage * const workspace, const switch ( control->cm_solver_pre_comp_type ) { case DIAG_PC: + case SAI_PC: fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); exit( INVALID_INPUT ); break; @@ -1198,61 +2966,61 @@ static void apply_preconditioner( const static_storage * const workspace, const #ifdef _OPENMP #pragma omp single #endif + { + if ( Dinv_L == NULL ) { - if ( Dinv_L == NULL ) + if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL ) { - if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL ) - { - fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); } } + } /* construct D^{-1}_L */ - if ( fresh_pre == TRUE ) - { + if ( fresh_pre == TRUE ) + { #ifdef _OPENMP - #pragma omp for schedule(static) + #pragma omp for schedule(static) #endif - for ( i = 0; i < workspace->L->n; ++i ) - { - si = workspace->L->start[i + 1] - 1; - Dinv_L[i] = 1. / workspace->L->val[si]; - } + for ( i = 0; i < workspace->L->n; ++i ) + { + si = workspace->L->start[i + 1] - 1; + Dinv_L[i] = 1. / workspace->L->val[si]; } + } - jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->cm_solver_pre_app_jacobi_iters ); + jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->cm_solver_pre_app_jacobi_iters ); #ifdef _OPENMP - #pragma omp single + #pragma omp single #endif + { + if ( Dinv_U == NULL ) { - if ( Dinv_U == NULL ) + if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL ) { - if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL ) - { - fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); } } + } /* construct D^{-1}_U */ - if ( fresh_pre == TRUE ) - { + if ( fresh_pre == TRUE ) + { #ifdef _OPENMP - #pragma omp for schedule(static) + #pragma omp for schedule(static) #endif - for ( i = 0; i < workspace->U->n; ++i ) - { - si = workspace->U->start[i]; - Dinv_U[i] = 1. / workspace->U->val[si]; - } + for ( i = 0; i < workspace->U->n; ++i ) + { + si = workspace->U->start[i]; + Dinv_U[i] = 1. / workspace->U->val[si]; } + } - jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters ); - break; + jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters ); + break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -1269,149 +3037,50 @@ static void apply_preconditioner( const static_storage * const workspace, const } -/* generalized minimual residual iterative solver for sparse linear systems */ +/* generalized minimual residual method with restarting for sparse linear systems */ int GMRES( const static_storage * const workspace, const control_params * const control, - simulation_data * const data, const sparse_matrix * const H, const real * const b, - const real tol, real * const x, const int fresh_pre ) + simulation_data * const data, const sparse_matrix * const H, const real * const b, + const real tol, real * const x, const int fresh_pre ) { int i, j, k, itr, N, g_j, g_itr; real cc, tmp1, tmp2, temp, ret_temp, bnorm, time_start; N = H->n; + g_j = 0; + g_itr = 0; #ifdef _OPENMP #pragma omp parallel default(none) private(i, j, k, itr, bnorm, ret_temp) \ - shared(N, cc, tmp1, tmp2, temp, time_start, g_itr, g_j, stderr) + shared(N, cc, tmp1, tmp2, temp, time_start, g_itr, g_j, stderr) #endif { j = 0; itr = 0; -#ifdef _OPENMP - #pragma omp master -#endif - { - time_start = Get_Time( ); - } + time_start = Get_Time( ); bnorm = Norm( b, N ); -#ifdef _OPENMP - #pragma omp master -#endif - { - data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); - } - - if ( control->cm_solver_pre_comp_type == DIAG_PC ) - { - /* apply preconditioner to residual */ -#ifdef _OPENMP - #pragma omp master -#endif - { - time_start = Get_Time( ); - } - apply_preconditioner( workspace, control, b, workspace->b_prc, fresh_pre ); -#ifdef _OPENMP - #pragma omp master -#endif - { - data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); - } - } + data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); /* GMRES outer-loop */ for ( itr = 0; itr < control->cm_solver_max_iters; ++itr ) { /* calculate r0 */ -#ifdef _OPENMP - #pragma omp master -#endif - { - time_start = Get_Time( ); - } + time_start = Get_Time( ); Sparse_MatVec( H, x, workspace->b_prm ); -#ifdef _OPENMP - #pragma omp master -#endif - { - data->timing.cm_solver_spmv += Get_Timing_Info( time_start ); - } - - if ( control->cm_solver_pre_comp_type == DIAG_PC ) - { -#ifdef _OPENMP - #pragma omp master -#endif - { - time_start = Get_Time( ); - } - apply_preconditioner( workspace, control, workspace->b_prm, workspace->b_prm, FALSE ); -#ifdef _OPENMP - #pragma omp master -#endif - { - data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); - } - } + data->timing.cm_solver_spmv += Get_Timing_Info( time_start ); - if ( control->cm_solver_pre_comp_type == DIAG_PC ) - { -#ifdef _OPENMP - #pragma omp master -#endif - { - time_start = Get_Time( ); - } - Vector_Sum( workspace->v[0], 1., workspace->b_prc, -1., workspace->b_prm, N ); -#ifdef _OPENMP - #pragma omp master -#endif - { - data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); - } - } - else - { -#ifdef _OPENMP - #pragma omp master -#endif - { - time_start = Get_Time( ); - } - Vector_Sum( workspace->v[0], 1., b, -1., workspace->b_prm, N ); -#ifdef _OPENMP - #pragma omp master -#endif - { - data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); - } - } + time_start = Get_Time( ); + Vector_Sum( workspace->b_prc, 1., b, -1., workspace->b_prm, N ); + data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); - if ( control->cm_solver_pre_comp_type != DIAG_PC ) - { -#ifdef _OPENMP - #pragma omp master -#endif - { - time_start = Get_Time( ); - } - apply_preconditioner( workspace, control, workspace->v[0], workspace->v[0], - itr == 0 ? fresh_pre : FALSE ); -#ifdef _OPENMP - #pragma omp master -#endif - { - data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); - } - } + time_start = Get_Time( ); + apply_preconditioner( workspace, control, workspace->b_prc, workspace->v[0], + itr == 0 ? fresh_pre : FALSE ); + data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); -#ifdef _OPENMP - #pragma omp master -#endif - { - time_start = Get_Time( ); - } + time_start = Get_Time( ); ret_temp = Norm( workspace->v[0], N ); + #ifdef _OPENMP #pragma omp single #endif @@ -1419,91 +3088,54 @@ int GMRES( const static_storage * const workspace, const control_params * const workspace->g[0] = ret_temp; } - Vector_Scale( workspace->v[0], 1. / workspace->g[0], workspace->v[0], N ); -#ifdef _OPENMP - #pragma omp master -#endif - { - data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); - } + Vector_Scale( workspace->v[0], 1. / ret_temp, workspace->v[0], N ); + data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); /* GMRES inner-loop */ for ( j = 0; j < control->cm_solver_restart && FABS(workspace->g[j]) / bnorm > tol; j++ ) { /* matvec */ -#ifdef _OPENMP - #pragma omp master -#endif - { - time_start = Get_Time( ); - } - Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] ); - -#ifdef _OPENMP - #pragma omp master -#endif - { - data->timing.cm_solver_spmv += Get_Timing_Info( time_start ); - } - -#ifdef _OPENMP - #pragma omp master -#endif - { - time_start = Get_Time( ); - } - apply_preconditioner( workspace, control, workspace->v[j + 1], workspace->v[j + 1], FALSE ); + time_start = Get_Time( ); + Sparse_MatVec( H, workspace->v[j], workspace->b_prc ); + data->timing.cm_solver_spmv += Get_Timing_Info( time_start ); -#ifdef _OPENMP - #pragma omp master -#endif - { - data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); - } + time_start = Get_Time( ); + apply_preconditioner( workspace, control, workspace->b_prc, workspace->v[j + 1], FALSE ); + data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); // if ( control->cm_solver_pre_comp_type == DIAG_PC ) // { - /* apply modified Gram-Schmidt to orthogonalize the new residual */ + /* apply modified Gram-Schmidt to orthogonalize the new residual */ + time_start = Get_Time( ); + for ( i = 0; i <= j; i++ ) + { + ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N ); + #ifdef _OPENMP - #pragma omp master + #pragma omp single #endif { - time_start = Get_Time( ); + workspace->h[i][j] = ret_temp; } - for ( i = 0; i <= j; i++ ) - { - ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N ); -#ifdef _OPENMP - #pragma omp single -#endif - { - workspace->h[i][j] = ret_temp; - } + Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N ); - Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N ); - - } -#ifdef _OPENMP - #pragma omp master -#endif - { - data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); - } + } + data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); // } // else // { // //TODO: investigate correctness of not explicitly orthogonalizing first few vectors // /* apply modified Gram-Schmidt to orthogonalize the new residual */ -//#ifdef _OPENMP -// #pragma omp master -//#endif +// +// +// // { // time_start = Get_Time( ); // } -//#ifdef _OPENMP +// // #pragma omp single -//#endif +// // { // for ( i = 0; i < j - 1; i++ ) // { @@ -1514,30 +3146,26 @@ int GMRES( const static_storage * const workspace, const control_params * const // for ( i = MAX(j - 1, 0); i <= j; i++ ) // { // ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N ); -//#ifdef _OPENMP +// // #pragma omp single -//#endif +// // { // workspace->h[i][j] = ret_temp; // } // // Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N ); // } -//#ifdef _OPENMP -// #pragma omp master -//#endif +// +// +// // { // data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); // } // } -#ifdef _OPENMP - #pragma omp master -#endif - { - time_start = Get_Time( ); - } + time_start = Get_Time( ); ret_temp = Norm( workspace->v[j + 1], N ); + #ifdef _OPENMP #pragma omp single #endif @@ -1546,41 +3174,35 @@ int GMRES( const static_storage * const workspace, const control_params * const } Vector_Scale( workspace->v[j + 1], - 1.0 / workspace->h[j + 1][j], workspace->v[j + 1], N ); - -#ifdef _OPENMP - #pragma omp master -#endif - { - data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); - } + 1.0 / workspace->h[j + 1][j], workspace->v[j + 1], N ); + data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); -#ifdef _OPENMP - #pragma omp master -#endif - { - time_start = Get_Time( ); + time_start = Get_Time( ); // if ( control->cm_solver_pre_comp_type == NONE_PC || // control->cm_solver_pre_comp_type == DIAG_PC ) // { - /* Givens rotations on the upper-Hessenberg matrix to make it U */ - for ( i = 0; i <= j; i++ ) + /* Givens rotations on the upper-Hessenberg matrix to make it U */ +#ifdef _OPENMP + #pragma omp single +#endif + { + for ( i = 0; i <= j; i++ ) + { + if ( i == j ) { - if ( i == j ) - { - cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) ); - workspace->hc[j] = workspace->h[j][j] / cc; - workspace->hs[j] = workspace->h[j + 1][j] / cc; - } - - tmp1 = workspace->hc[i] * workspace->h[i][j] + + cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) ); + workspace->hc[j] = workspace->h[j][j] / cc; + workspace->hs[j] = workspace->h[j + 1][j] / cc; + } + + tmp1 = workspace->hc[i] * workspace->h[i][j] + workspace->hs[i] * workspace->h[i + 1][j]; - tmp2 = -workspace->hs[i] * workspace->h[i][j] + - workspace->hc[i] * workspace->h[i + 1][j]; + tmp2 = -workspace->hs[i] * workspace->h[i][j] + + workspace->hc[i] * workspace->h[i + 1][j]; - workspace->h[i][j] = tmp1; - workspace->h[i + 1][j] = tmp2; - } + workspace->h[i][j] = tmp1; + workspace->h[i + 1][j] = tmp2; + } // } // else // { @@ -1611,8 +3233,8 @@ int GMRES( const static_storage * const workspace, const control_params * const workspace->g[j] = tmp1; workspace->g[j + 1] = tmp2; - data->timing.cm_solver_orthog += Get_Timing_Info( time_start ); } + data->timing.cm_solver_orthog += Get_Timing_Info( time_start ); #ifdef _OPENMP #pragma omp barrier @@ -1620,12 +3242,11 @@ int GMRES( const static_storage * const workspace, const control_params * const } /* solve Hy = g: H is now upper-triangular, do back-substitution */ + time_start = Get_Time( ); #ifdef _OPENMP - #pragma omp master + #pragma omp single #endif { - time_start = Get_Time( ); - for ( i = j - 1; i >= 0; i-- ) { temp = workspace->g[i]; @@ -1636,13 +3257,11 @@ int GMRES( const static_storage * const workspace, const control_params * const workspace->y[i] = temp / workspace->h[i][i]; } - - data->timing.cm_solver_tri_solve += Get_Timing_Info( time_start ); - - /* update x = x_0 + Vy */ - time_start = Get_Time( ); } + data->timing.cm_solver_tri_solve += Get_Timing_Info( time_start ); + /* update x = x_0 + Vy */ + time_start = Get_Time( ); Vector_MakeZero( workspace->p, N ); for ( i = 0; i < j; i++ ) @@ -1651,13 +3270,7 @@ int GMRES( const static_storage * const workspace, const control_params * const } Vector_Add( x, 1., workspace->p, N ); - -#ifdef _OPENMP - #pragma omp master -#endif - { - data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); - } + data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); /* stopping condition */ if ( FABS(workspace->g[j]) / bnorm <= tol ) @@ -1667,11 +3280,11 @@ int GMRES( const static_storage * const workspace, const control_params * const } #ifdef _OPENMP - #pragma omp master + #pragma omp single #endif { - g_itr = itr; g_j = j; + g_itr = itr; } } @@ -1686,9 +3299,9 @@ int GMRES( const static_storage * const workspace, const control_params * const int GMRES_HouseHolder( const static_storage * const workspace, - const control_params * const control, simulation_data * const data, - const sparse_matrix * const H, const real * const b, real tol, - real * const x, const int fresh_pre ) + const control_params * const control, simulation_data * const data, + const sparse_matrix * const H, const real * const b, real tol, + real * const x, const int fresh_pre ) { int i, j, k, itr, N; real cc, tmp1, tmp2, temp, bnorm; @@ -1896,7 +3509,7 @@ int CG( const static_storage * const workspace, const control_params * const con #ifdef _OPENMP #pragma omp parallel default(none) private(i, tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new) \ - shared(itr, N, d, r, p, z) + shared(itr, N, d, r, p, z) #endif { b_norm = Norm( b, N ); @@ -1948,8 +3561,8 @@ int CG( const static_storage * const workspace, const control_params * const con /* Steepest Descent */ int SDM( const static_storage * const workspace, const control_params * const control, - const sparse_matrix * const H, const real * const b, const real tol, - real * const x, const int fresh_pre ) + const sparse_matrix * const H, const real * const b, const real tol, + real * const x, const int fresh_pre ) { int i, itr, N; real tmp, alpha, b_norm; @@ -1959,13 +3572,13 @@ int SDM( const static_storage * const workspace, const control_params * const co #ifdef _OPENMP #pragma omp parallel default(none) private(i, tmp, alpha, b_norm, sig) \ - shared(itr, N) + shared(itr, N) #endif { b_norm = Norm( b, N ); Sparse_MatVec( H, x, workspace->q ); - Vector_Sum( workspace->r , 1.0, b, -1.0, workspace->q, N ); + Vector_Sum( workspace->r, 1.0, b, -1.0, workspace->q, N ); apply_preconditioner( workspace, control, workspace->r, workspace->d, fresh_pre ); diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h index 3946f60a..6caf7a1a 100644 --- a/sPuReMD/src/lin_alg.h +++ b/sPuReMD/src/lin_alg.h @@ -30,6 +30,41 @@ typedef enum UPPER = 1, } TRIANGULARITY; +void Sort_Matrix_Rows( sparse_matrix * const ); + +void Setup_Sparsity_Pattern( const sparse_matrix * const, + const real, sparse_matrix ** ); + +int Estimate_LU_Fill( const sparse_matrix * const, const real * const ); + +void Calculate_Droptol( const sparse_matrix * const, + real * const, const real ); + +#if defined(HAVE_SUPERLU_MT) +real SuperLU_Factorize( const sparse_matrix * const, + sparse_matrix * const, sparse_matrix * const ); +#endif + +real diag_pre_comp( const sparse_matrix * const, real * const ); + +real ICHOLT( const sparse_matrix * const, const real * const, + sparse_matrix * const, sparse_matrix * const ); + +#if defined(TESTING) +real ICHOL_PAR( const sparse_matrix * const, const unsigned int, + sparse_matrix * const, sparse_matrix * const ); +#endif + +real ILU_PAR( const sparse_matrix * const, const unsigned int, + sparse_matrix * const, sparse_matrix * const ); + +real ILUT_PAR( const sparse_matrix * const, const real *, + const unsigned int, sparse_matrix * const, sparse_matrix * const ); + +#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL) +real Sparse_Approx_Inverse( const sparse_matrix * const, const sparse_matrix * const, + sparse_matrix ** ); +#endif void Transpose( const sparse_matrix * const, sparse_matrix * const ); diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h index f75ad168..51f24d40 100644 --- a/sPuReMD/src/mytypes.h +++ b/sPuReMD/src/mytypes.h @@ -233,6 +233,7 @@ enum pre_comp ILU_PAR_PC = 3, ILUT_PAR_PC = 4, ILU_SUPERLU_MT_PC = 5, + SAI_PC = 6, }; enum pre_app @@ -606,6 +607,7 @@ typedef struct unsigned int cm_solver_pre_comp_refactor; real cm_solver_pre_comp_droptol; unsigned int cm_solver_pre_comp_sweeps; + real cm_solver_pre_comp_sai_thres; unsigned int cm_solver_pre_app_type; unsigned int cm_solver_pre_app_jacobi_iters; @@ -889,6 +891,8 @@ typedef struct /* charge method storage */ sparse_matrix *H; sparse_matrix *H_sp; + sparse_matrix *H_spar_patt; + sparse_matrix *H_app_inv; sparse_matrix *L; sparse_matrix *U; real *droptol; diff --git a/sPuReMD/src/print_utils.c b/sPuReMD/src/print_utils.c index 1ef0a527..383fdb89 100644 --- a/sPuReMD/src/print_utils.c +++ b/sPuReMD/src/print_utils.c @@ -406,7 +406,7 @@ void Print_Near_Neighbors( reax_system *system, control_params *control, FILE *fout; reax_list *near_nbrs = &((*lists)[NEAR_NBRS]); - sprintf( fname, "%s.near_nbrs", control->sim_name ); + snprintf( fname, MAX_STR, "%s.near_nbrs", control->sim_name ); fout = fopen( fname, "w" ); for ( i = 0; i < system->N; ++i ) @@ -440,7 +440,7 @@ void Print_Near_Neighbors2( reax_system *system, control_params *control, FILE *fout; reax_list *near_nbrs = &((*lists)[NEAR_NBRS]); - sprintf( fname, "%s.near_nbrs_lgj", control->sim_name ); + snprintf( fname, MAX_STR, "%s.near_nbrs_lgj", control->sim_name ); fout = fopen( fname, "w" ); for ( i = 0; i < system->N; ++i ) @@ -475,7 +475,7 @@ void Print_Far_Neighbors( reax_system *system, control_params *control, FILE *fout; reax_list *far_nbrs = &((*lists)[FAR_NBRS]); - sprintf( fname, "%s.far_nbrs", control->sim_name ); + snprintf( fname, MAX_STR, "%s.far_nbrs", control->sim_name ); fout = fopen( fname, "w" ); for ( i = 0; i < system->N; ++i ) @@ -520,7 +520,7 @@ void Print_Far_Neighbors2( reax_system *system, control_params *control, FILE *fout; reax_list *far_nbrs = &((*lists)[FAR_NBRS]); - sprintf( fname, "%s.far_nbrs_lgj", control->sim_name ); + snprintf( fname, MAX_STR, "%s.far_nbrs_lgj", control->sim_name ); fout = fopen( fname, "w" ); int num = 0; int temp[500]; @@ -553,7 +553,7 @@ void Print_Total_Force( reax_system *system, control_params *control, int i; #if !defined(TEST_FORCES) char temp[1000]; - sprintf( temp, "%s.ftot", control->sim_name ); + snprintf( temp, 1000, "%s.ftot", control->sim_name ); out_control->ftot = fopen( temp, "w" ); #endif @@ -735,7 +735,7 @@ void Print_Linear_System( reax_system *system, control_params *control, sparse_matrix *H; FILE *out; - sprintf( fname, "%s.state%d.out", control->sim_name, step ); + snprintf( fname, 100, "%s.state%d.out", control->sim_name, step ); out = fopen( fname, "w" ); for ( i = 0; i < system->N_cm; i++ ) @@ -747,13 +747,13 @@ void Print_Linear_System( reax_system *system, control_params *control, workspace->t[0][i], workspace->b_t[i] ); fclose( out ); - // sprintf( fname, "x2_%d", step ); + // snprintf( fname, 100, "x2_%d", step ); // out = fopen( fname, "w" ); // for( i = 0; i < system->N; i++ ) // fprintf( out, "%g\n", workspace->s_t[i+system->N] ); // fclose( out ); - sprintf( fname, "%s.H%d.out", control->sim_name, step ); + snprintf( fname, 100, "%s.H%d.out", control->sim_name, step ); out = fopen( fname, "w" ); H = workspace->H; @@ -776,7 +776,7 @@ void Print_Linear_System( reax_system *system, control_params *control, fclose( out ); - sprintf( fname, "%s.H_sp%d.out", control->sim_name, step ); + snprintf( fname, 100, "%s.H_sp%d.out", control->sim_name, step ); out = fopen( fname, "w" ); H = workspace->H_sp; @@ -799,13 +799,13 @@ void Print_Linear_System( reax_system *system, control_params *control, fclose( out ); - /*sprintf( fname, "%s.b_s%d", control->sim_name, step ); + /*snprintf( fname, 100, "%s.b_s%d", control->sim_name, step ); out = fopen( fname, "w" ); for( i = 0; i < system->N; i++ ) fprintf( out, "%12.7f\n", workspace->b_s[i] ); fclose( out ); - sprintf( fname, "%s.b_t%d", control->sim_name, step ); + snprintf( fname, 100, "%s.b_t%d", control->sim_name, step ); out = fopen( fname, "w" ); for( i = 0; i < system->N; i++ ) fprintf( out, "%12.7f\n", workspace->b_t[i] ); @@ -820,7 +820,7 @@ void Print_Charges( reax_system *system, control_params *control, char fname[100]; FILE *fout; - sprintf( fname, "%s.q%d", control->sim_name, step ); + snprintf( fname, 100, "%s.q%d", control->sim_name, step ); fout = fopen( fname, "w" ); for ( i = 0; i < system->N; ++i ) @@ -1002,7 +1002,7 @@ Print_XYZ_Serial( reax_system* system, static_storage *workspace ) FILE *fout; int i; - sprintf( fname, "READ_PDB.0" ); + snprintf( fname, 100, "READ_PDB.0" ); fout = fopen( fname, "w" ); for ( i = 0; i < system->N; i++ ) diff --git a/sPuReMD/src/restart.c b/sPuReMD/src/restart.c index c15d51c5..0f7b1c5b 100644 --- a/sPuReMD/src/restart.c +++ b/sPuReMD/src/restart.c @@ -23,8 +23,9 @@ #include "box.h" #include "vector.h" + void Write_Binary_Restart( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace ) + simulation_data *data, static_storage *workspace ) { int i; char fname[MAX_STR]; @@ -33,7 +34,7 @@ void Write_Binary_Restart( reax_system *system, control_params *control, restart_header res_header; restart_atom res_data; - sprintf( fname, "%s.res%d", control->sim_name, data->step ); + snprintf( fname, MAX_STR + 8, "%s.res%d", control->sim_name, data->step ); fres = fopen( fname, "wb" ); res_header.step = data->step; @@ -139,14 +140,14 @@ void Read_Binary_Restart( char *fname, reax_system *system, void Write_ASCII_Restart( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace ) + simulation_data *data, static_storage *workspace ) { int i; char fname[MAX_STR]; FILE *fres; reax_atom *p_atom; - sprintf( fname, "%s.res%d", control->sim_name, data->step ); + snprintf( fname, MAX_STR + 8, "%s.res%d", control->sim_name, data->step ); fres = fopen( fname, "w" ); fprintf( fres, RESTART_HEADER, diff --git a/sPuReMD/src/traj.c b/sPuReMD/src/traj.c index bfdd2fcf..14e2a36b 100644 --- a/sPuReMD/src/traj.c +++ b/sPuReMD/src/traj.c @@ -31,13 +31,15 @@ int Write_Custom_Header( reax_system *system, control_params *control, static_storage *workspace, output_controls *out_control ) { +#define SIZE1 (2048) +#define SIZE2 (100) int i, header_len, control_block_len, frame_format_len; // char buffer[2048]; - char control_block[2048]; - char frame_format[2048]; - char atom_format[100], bond_format[100], angle_format[100]; + char control_block[SIZE1]; + char frame_format[SIZE1]; + char atom_format[SIZE2], bond_format[SIZE2], angle_format[SIZE2]; - sprintf( control_block, CONTROL_BLOCK, + snprintf( control_block, SIZE1, CONTROL_BLOCK, system->N, control->restart, control->restart_from, @@ -80,23 +82,23 @@ int Write_Custom_Header( reax_system *system, control_params *control, control_block_len = strlen( control_block ); - sprintf( frame_format, "Frame Format: %d\n%s\n%s\n", + snprintf( frame_format, SIZE1, "Frame Format: %d\n%s\n%s\n", NUM_FRAME_GLOBALS, FRAME_GLOBALS_FORMAT, FRAME_GLOBAL_NAMES ); atom_format[0] = OPT_NOATOM; switch ( out_control->atom_format ) { case OPT_ATOM_BASIC: - sprintf( atom_format, "Atom_Basic: %s", ATOM_BASIC ); + snprintf( atom_format, SIZE2, "Atom_Basic: %s", ATOM_BASIC ); break; case OPT_ATOM_wF: - sprintf( atom_format, "Atom_wF: %s", ATOM_wF ); + snprintf( atom_format, SIZE2, "Atom_wF: %s", ATOM_wF ); break; case OPT_ATOM_wV: - sprintf( atom_format, "Atom_wV: %s", ATOM_wV ); + snprintf( atom_format, SIZE2, "Atom_wV: %s", ATOM_wV ); break; case OPT_ATOM_FULL: - sprintf( atom_format, "Atom_Full: %s", ATOM_FULL ); + snprintf( atom_format, SIZE2, "Atom_Full: %s", ATOM_FULL ); break; default: break; @@ -106,18 +108,18 @@ int Write_Custom_Header( reax_system *system, control_params *control, bond_format[0] = OPT_NOBOND; if ( out_control->bond_info == OPT_BOND_BASIC ) { - sprintf( bond_format, "Bond_Line: %s", BOND_BASIC ); + snprintf( bond_format, SIZE2, "Bond_Line: %s", BOND_BASIC ); } else if ( out_control->bond_info == OPT_BOND_FULL ) { - sprintf( bond_format, "Bond_Line_Full: %s", BOND_FULL ); + snprintf( bond_format, SIZE2, "Bond_Line_Full: %s", BOND_FULL ); } strcat( frame_format, bond_format ); angle_format[0] = OPT_NOANGLE; if ( out_control->angle_info == OPT_ANGLE_BASIC ) { - sprintf( angle_format, "Angle_Line: %s", ANGLE_BASIC ); + snprintf( angle_format, SIZE2, "Angle_Line: %s", ANGLE_BASIC ); } strcat( frame_format, angle_format ); @@ -158,6 +160,9 @@ int Write_Custom_Header( reax_system *system, control_params *control, fflush( out_control->trj ); +#undef SIZE2 +#undef SIZE1 + return 0; } @@ -166,12 +171,13 @@ int Append_Custom_Frame( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, reax_list **lists, output_controls *out_control ) { +#define SIZE (2048) int i, j, pi, pk, pk_j; int write_atoms, write_bonds, write_angles; int frame_len, atom_line_len, bond_line_len, angle_line_len, rest_of_frame_len; int frame_globals_len, num_bonds, num_thb_intrs; real P; - char buffer[2048]; + char buffer[SIZE]; reax_list *bonds = (*lists) + BONDS; reax_list *thb_intrs = (*lists) + THREE_BODIES; bond_data *bo_ij; @@ -290,7 +296,7 @@ int Append_Custom_Frame( reax_system *system, control_params *control, } /* calculate total frame length*/ - sprintf( buffer, FRAME_GLOBALS, + snprintf( buffer, SIZE, FRAME_GLOBALS, data->step, data->time, data->E_Tot, data->E_Pot, E_CONV * data->E_Kin, data->therm.T, P, system->box.volume, @@ -476,6 +482,8 @@ int Append_Custom_Frame( reax_system *system, control_params *control, fflush( out_control->trj ); +#undef SIZE + return 0; } -- GitLab