Skip to content
Snippets Groups Projects
Commit ca5e3d11 authored by Kurt A. O'Hearn's avatar Kurt A. O'Hearn
Browse files

PG-PuReMD: fix build system linker argument ordering (LDFLAGS -> LDADD). Fix...

PG-PuReMD: fix build system linker argument ordering (LDFLAGS -> LDADD). Fix inline functions. Disable optimized CUDA code generation by default (added Makefile flags to enable along with preprocessor defs to enable relevant code paths).
parent 26d2e66d
No related branches found
No related tags found
No related merge requests found
......@@ -5,19 +5,13 @@ SUFFIXES = .cu
include ../cuda.am
endif
AM_CFLAGS = -Wall -O3 -funroll-loops -fstrict-aliasing $(MPI_CFLAGS)
AM_CPPFLAGS =
AM_LDFLAGS = $(MPI_LDFLAGS)
if USE_CUDA
# default CUDA nvcc flags
# Note: cc 13 for Tesla
# Note: cc 20 for Fermi
# Note: cc 30 for Kepler K10
# Note: cc 35 for Kepler K20
NVCCFLAGS += -use_fast_math
NVCCFLAGS += -gencode arch=compute_35,code=sm_35
NVCCFLAGS += --compiler-options "$(DEFS) -D__SM_35__ -O3 -funroll-loops -fstrict-aliasing $(MPI_CFLAGS)"
# flags for CUDA compilation via NVCC (see cuda.am)
# Note: cc 13 for Tesla, cc 20 for Fermi, cc 30 for Kepler K10, cc 35 for Kepler K20/K40, etc.
#NVCCFLAGS += -use_fast_math -gencode arch=compute_35,code=sm_35
NVCCFLAGS += -use_fast_math $(NVCC_OPT_CODE)
#NVCCFLAGS += --compiler-options "$(DEFS) -D__SM_35__ -O3 -funroll-loops -fstrict-aliasing $(MPI_CFLAGS)"
NVCCFLAGS += --compiler-options "$(DEFS) $(NVCC_OPT_CODE_DEFS) -O3 -funroll-loops -fstrict-aliasing $(MPI_CFLAGS)"
#NVCCFLAGS += --ptxas-options -v
endif
......@@ -66,11 +60,11 @@ nodist_EXTRA_bin_pg_puremd_SOURCES = src/dummy.c
endif
bin_pg_puremd_CFLAGS = $(AM_CFLAGS) $(CFLAGS)
bin_pg_puremd_CFLAGS = $(AM_CFLAGS) -Wall -O3 -funroll-loops -fstrict-aliasing $(MPI_CFLAGS) $(CFLAGS)
bin_pg_puremd_CPPFLAGS = $(AM_CPPFLAGS) $(CPPFLAGS)
bin_pg_puremd_LDFLAGS = $(AM_LDFLAGS) $(LDFLAGS)
bin_pg_puremd_LDADD = $(AM_LDADD) $(MPI_LIBS) $(LDADD) -lstdc++
if USE_CUDA
bin_pg_puremd_CFLAGS += $(CUDA_CFLAGS)
bin_pg_puremd_LDFLAGS += $(CUDA_LIBS)
bin_pg_puremd_LDADD += $(CUDA_LIBS)
endif
......@@ -68,17 +68,17 @@ CC="$sav_CC"
CFLAGS="$sav_CFLAGS"
#
# try to set MPI_CFLAGS and MPI_LDFLAGS
# try to set MPI_CFLAGS and MPI_LIBS
#
MPI_CFLAGS=
MPI_LDFLAGS=
MPI_LIBS=
if test "$mpi_vendor" = "OpenMPI"
then
MPI_CFLAGS=`$MPICC --showme:compile`
MPI_LDFLAGS=`$MPICC --showme:link`
MPI_LIBS=`$MPICC --showme:link`
AC_MSG_NOTICE([OpenMPI found])
AC_MSG_NOTICE([MPI_CFLAGS=$MPI_CFLAGS])
AC_MSG_NOTICE([MPI_LDFLAGS=$MPI_LDFLAGS])
AC_MSG_NOTICE([MPI_LIBS=$MPI_LIBS])
elif test "$mpi_vendor" = "MPICH"
then
# build MPI_CFLAGS
......@@ -92,24 +92,24 @@ then
;;
esac
done
# build MPI_LDFLAGS
# build MPI_LIBS
tmp=`$MPICC -link-info | awk '{$1=""; print $0 }'`
for i in $tmp
do
case $i in
[[\\/]]*.a | ?:[[\\/]]*.a | -[[lLRu]]* | -Wl* )
MPI_LDFLAGS="$MPI_LDFLAGS $i"
MPI_LIBS="$MPI_LIBS $i"
;;
esac
done
AC_MSG_NOTICE([MPICH found])
AC_MSG_NOTICE([MPI_CFLAGS=$MPI_CFLAGS])
AC_MSG_NOTICE([MPI_LDFLAGS=$MPI_LDFLAGS])
AC_MSG_NOTICE([MPI_LIBS=$MPI_LIBS])
else
AC_MSG_WARN([Neither OpenMPI and MPICH have been recognized...])
fi
AC_SUBST(MPI_CFLAGS)
AC_SUBST(MPI_LDFLAGS)
AC_SUBST(MPI_LIBS)
# Check for CUDA support.
if test "x$BUILD_GPU" = "xyes"; then
......
......@@ -27,7 +27,7 @@ CUDA_DEVICE static inline int cuda_strcmp (char *a, char *b, int len)
}
CUDA_DEVICE static inline real atomicAdd(real* address, real val)
CUDA_DEVICE static inline real myatomicAdd(real* address, real val)
{
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
......@@ -46,17 +46,17 @@ CUDA_DEVICE static inline real atomicAdd(real* address, real val)
CUDA_DEVICE static inline void atomic_rvecAdd( rvec ret, rvec v )
{
atomicAdd ( &ret[0], v[0] );
atomicAdd ( &ret[1], v[1] );
atomicAdd ( &ret[2], v[2] );
myatomicAdd ( &ret[0], v[0] );
myatomicAdd ( &ret[1], v[1] );
myatomicAdd ( &ret[2], v[2] );
}
CUDA_DEVICE static inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
{
atomicAdd ( &ret[0], c * v[0] );
atomicAdd ( &ret[1], c * v[1] );
atomicAdd ( &ret[2], c * v[2] );
myatomicAdd ( &ret[0], c * v[0] );
myatomicAdd ( &ret[1], c * v[1] );
myatomicAdd ( &ret[2], c * v[2] );
}
#endif
......@@ -402,7 +402,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms,
pbond_jk->ta_CdDelta += CEtors3;
bo_ij->Cdbo += (CEtors4 + CEconj1);
bo_jk->Cdbo += (CEtors5 + CEconj2);
atomicAdd ( &pbond_kl->ta_Cdbo, (CEtors6 + CEconj3));
myatomicAdd ( &pbond_kl->ta_Cdbo, (CEtors6 + CEconj3));
if( control->virial == 0 ) {
/* dcos_theta_ijk */
......
......@@ -86,7 +86,8 @@ then
AC_MSG_RESULT([nvcc version : $NVCC_VERSION])
# test if architecture is 64 bits and NVCC version >= 2.3
libdir=lib
#libdir=lib #NOTE: was lib, but changed to lib64 for CUDA 8.0
libdir=lib64
if test "x$host_cpu" = xx86_64 ; then
if test "x$NVCC_VERSION" \> "x2.2" ; then
libdir=lib64
......@@ -215,23 +216,23 @@ then
NVCCFLAGS=" -deviceemu"
fi
#
AS_IF([test "x$want_cuda" = xyes],
[AS_IF([test "x$NVCCFLAGS" = x],
[dnl generate CUDA code for broad spectrum of devices
dnl Note: cc 13 for Tesla
dnl Note: cc 20 for Fermi
dnl Note: cc 30 for Kepler K10
dnl Note: cc 35 for Kepler K20
NVCCFLAGS=["-gencode arch=compute_10,code=sm_10 \
-gencode arch=compute_11,code=sm_11 \
-gencode arch=compute_13,code=sm_13 \
-gencode arch=compute_20,code=sm_20 \
-gencode arch=compute_30,code=sm_30 \
-gencode arch=compute_35,code=sm_35"]
]
)
]
)
#AS_IF([test "x$want_cuda" = xyes],
# [AS_IF([test "x$NVCCFLAGS" = x],
# [dnl generate CUDA code for broad spectrum of devices
# dnl Note: cc 13 for Tesla
# dnl Note: cc 20 for Fermi
# dnl Note: cc 30 for Kepler K10
# dnl Note: cc 35 for Kepler K20
# NVCCFLAGS=["-gencode arch=compute_10,code=sm_10 \
# -gencode arch=compute_11,code=sm_11 \
# -gencode arch=compute_13,code=sm_13 \
# -gencode arch=compute_20,code=sm_20 \
# -gencode arch=compute_30,code=sm_30 \
# -gencode arch=compute_35,code=sm_35"]
# ]
# )
# ]
# )
if test x$want_fast_math = xyes
then
NVCCFLAGS+=" -use_fast_math"
......
0% — Loading failed or the content could not be displayed.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment