diff --git a/.gitignore b/.gitignore
index 71b2700c04a2b27bf9d567332238467c0f89554c..83472db55e63edc69a5186b4ea999c4a7c9d4906 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 *.prs
 *.pot
 *.trj
+*.res*
 
 # TeX
 *.aux
diff --git a/cuda.am b/cuda.am
index eb61fb31d9cc6fa715f63ba5f8e57c505b723fae..b83b1e78a0f24153360af9f9ef04ab562b9c67ff 100644
--- a/cuda.am
+++ b/cuda.am
@@ -11,4 +11,4 @@ AM_V_NVCC_0 = @echo "  NVCC    " $@;
 AM_V_NVCC_1 =
 
 .cu.o:
-	$(AM_V_NVCC)$(NVCC) $(NVCCFLAGS) -o $@ -c $<
+	$(AM_V_NVCC)$(NVCC) $(AM_NVCCFLAGS) $(NVCCFLAGS) -o $@ -c $<
diff --git a/m4/ax_cuda.m4 b/m4/ax_cuda.m4
index 169632e774aa187ed9c67f877518eb7d843a10cb..6b7cfa72acefd7d5958d2871331f240ca12420c9 100644
--- a/m4/ax_cuda.m4
+++ b/m4/ax_cuda.m4
@@ -38,7 +38,7 @@
 #		AC_SUBST(CUDA_CFLAGS)
 #		AC_SUBST(CUDA_LIBS)
 #		AC_SUBST(NVCC)
-#		AC_SUBST(NVCCFLAGS)
+#		AC_SUBST(NFLAGS)
 #
 AC_DEFUN([AX_CUDA],
 [
@@ -61,6 +61,27 @@ AC_ARG_WITH([cuda],
 	want_cuda="yes"
 ])
 
+AC_ARG_ENABLE([cuda-fast-math],
+    [AS_HELP_STRING([--enable-cuda-fast-math], [Turn on fast, less precise math functions in CUDA])],
+    [AS_CASE(["${enableval}"],
+        [yes], [CUDA_FAST_MATH=true],
+        [no],  [CUDA_FAST_MATH=false],
+        [AC_MSG_ERROR([bad value ${enableval} for --enable-cuda-fast-math])])
+    ],
+    [CUDA_FAST_MATH=false]
+)
+
+
+AC_ARG_ENABLE([emu],
+    [AS_HELP_STRING([--enable-emu], [Turn on device emulation for CUDA])],
+    [AS_CASE(["${enableval}"],
+        [yes], [EMULATION=true],
+        [no],  [EMULATION=false],
+        [AC_MSG_ERROR([bad value ${enableval} for --enable-emu])])
+    ],
+    [EMULATION=false]
+)
+
 #AM_CONDITIONAL(USE_CUDA, test "x${want_cuda}" = xyes)
 
 if test "$want_cuda" = "yes"
@@ -141,16 +162,20 @@ then
 		AC_LANG_PROGRAM([@%:@include <cuda.h>],
 		[
 			CUmodule cuModule;
-			cuModuleLoad(&cuModule, "myModule.cubin");
 			CUdeviceptr devPtr;
 			CUfunction cuFunction;
 			size_t pitch, width = 250, height = 500;
-			cuMemAllocPitch(&devPtr, &pitch,width * sizeof(float), height, 4);
-			cuModuleGetFunction(&cuFunction, cuModule, "myKernel");
-			cuFuncSetBlockShape(cuFunction, 512, 1, 1);
-			cuParamSeti(cuFunction, 0, devPtr);
-			cuParamSetSize(cuFunction, sizeof(devPtr));
-			cuLaunchGrid(cuFunction, 100, 1);
+
+			/* Plain block: the generated test program already provides main. */
+			{
+				cuModuleLoad(&cuModule, "myModule.cubin");
+				cuMemAllocPitch(&devPtr, &pitch,width * sizeof(float), height, 4);
+				cuModuleGetFunction(&cuFunction, cuModule, "myKernel");
+				cuFuncSetBlockShape(cuFunction, 512, 1, 1);
+				cuParamSeti(cuFunction, 0, devPtr);
+				cuParamSetSize(cuFunction, sizeof(devPtr));
+				cuLaunchGrid(cuFunction, 100, 1);
+			}
 		])
 	],
 	[
@@ -175,68 +200,19 @@ then
 	fi
 fi
 
-AC_SUBST(CUDA_CFLAGS)
-AC_SUBST(CUDA_LIBS)
-AC_SUBST(NVCC)
-
-AC_ARG_WITH([cuda-fast-math],
-	[AC_HELP_STRING([--with-cuda-fast-math],
-		[Tell nvcc to use -use_fast_math flag])],
-	[
-		if test "$withval" = "no"
-		then
-			want_fast_math="no"
-		elif test "$withval" = "yes"
-		then
-			want_fast_math="yes"
-		else
-			with_fast_math="$withval"
-			want_fast_math="yes"
-		fi
-	 ],
-         [
-		want_fast_math="yes"
-	 ]
-)
-
-
-AC_ARG_ENABLE([emu],
-    AS_HELP_STRING([--enable-emu], [Turn on device emulation for CUDA]),
-    [case "${enableval}" in
-        yes) EMULATION=true ;;
-        no)  EMULATION=false ;;
-        *)   AC_MSG_ERROR([bad value ${enableval} for --enable-emu]) ;;
-    esac],
-    [EMULATION=false]
-)
-
-# default nvcc flags
 if test x$EMULATION = xtrue
 then
-    NVCCFLAGS=" -deviceemu"
+    NFLAGS="$NFLAGS -deviceemu"
 fi
  
-#AS_IF([test "x$want_cuda" = xyes],
-#    [AS_IF([test "x$NVCCFLAGS" = x],
-#        [dnl generate CUDA code for broad spectrum of devices
-#         dnl Note: cc 13 for Tesla
-#         dnl Note: cc 20 for Fermi
-#	 dnl Note: cc 30 for Kepler K10
-#	 dnl Note: cc 35 for Kepler K20
-#         NVCCFLAGS=["-gencode arch=compute_10,code=sm_10 \
-# -gencode arch=compute_11,code=sm_11 \
-# -gencode arch=compute_13,code=sm_13 \
-# -gencode arch=compute_20,code=sm_20 \
-# -gencode arch=compute_30,code=sm_30 \
-# -gencode arch=compute_35,code=sm_35"]
-#                ]
-#             )
-#            ]
-#        )
-if test x$want_fast_math = xyes
+if test x$CUDA_FAST_MATH = xtrue
 then
-	NVCCFLAGS+=" -use_fast_math"
+	NFLAGS="$NFLAGS -use_fast_math"
 fi
-AC_MSG_NOTICE([Using NVCCFLAGS=$NVCCFLAGS])
-AC_SUBST(NVCCFLAGS)
+AC_MSG_NOTICE([Using NFLAGS=$NFLAGS])
+
+AC_SUBST([CUDA_CFLAGS])
+AC_SUBST([CUDA_LIBS])
+AC_SUBST([NVCC])
+AC_SUBST([NFLAGS])
 ])