diff --git a/configure.in b/configure.in
index 5412fd2b9f..637f0be206 100644
--- a/configure.in
+++ b/configure.in
@@ -795,7 +795,7 @@ AC_CHECK_HEADER(pcap.h,,[AC_ERROR(pcap.h not found ...)])
         if test "$with_cuda_includes" != "no"; then
             CPPFLAGS="${CPPFLAGS} -I${with_cuda_includes}"
         else
-            CPPFLAGS="${CPPFLAGS} -I/usr/include/cuda"
+            CPPFLAGS="${CPPFLAGS} -I/usr/local/cuda/include"
         fi
 
         if test "$with_cuda_libraries" != "no"; then
@@ -819,24 +819,34 @@ AC_CHECK_HEADER(pcap.h,,[AC_ERROR(pcap.h not found ...)])
             exit 1
         fi
 
-        AC_PATH_PROG([NVCC], [nvcc], , [$PATH:$NVCC_DIR])
-        if test "$NVCC" = "no"; then
+        AC_PATH_PROG([NVCC], [nvcc], no, [$PATH:$NVCC_DIR])
+        if test "x$NVCC" = "xno"; then
             echo
             echo "   ERROR! CUDA nvcc compiler not found: use --with-cuda-nvcc=DIR"
             echo
             exit 1
         fi
 
-        AC_PATH_PROG([PYTHON], [python], no)
-        if test "$PYTHON" = "no"; then
+        AC_MSG_CHECKING(for nvcc version)
+        NVCCVER=`$NVCC --version | grep "release" | sed 's/.*release \(@<:@0-9@:>@@<:@0-9@:>@*\)\.\(@<:@0-9@:>@\).*/\1\2/'`
+        AC_MSG_RESULT($NVCCVER)
+        if test "$NVCCVER" -lt 31 2>/dev/null; then # e.g. 31 means release 3.1 and 101 means release 10.1
             echo
-            echo "   ERROR! Compiling CUDA source requires python interpreter"
+            echo "   Warning! Your CUDA nvcc version might be outdated."
+            echo "   If compilation fails try the latest CUDA toolkit from"
+            echo "   www.nvidia.com/object/cuda_develop.html"
+            echo
+        fi
+
+        AM_PATH_PYTHON(,, [PYTHON=no])
+        if test "x$PYTHON" = "xno"; then # PYTHON=no is assigned by the not-found action above
+            echo
+            echo "   ERROR! Compiling CUDA kernels requires python."
             echo
             exit 1
         fi
     ])
-    AM_CONDITIONAL([BUILD_CUDA], [test "${NVCC}" != ""])
-
+    AM_CONDITIONAL([BUILD_CUDA], [test "x$enable_cuda" = "xyes"])
 
 # Check for libcap-ng
 
diff --git a/src/Makefile.am b/src/Makefile.am
index 47061041e1..5735db0842 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -276,27 +276,43 @@ suricata_SOURCES += cuda-ptxdump.h
 suricata_CUDA_KERNELS = \
 util-mpm-b2g-cuda-kernel.cu
 
-SMVERSIONS = 10 11 12 13 20
 NVCCFLAGS=-O2
 
-# FIXME
-PTXS =
-PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_10.ptx)
-PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_11.ptx)
-PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_12.ptx)
-PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_13.ptx)
-PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_20.ptx)
+SUFFIXES = \
+.ptx_sm_10 \
+.ptx_sm_11 \
+.ptx_sm_12 \
+.ptx_sm_13 \
+.ptx_sm_20 \
+.ptx_sm_21
 
-# template to build for different compute capabilities
-define BUILDTEMPLATE
-# PTXS += $(patsubst %.cu, %_sm_$(1).ptx, $(suricata_CUDA_KERNELS))
-%_sm_$(1).ptx: %.cu
-	$(NVCC) $(NVCCFLAGS) -o $$@ -arch=sm_$(1) -ptx $$<
-endef
-$(foreach SMVER,$(SMVERSIONS),$(eval $(call BUILDTEMPLATE,$(SMVER))))
+PTXS =  $(suricata_CUDA_KERNELS:.cu=.ptx_sm_10)
+PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_11)
+PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_12)
+PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_13)
+PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_20)
+PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_21)
+
+.cu.ptx_sm_10:
+	$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_10 -ptx $<
+
+.cu.ptx_sm_11:
+	$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_11 -ptx $<
+
+.cu.ptx_sm_12:
+	$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_12 -ptx $<
+
+.cu.ptx_sm_13:
+	$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_13 -ptx $<
+
+.cu.ptx_sm_20:
+	$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_20 -ptx $<
+
+.cu.ptx_sm_21:
+	$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_21 -ptx $<
 
 cuda-ptxdump.h: $(PTXS)
-	python ptxdump.py cuda-ptxdump $(PTXS)
+	$(PYTHON) $(srcdir)/ptxdump.py cuda-ptxdump $(PTXS)
 
 CLEANFILES = $(PTXS) cuda-ptxdump.h
 endif
diff --git a/src/ptxdump.py b/src/ptxdump.py
index 187c0c8fa6..097e517334 100644
--- a/src/ptxdump.py
+++ b/src/ptxdump.py
@@ -29,9 +29,9 @@ out_h = sys.argv[1] + ".h"
 out = open(out_h, 'w')
 
 out.writelines(header)
-out.writelines("#ifdef __SC_CUDA_SUPPORT__ \n")
-out.writelines("#ifndef __ptxdump_h__ \n")
-out.writelines("#define __ptxdump_h__ \n\n")
+out.writelines("#ifdef __SC_CUDA_SUPPORT__\n")
+out.writelines("#ifndef __ptxdump_h__\n")
+out.writelines("#define __ptxdump_h__\n\n")
 
 # write char arrays
 for file in sys.argv[2:]:
@@ -49,7 +49,7 @@ for file in sys.argv[2:]:
         if newlinecnt == 16:
             newlinecnt = 0
             out.write("\n")
-    out.write("0x00\n};\n")
+    out.write("0x00\n};\n\n")
 
     print(sys.argv[0] + ": CUmodule " + varname + " packed successfully")
 
@@ -62,8 +62,8 @@ out.writelines('\tSCLogError(SC_ERR_FATAL, "Error in SCCudaPtxDumpGetModule, mod
 out.writelines("\texit(EXIT_FAILURE);\n")
 out.writelines("};\n")
 
-out.writelines("#endif // __ptxdump_h__ \n")
-out.writelines("#endif // __SC_CUDA_SUPPORT__\n")
+out.writelines("#endif /* __ptxdump_h__ */\n")
+out.writelines("#endif /* __SC_CUDA_SUPPORT__ */\n")
 
 print(sys.argv[0] + ": " + out_h + " written successfully")