diff --git a/src/cuda-packet-batcher.c b/src/cuda-packet-batcher.c index 1259248bc2..26b8e34e4a 100644 --- a/src/cuda-packet-batcher.c +++ b/src/cuda-packet-batcher.c @@ -50,6 +50,7 @@ #include "detect-parse.h" #include "tm-threads.h" #include "tmqh-packetpool.h" +#include "util-mpm.h" /* \todo Make this user configurable through our yaml file. Also provide options * where this can be dynamically updated based on the traffic */ @@ -80,7 +81,10 @@ static int run_batcher = 1; * on the traffic * \todo make this user configurable, as well allow dynamic update of this * variable based on the traffic seen */ -static uint32_t buffer_packet_threshhold = 2400; +static uint32_t buffer_packet_threshhold = 0; + +/* the profile used by the cuda batcher */ +static MpmCudaConf *profile = NULL; /* flag used by the SIG_ALRM handler to indicate that the batcher TM should queue * the buffer to be processed by the Cuda Mpm B2g Batcher Thread for further @@ -302,10 +306,10 @@ void *SCCudaPBTmThreadsSlot1(void *td) * tm-threads.c and this custom Slot1 function is this call * here. We need to make the call here, even if we don't * receive a packet from the previous stage in the runmodes. - * This is needed in cases where we the SIG_ALRM handler + * This is needed in cases where the SIG_ALRM handler * wants us to queue the buffer to the GPU and ends up waking * the Batcher TM(which is waiting on a cond from the previous - * feeder TM). Please handler the NULL packet case in the + * feeder TM). Please handle the NULL packet case in the * function that you now call */ r = s->s.SlotFunc(tv, p, s->s.slot_data, NULL, NULL); } else { @@ -400,8 +404,8 @@ SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void) } /* the buffer for the packets to be sent over to the gpu. We allot space for - * a minimum of SC_CUDA_PB_MIN_NO_OF_PACKETS, i.e. 
if each packet buffered - * is full to the brim */ + * profile->packet_buffer_limit packets, assuming a size of + * profile->packet_size_limit for each packet */ SCCudaHlModuleData *data = NULL; data = SCCudaHlGetModuleData(SCCudaHlGetModuleHandle("SC_CUDA_PACKET_BATCHER")); if (data == NULL) { @@ -412,7 +416,7 @@ SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void) return NULL; } - if (SCCudaHlGetCudaContext(&data->cuda_context, data->handle) == -1) { + if (SCCudaHlGetCudaContext(&data->cuda_context, "mpm", data->handle) == -1) { SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Error getting cuda context"); return NULL; } @@ -422,38 +426,67 @@ SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void) "Error pushing cuda context to allocate memory"); } - if (SCCudaMemHostAlloc((void**)&pb->packets_buffer, - sizeof(SCCudaPBPacketDataForGPU) * - SC_CUDA_PB_MIN_NO_OF_PACKETS, - CU_MEMHOSTALLOC_PORTABLE | - CU_MEMHOSTALLOC_WRITECOMBINED) == -1) { - SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory"); - exit(EXIT_FAILURE); + if (profile->page_locked) { + if (SCCudaMemHostAlloc((void**)&pb->packets_buffer, + profile->packet_buffer_limit * + (profile->packet_size_limit + + sizeof(SCCudaPBPacketDataForGPUNonPayload)), + CU_MEMHOSTALLOC_PORTABLE | + CU_MEMHOSTALLOC_WRITECOMBINED) == -1) { + SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory"); + exit(EXIT_FAILURE); + } + } else { + pb->packets_buffer = malloc(profile->packet_buffer_limit * + (profile->packet_size_limit + + sizeof(SCCudaPBPacketDataForGPUNonPayload))); + if (pb->packets_buffer == NULL) { + SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } } - memset(pb->packets_buffer, 0, sizeof(SCCudaPBPacketDataForGPU) * - SC_CUDA_PB_MIN_NO_OF_PACKETS); - - /* used to hold the offsets of the buffered packets in the packets_buffer */ - if (SCCudaMemHostAlloc((void**)&pb->packets_offset_buffer, - sizeof(uint32_t) * SC_CUDA_PB_MIN_NO_OF_PACKETS, - 
CU_MEMHOSTALLOC_PORTABLE | - CU_MEMHOSTALLOC_WRITECOMBINED) == -1) { - SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory"); - exit(EXIT_FAILURE); + memset(pb->packets_buffer, 0, profile->packet_buffer_limit * + (profile->packet_size_limit + sizeof(SCCudaPBPacketDataForGPUNonPayload))); + + if (profile->page_locked) { + /* used to hold the offsets of the buffered packets in the packets_buffer */ + if (SCCudaMemHostAlloc((void**)&pb->packets_offset_buffer, + sizeof(uint32_t) * profile->packet_buffer_limit, + CU_MEMHOSTALLOC_PORTABLE | + CU_MEMHOSTALLOC_WRITECOMBINED) == -1) { + SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory"); + exit(EXIT_FAILURE); + } + } else { + pb->packets_offset_buffer = malloc(sizeof(uint32_t) * + profile->packet_buffer_limit); + if (pb->packets_offset_buffer == NULL) { + SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } } - memset(pb->packets_offset_buffer, 0, sizeof(uint32_t) * - SC_CUDA_PB_MIN_NO_OF_PACKETS); - - /* used to hold the offsets of the packets payload */ - if (SCCudaMemHostAlloc((void**)&pb->packets_payload_offset_buffer, - sizeof(uint32_t) * SC_CUDA_PB_MIN_NO_OF_PACKETS, - CU_MEMHOSTALLOC_PORTABLE | - CU_MEMHOSTALLOC_WRITECOMBINED) == -1) { - SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory"); - exit(EXIT_FAILURE); + memset(pb->packets_offset_buffer, 0, + sizeof(uint32_t) * profile->packet_buffer_limit); + + if (profile->page_locked) { + /* used to hold the offsets of the packets payload */ + if (SCCudaMemHostAlloc((void**)&pb->packets_payload_offset_buffer, + sizeof(uint32_t) * profile->packet_buffer_limit, + CU_MEMHOSTALLOC_PORTABLE | + CU_MEMHOSTALLOC_WRITECOMBINED) == -1) { + SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory"); + exit(EXIT_FAILURE); + } + } else { + pb->packets_payload_offset_buffer = malloc(sizeof(uint32_t) * + profile->packet_buffer_limit); + if (pb->packets_payload_offset_buffer == NULL) { + 
SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } } - memset(pb->packets_payload_offset_buffer, 0, sizeof(uint32_t) * - SC_CUDA_PB_MIN_NO_OF_PACKETS); + memset(pb->packets_payload_offset_buffer, 0, + sizeof(uint32_t) * profile->packet_buffer_limit); SCLogDebug("Allocated pagelocked CUDA memory"); if (SCCudaCtxPopCurrent(NULL) == -1) { @@ -463,13 +496,13 @@ SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void) /* used to hold the packet addresses for all the packets buffered inside * packets_buffer */ pb->packets_address_buffer = malloc(sizeof(Packet *) * - SC_CUDA_PB_MIN_NO_OF_PACKETS); + profile->packet_buffer_limit); if (pb->packets_address_buffer == NULL) { SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); exit(EXIT_FAILURE); } memset(pb->packets_address_buffer, 0, sizeof(Packet *) * - SC_CUDA_PB_MIN_NO_OF_PACKETS); + profile->packet_buffer_limit); return pb; } @@ -541,7 +574,7 @@ TmEcode SCCudaPBThreadInit(ThreadVars *tv, void *initdata, void **data) if (!unittest_mode) { /* Set the alarm time limit during which the batcher thread would * buffer packets */ - alarm(SC_CUDA_PB_BATCHER_ALARM_TIME); + alarm(profile->batching_timeout); } return TM_ECODE_OK; @@ -570,11 +603,13 @@ TmEcode SCCudaPBBatchPackets(ThreadVars *tv, Packet *p, void *data, PacketQueue SCLogDebug("Cuda packet buffer TIME limit exceeded. Buffering packet " "buffer and reseting the alarm"); queue_buffer = 0; + SCLogDebug("Cuda packet buffer TIME limit exceeded. Buffering packet " + "buffer and reseting the alarm"); SCCudaPBQueueBuffer(data); /* if we are running unittests, don't set the alarm handler. 
It will only * cause a seg fault if the tests take too long */ if (!unittest_mode) { - alarm(SC_CUDA_PB_BATCHER_ALARM_TIME); + alarm(profile->batching_timeout); } } @@ -730,7 +765,7 @@ TmEcode SCCudaPBBatchPackets(ThreadVars *tv, Packet *p, void *data, PacketQueue * to queue the buffer */ if ( (pb->nop_in_buffer == buffer_packet_threshhold) || queue_buffer) { queue_buffer = 0; - SCLogDebug("Either we have hit the threshold limit for packets(i.e.) we " + SCLogDebug("Either we have hit the threshold limit for packets(i.e. we " "have %d packets limit) OR we have exceeded the buffering " "time limit. Buffering the packet buffer and reseting the " "alarm.", buffer_packet_threshhold); @@ -738,7 +773,7 @@ TmEcode SCCudaPBBatchPackets(ThreadVars *tv, Packet *p, void *data, PacketQueue /* if we are running unittests, don't set the alarm handler. It will only * cause a seg fault if the tests take too long */ if (!unittest_mode) { - alarm(SC_CUDA_PB_BATCHER_ALARM_TIME); + alarm(profile->batching_timeout); } } @@ -826,6 +861,8 @@ void SCCudaPBSetUpQueuesAndBuffers(void) * page-locked memory */ SCCudaHlRegisterModule("SC_CUDA_PACKET_BATCHER"); + profile = SCCudaHlGetProfile("mpm"); + /* allocate the packet buffer */ /* \todo need to work out the right no of packet buffers that we need to * queue. I doubt we will need more than 4(as long as we don't run it on @@ -833,7 +870,15 @@ void SCCudaPBSetUpQueuesAndBuffers(void) * new ones, when we run out of buffers, since malloc for a huge chunk * like this will take time. 
We need to figure out a value based on * various other parameters like alarm time and buffer threshold value */ - for (i = 0; i < 10; i++) { + for (i = 0; i < profile->packet_buffers; i++) { + if (profile->page_locked) { + SCLogDebug("Allocating \"%d\" page_locked cuda packet buffers", + profile->packet_buffers); + } else { + SCLogDebug("Allocating \"%d\" non-page_locked cuda packet buffers", + profile->packet_buffers); + } + SCCudaPBPacketsBuffer *pb = SCCudaPBAllocSCCudaPBPacketsBuffer(); /* dump the buffer into the inqueue for this batcher TM. the batcher * thread would be the first consumer for these buffers */ @@ -843,7 +888,7 @@ void SCCudaPBSetUpQueuesAndBuffers(void) /* \todo This needs to be changed ASAP. This can't exceed max_pending_packets. * Also we need to make this user configurable and allow dynamic updaes * based on live traffic */ - buffer_packet_threshhold = 2400; + buffer_packet_threshhold = profile->packet_buffer_limit; return; } @@ -870,21 +915,33 @@ void SCCudaPBCleanUpQueuesAndBuffers(void) SCMutexLock(&dq->mutex_q); while ( (pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq)) != NULL) { if (pb->packets_buffer != NULL) { - if (SCCudaMemFreeHost(pb->packets_buffer) == -1) { - SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: " - "packets_buffer"); + if (profile->page_locked) { + if (SCCudaMemFreeHost(pb->packets_buffer) == -1) { + SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: " + "packets_buffer"); + } + } else { + free(pb->packets_buffer); } } if (pb->packets_offset_buffer != NULL) { - if (SCCudaMemFreeHost(pb->packets_offset_buffer) == -1) { - SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: " - "packets_offset_buffer"); + if (profile->page_locked) { + if (SCCudaMemFreeHost(pb->packets_offset_buffer) == -1) { + SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: " + "packets_offset_buffer"); + } + } else { + free(pb->packets_offset_buffer); } } if 
(pb->packets_payload_offset_buffer != NULL) { - if (SCCudaMemFreeHost(pb->packets_payload_offset_buffer) == -1) { - SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: " - "packets_payload_offset_buffer"); + if (profile->page_locked) { + if (SCCudaMemFreeHost(pb->packets_payload_offset_buffer) == -1) { + SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: " + "packets_payload_offset_buffer"); + } + } else { + free(pb->packets_payload_offset_buffer); } } @@ -939,6 +996,17 @@ void SCCudaPBSetBufferPacketThreshhold(uint32_t threshhold_override) return; } +/** + * \brief Function used to set the profile for cuda packet batcher. Used + * for unittests alone. + */ +void SCCudaPBSetProfile(char *profile_name) +{ + profile = SCCudaHlGetProfile("mpm"); + + return; +} + /** * \brief Used to inform the cuda packet batcher that packet batching shouldn't * be done anymore and set the flag to indicate this. We also need to diff --git a/src/cuda-packet-batcher.h b/src/cuda-packet-batcher.h index 2c53c4520a..587476859b 100644 --- a/src/cuda-packet-batcher.h +++ b/src/cuda-packet-batcher.h @@ -138,6 +138,7 @@ void TmModuleCudaPacketBatcherRegister(void); void *SCCudaPBTmThreadsSlot1(void *); void SCCudaPBRunningTests(int); +void SCCudaPBSetProfile(char *); #endif /* __SC_CUDA_SUPPORT__ */ diff --git a/src/detect.c b/src/detect.c index de8a9c80fb..3cf2d0dec4 100644 --- a/src/detect.c +++ b/src/detect.c @@ -3732,7 +3732,7 @@ int SigGroupBuild (DetectEngineCtx *de_ctx) { de_ctx->cuda_rc_mod_handle = SCCudaHlRegisterModule("SC_RULES_CONTENT_B2G_CUDA"); if (de_ctx->mpm_matcher == MPM_B2G_CUDA) { CUcontext dummy_context; - if (SCCudaHlGetCudaContext(&dummy_context, + if (SCCudaHlGetCudaContext(&dummy_context, "mpm", de_ctx->cuda_rc_mod_handle) == -1) { SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the " "module SC_RULES_CONTENT_B2G_CUDA"); diff --git a/src/suricata.c b/src/suricata.c index 035f398795..e834a48ba1 100644 --- 
a/src/suricata.c +++ b/src/suricata.c @@ -792,6 +792,11 @@ int main(int argc, char **argv) * logging module. */ SCLogLoadConfig(); +#ifdef __SC_CUDA_SUPPORT__ + /* load the cuda configuration */ + SCCudaHlGetYamlConf(); +#endif /* __SC_CUDA_SUPPORT__ */ + /* Load the Host-OS lookup. */ SCHInfoLoadFromConfig(); diff --git a/src/util-cuda-handlers.c b/src/util-cuda-handlers.c index ae477b67ad..305c9cfed8 100644 --- a/src/util-cuda-handlers.c +++ b/src/util-cuda-handlers.c @@ -51,7 +51,7 @@ * cuda modules against a cuda_context, although it is highly unlikely we * would need this feature. * - * We also need to use a mutex for module_datas. + * We also need to use a mutex for module_data. */ #include "suricata-common.h" @@ -70,14 +70,123 @@ #include "util-debug.h" #include "util-unittest.h" #include "packet-queue.h" +#include "util-mpm.h" /* macros decides if cuda is enabled for the platform or not */ #ifdef __SC_CUDA_SUPPORT__ -static SCCudaHlModuleData *module_datas = NULL; +static SCCudaHlModuleData *module_data = NULL; static uint8_t module_handle = 1; +/* holds the parsed cuda configuration from our yaml file */ +static SCCudaHlCudaProfile *cuda_profiles = NULL; + +/* used by unittests only */ +static SCCudaHlCudaProfile *backup_cuda_profiles = NULL; + +/** + * \brief Needed by unittests. Backup the existing cuda profile in handlers. + */ +void SCCudaHlBackupRegisteredProfiles(void) +{ + backup_cuda_profiles = cuda_profiles; + cuda_profiles = NULL; + + return; +} + +/** + * \brief Needed by unittests. Restore the previous backup of handlers' + * cuda profile. + */ +void SCCudaHlRestoreBackupRegisteredProfiles(void) +{ + cuda_profiles = backup_cuda_profiles; + + return; +} + +/** + * \brief Parse the "cuda" subsection config from our conf file. 
+ */ +void SCCudaHlGetYamlConf(void) +{ + SCCudaHlCudaProfile *profile = NULL; + + /* "mpm" profile, found under "cuda.mpm" in the conf file */ + profile = malloc(sizeof(SCCudaHlCudaProfile)); + if (profile == NULL) { + SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } + memset(profile, 0, sizeof(SCCudaHlCudaProfile)); + profile->name = "mpm"; + profile->data = MpmCudaConfParse(); + if (cuda_profiles == NULL) { + cuda_profiles = profile; + } else { + profile->next = cuda_profiles; + cuda_profiles = profile; + } + + return; +} + +/** + * \brief Get a particular cuda profile specified as arg. + * + * \param profile_name Name of the the profile to retrieve. + * + * \retval Data associated with the profile. + */ +void *SCCudaHlGetProfile(char *profile_name) +{ + SCCudaHlCudaProfile *profile = cuda_profiles; + + if (cuda_profiles == NULL ) { + SCLogInfo("No cuda profile registered"); + return NULL; + } + + if (profile_name == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "argument profile NULL"); + return NULL; + } + + while (profile != NULL && strcasecmp(profile->name, profile_name) != 0) { + profile = profile->next; + } + + if (profile != NULL) + return profile->data; + else + return NULL; +} + +/** + * \brief Clean the cuda profiles, held in cuda_profiles. 
+ */ +void SCCudaHlCleanProfiles(void) +{ + SCCudaHlCudaProfile *profile = cuda_profiles; + SCCudaHlCudaProfile *profile_next = NULL; + + while (profile != NULL) { + profile_next = profile->next; + if (profile->data != NULL) { + if (strcasecmp(profile->name, "mpm") == 0) { + MpmCudaConfCleanup(profile->data); + } + } + free(profile); + profile = profile_next; + } + cuda_profiles = NULL; + + return; +} + /** * \internal * \brief Returns a SCCudaHlModuleData instance from the global data store @@ -89,7 +198,7 @@ static uint8_t module_handle = 1; */ SCCudaHlModuleData *SCCudaHlGetModuleData(uint8_t handle) { - SCCudaHlModuleData *data = module_datas; + SCCudaHlModuleData *data = module_data; if (data == NULL) return NULL; @@ -189,15 +298,16 @@ static int SCCudaHlGetUniqueHandle(void) * in the argument. If a cuda_context is already present for * a handle, it is returned. * - * \param p_context Pointer to a cuda context instance that should be updated - * with a cuda context. - * \param handle A unique handle which identifies a module. Obtained from - * a call to SCCudaHlGetUniqueHandle(). + * \param p_context Pointer to a cuda context instance that should be updated + * with a cuda context. + * \param cuda_profile The cuda profile, supplied as a string. + * \param handle A unique handle which identifies a module. Obtained from + * a call to SCCudaHlGetUniqueHandle(). * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaHlGetCudaContext(CUcontext *p_context, int handle) +int SCCudaHlGetCudaContext(CUcontext *p_context, char *cuda_profile, int handle) { SCCudaHlModuleData *data = NULL; SCCudaDevices *devices = NULL; @@ -227,23 +337,23 @@ int SCCudaHlGetCudaContext(CUcontext *p_context, int handle) return 0; } - /* Get default log level and format. 
*/ - char *cuda_device_id_str = NULL; - int cuda_device_id = SC_CUDA_DEFAULT_DEVICE; - if (ConfGet("cuda.device_id", &cuda_device_id_str) == 1) { - cuda_device_id = atoi(cuda_device_id_str); - if (!SCCudaIsCudaDeviceIdValid(cuda_device_id)) { - SCLogError(SC_ERR_CUDA_ERROR, "Invalid device id \"%s\" supplied " - "in the conf file", cuda_device_id_str); - cuda_device_id = SC_CUDA_DEFAULT_DEVICE; + int device_id = SC_CUDA_DEFAULT_DEVICE; + if (cuda_profile != NULL) { + /* Get default log level and format. */ + MpmCudaConf *profile = SCCudaHlGetProfile(cuda_profile); + if (profile != NULL) { + if (SCCudaIsCudaDeviceIdValid(profile->device_id)) { + device_id = profile->device_id; + } else { + SCLogError(SC_ERR_CUDA_ERROR, "Invalid device id \"%d\" supplied. " + "Using the first device.", profile->device_id); + } } - } else { - cuda_device_id = SC_CUDA_DEFAULT_DEVICE; } /* Get the device list for this CUDA platform and create a new cuda context */ devices = SCCudaGetDeviceList(); - if (SCCudaCtxCreate(p_context, 0, devices->devices[cuda_device_id]->device) == -1) + if (SCCudaCtxCreate(p_context, 0, devices->devices[device_id]->device) == -1) goto error; data->cuda_context = p_context[0]; @@ -565,7 +675,7 @@ int SCCudaHlRegisterDispatcherFunc(void *(*SCCudaHlDispFunc)(void *), int handle */ const char *SCCudaHlGetModuleName(int handle) { - SCCudaHlModuleData *data = module_datas; + SCCudaHlModuleData *data = module_data; while (data != NULL && data->handle != handle) { data = data->next; @@ -587,7 +697,7 @@ const char *SCCudaHlGetModuleName(int handle) */ int SCCudaHlGetModuleHandle(const char *name) { - SCCudaHlModuleData *data = module_datas; + SCCudaHlModuleData *data = module_data; while (data != NULL && strcmp(data->name, name) != 0) { @@ -615,7 +725,7 @@ int SCCudaHlGetModuleHandle(const char *name) */ int SCCudaHlRegisterModule(const char *name) { - SCCudaHlModuleData *data = module_datas; + SCCudaHlModuleData *data = module_data; SCCudaHlModuleData *new_data = 
NULL; while (data != NULL && @@ -624,9 +734,8 @@ int SCCudaHlRegisterModule(const char *name) } if (data != NULL) { - SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Module \"%s\" already " - "registered. Returning the handle for the already " - "registered module", name); + SCLogInfo("Module \"%s\" already registered. Returning the handle " + "for the already registered module", name); return data->handle; } @@ -646,13 +755,13 @@ int SCCudaHlRegisterModule(const char *name) new_data->handle = SCCudaHlGetUniqueHandle(); /* first module to be registered */ - if (module_datas == NULL) { - module_datas = new_data; + if (module_data == NULL) { + module_data = new_data; return new_data->handle; } /* add this new module_data instance to the global module_data list */ - data = module_datas; + data = module_data; while (data->next != NULL) data = data->next; data->next = new_data; @@ -723,10 +832,10 @@ int SCCudaHlDeRegisterModule(const char *name) } /* find the previous module data instance */ - if (module_datas == data) { - module_datas = module_datas->next; + if (module_data == data) { + module_data = module_data->next; } else { - prev_data = module_datas; + prev_data = module_data; while (prev_data->next != data) prev_data = prev_data->next; prev_data->next = data->next; @@ -746,7 +855,7 @@ int SCCudaHlDeRegisterModule(const char *name) */ void SCCudaHlDeRegisterAllRegisteredModules(void) { - SCCudaHlModuleData *data = module_datas; + SCCudaHlModuleData *data = module_data; SCCudaHlModuleData *next_data = NULL; next_data = data; @@ -759,7 +868,7 @@ void SCCudaHlDeRegisterAllRegisteredModules(void) data = next_data; } - module_datas = NULL; + module_data = NULL; return; } @@ -805,7 +914,7 @@ int SCCudaHlTestEnvCudaContextInit(void) { CUcontext context; int module_handle = SCCudaHlRegisterModule("SC_RULES_CONTENT_B2G_CUDA"); - if (SCCudaHlGetCudaContext(&context, module_handle) == -1) { + if (SCCudaHlGetCudaContext(&context, NULL, module_handle) == -1) { printf("Error getting a 
cuda context"); } if (SCCudaHlPushCudaContextFromModule("SC_RULES_CONTENT_B2G_CUDA") == -1) { diff --git a/src/util-cuda-handlers.h b/src/util-cuda-handlers.h index cae7d595dd..2d135abaaf 100644 --- a/src/util-cuda-handlers.h +++ b/src/util-cuda-handlers.h @@ -61,7 +61,25 @@ typedef struct SCCudaHlModuleData_ { struct SCCudaHlModuleData_ *next; } SCCudaHlModuleData; -int SCCudaHlGetCudaContext(CUcontext *, int); +/** + * \brief Used to hold the cuda configuration from our conf yaml file + */ +typedef struct SCCudaHlCudaProfile_ { + /* profile name. Should be unique */ + char *name; + /* the data associated with this profile */ + void *data; + + struct SCCudaHlCudaProfile_ *next; +} SCCudaHlCudaProfile; + +void SCCudaHlGetYamlConf(void); +void *SCCudaHlGetProfile(char *); +void SCCudaHlCleanProfiles(void); +void SCCudaHlBackupRegisteredProfiles(void); +void SCCudaHlRestoreBackupRegisteredProfiles(void); + +int SCCudaHlGetCudaContext(CUcontext *, char *, int); int SCCudaHlGetCudaModule(CUmodule *, const char *, int); int SCCudaHlGetCudaModuleFromFile(CUmodule *, const char *, int); int SCCudaHlGetCudaDevicePtr(CUdeviceptr *, const char *, size_t, void *, int); diff --git a/src/util-mpm-b2g-cuda.c b/src/util-mpm-b2g-cuda.c index 8b84d96333..b92303f6b6 100644 --- a/src/util-mpm-b2g-cuda.c +++ b/src/util-mpm-b2g-cuda.c @@ -1232,7 +1232,7 @@ void B2gCudaDestroyCtx(MpmCtx *mpm_ctx) "module_data if we are having a module_handle"); goto error; } - if (SCCudaHlGetCudaContext(&dummy_context, ctx->module_handle) == -1) { + if (SCCudaHlGetCudaContext(&dummy_context, "mpm", ctx->module_handle) == -1) { SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the " "module %s", module_data->name); goto error; @@ -1700,6 +1700,7 @@ typedef struct B2gCudaMpmThreadCtxData_ { */ TmEcode B2gCudaMpmDispThreadInit(ThreadVars *tv, void *initdata, void **data) { + MpmCudaConf *profile = NULL; SCCudaHlModuleData *module_data = (SCCudaHlModuleData *)initdata; if 
(PatternMatchDefaultMatcher() != MPM_B2G_CUDA) @@ -1718,7 +1719,7 @@ TmEcode B2gCudaMpmDispThreadInit(ThreadVars *tv, void *initdata, void **data) tctx->b2g_cuda_module_handle = module_data->handle; - if (SCCudaHlGetCudaContext(&tctx->b2g_cuda_context, module_data->handle) == -1) { + if (SCCudaHlGetCudaContext(&tctx->b2g_cuda_context, "mpm", module_data->handle) == -1) { SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context"); goto error; } @@ -1777,19 +1778,35 @@ TmEcode B2gCudaMpmDispThreadInit(ThreadVars *tv, void *initdata, void **data) tctx->b2g_cuda_search_kernel_arg_total = offset; + profile = SCCudaHlGetProfile("mpm"); + /* buffer to hold the b2g cuda mpm match results for 4000 packets. The - * extra 2 bytes(the 1 in 1481 instead of 1480) is to hold the no of - * matches for the payload. The remaining 1480 positions in the buffer - * is to hold the match offsets */ - if (SCCudaMemHostAlloc((void**)&tctx->results_buffer, sizeof(uint16_t) * 1481 * - SC_CUDA_PB_MIN_NO_OF_PACKETS, CU_MEMHOSTALLOC_PORTABLE) == -1){ - SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory\n"); - exit(EXIT_FAILURE); + * extra 2 bytes(the extra + 1 ) is to hold the no of + * matches for the payload. 
The remaining profile->packet_size_limit + * positions in the buffer is to hold the match offsets */ + if (profile->page_locked) { + if (SCCudaMemHostAlloc((void**)&tctx->results_buffer, + sizeof(uint16_t) * (profile->packet_size_limit + 1) * + profile->packet_buffer_limit, + CU_MEMHOSTALLOC_PORTABLE) == -1){ + SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory\n"); + exit(EXIT_FAILURE); + } + } else { + tctx->results_buffer = malloc(sizeof(uint16_t) * + (profile->packet_size_limit + 1) * + profile->packet_buffer_limit); + if (tctx->results_buffer == NULL) { + SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } } if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_results_buffer, "MPM_B2G_RESULTS", - sizeof(uint16_t) * 1481 * SC_CUDA_PB_MIN_NO_OF_PACKETS, + sizeof(uint16_t) * + (profile->packet_size_limit + 1) * + profile->packet_buffer_limit, NULL, module_data->handle) == -1) { goto error; } @@ -1802,22 +1819,23 @@ TmEcode B2gCudaMpmDispThreadInit(ThreadVars *tv, void *initdata, void **data) if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_packets_buffer, "MPM_B2G_PACKETS_BUFFER", - (sizeof(SCCudaPBPacketDataForGPU) * - SC_CUDA_PB_MIN_NO_OF_PACKETS), + profile->packet_buffer_limit * + (profile->packet_size_limit + + sizeof(SCCudaPBPacketDataForGPUNonPayload)), NULL, module_data->handle) == -1) { goto error; } if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_packets_offset_buffer, "MPM_B2G_PACKETS_BUFFER_OFFSETS", - sizeof(uint32_t) * SC_CUDA_PB_MIN_NO_OF_PACKETS, + sizeof(uint32_t) * profile->packet_buffer_limit, NULL, module_data->handle) == -1) { goto error; } if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_packets_payload_offset_buffer, "MPM_B2G_PACKETS_PAYLOAD_BUFFER_OFFSETS", - sizeof(uint32_t) * SC_CUDA_PB_MIN_NO_OF_PACKETS, + sizeof(uint32_t) * profile->packet_buffer_limit, NULL, module_data->handle) == -1) { goto error; } @@ -1882,6 +1900,7 @@ TmEcode B2gCudaMpmDispThreadInit(ThreadVars *tv, void *initdata, void **data) TmEcode 
B2gCudaMpmDispThreadDeInit(ThreadVars *tv, void *data) { B2gCudaMpmThreadCtxData *tctx = data; + MpmCudaConf *profile = NULL; if (tctx == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid arguments. data NULL\n"); @@ -1898,16 +1917,22 @@ TmEcode B2gCudaMpmDispThreadDeInit(ThreadVars *tv, void *data) "module_data if we are having a module_handle"); goto error; } - if (SCCudaHlGetCudaContext(&dummy_context, tctx->b2g_cuda_module_handle) == -1) { + if (SCCudaHlGetCudaContext(&dummy_context, "mpm", tctx->b2g_cuda_module_handle) == -1) { SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the " "module %s", module_data->name); goto error; } SCCudaCtxPushCurrent(dummy_context); - if (SCCudaMemFreeHost(tctx->results_buffer) == -1) - SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: " - "results_buffer\n"); + profile = SCCudaHlGetProfile("mpm"); + if (profile->page_locked) { + if (SCCudaMemFreeHost(tctx->results_buffer) == -1) { + SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: " + "results_buffer\n"); + } + } else { + free(tctx->results_buffer); + } SCCudaHlFreeCudaDevicePtr("MPM_B2G_RESULTS", tctx->b2g_cuda_module_handle); SCCudaHlFreeCudaDevicePtr("MPM_B2G_PACKETS_BUFFER", tctx->b2g_cuda_module_handle); SCCudaHlFreeCudaDevicePtr("MPM_B2G_PACKETS_BUFFER_OFFSETS", @@ -2291,7 +2316,7 @@ static int B2gCudaTest01(void) /* get the cuda context and push it */ CUcontext dummy_context; - if (SCCudaHlGetCudaContext(&dummy_context, module_handle) == -1) { + if (SCCudaHlGetCudaContext(&dummy_context, "mpm", module_handle) == -1) { SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the " "module SC_RULES_CONTENT_B2G_CUDA"); } @@ -2323,6 +2348,7 @@ static int B2gCudaTest01(void) result = 1; + SCCudaPBSetProfile("mpm"); pb = SCCudaPBAllocSCCudaPBPacketsBuffer(); SCCudaPBPacketDataForGPU *curr_packet = (SCCudaPBPacketDataForGPU *)pb->packets_buffer; @@ -2500,6 +2526,7 @@ static int B2gCudaTest02(void) } 
SigGroupBuild(de_ctx); + SCCudaPBSetProfile("mpm"); SCCudaPBSetUpQueuesAndBuffers(); /* get the queues used by the batcher thread */ @@ -2800,6 +2827,7 @@ static int B2gCudaTest03(void) SigGroupBuild(de_ctx); DetectEngineThreadCtxInit(&de_tv, (void *)de_ctx, (void *)&det_ctx); + SCCudaPBSetProfile("mpm"); SCCudaPBSetUpQueuesAndBuffers(); /* get the queues used by the batcher thread */ diff --git a/src/util-mpm.c b/src/util-mpm.c index c9f8cdb2ab..6d38ecb9ed 100644 --- a/src/util-mpm.c +++ b/src/util-mpm.c @@ -456,6 +456,717 @@ uint32_t MpmGetBloomSize(const char *conf_val) SCReturnInt(bloom_value); } + +#ifdef __SC_CUDA_SUPPORT__ + +/** + * \brief Parse the "mpm" profile under the cuda subsection of our conf file. + * + * \retval profile Pointer to a struct containing the parsed data. + */ +MpmCudaConf *MpmCudaConfParse(void) +{ + ConfNode *cuda_node = NULL; + ConfNode *seq_node = NULL; + + MpmCudaConf *profile = NULL; + + const char *packet_buffer_limit = NULL; + const char *packet_size_limit = NULL; + const char *packet_buffers = NULL; + const char *batching_timeout = NULL; + const char *page_locked = NULL; + const char *device_id = NULL; + + if ((profile = malloc(sizeof(MpmCudaConf))) == NULL) { + SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } + memset(profile, 0, sizeof(MpmCudaConf)); + profile->packet_buffer_limit = MPM_PACKET_BUFFER_LIMIT; + profile->packet_size_limit = MPM_PACKET_SIZE_LIMIT; + profile->packet_buffers = MPM_PACKET_BUFFERS; + profile->batching_timeout = MPM_BATCHING_TIMEOUT; + profile->page_locked = MPM_PAGE_LOCKED; + profile->device_id = SC_CUDA_DEFAULT_DEVICE; + + cuda_node = ConfGetNode("cuda"); + if (cuda_node == NULL) { + SCLogInfo("No conf found for \"cuda\" in yaml file. 
Use default conf"); + goto end; + } + + TAILQ_FOREACH(seq_node, &cuda_node->head, next) { + if (strcasecmp(seq_node->val, "mpm") == 0) { + packet_buffer_limit = ConfNodeLookupChildValue + (seq_node->head.tqh_first, "packet_buffer_limit"); + packet_size_limit = ConfNodeLookupChildValue + (seq_node->head.tqh_first, "packet_size_limit"); + packet_buffers = ConfNodeLookupChildValue + (seq_node->head.tqh_first, "packet_buffers"); + batching_timeout = ConfNodeLookupChildValue + (seq_node->head.tqh_first, "batching_timeout"); + page_locked = ConfNodeLookupChildValue + (seq_node->head.tqh_first, "page_locked"); + device_id = ConfNodeLookupChildValue + (seq_node->head.tqh_first, "device_id"); + + /* packet_buffer_size */ + if (packet_buffer_limit == NULL || strcasecmp(packet_buffer_limit, "") == 0) { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for " + "cuda.mpm.packet_buffer_limit. Either NULL or empty"); + } else { + profile->packet_buffer_limit = atoi(packet_buffer_limit); + if (profile->packet_buffer_limit <= 0) { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for " + "cuda.mpm.packet_buffer_limit - %s", packet_buffer_limit); + profile->packet_buffer_limit = MPM_PACKET_BUFFER_LIMIT; + } + } + + /* packet_size_limit */ + if (packet_size_limit == NULL || strcasecmp(packet_size_limit, "") == 0) { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for " + "cuda.mpm.packet_size_limit. Either NULL or empty"); + } else { + profile->packet_size_limit = atoi(packet_size_limit); + if (profile->packet_size_limit <= 0) { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for " + "cuda.mpm.packet_size_limit - %s", packet_size_limit); + profile->packet_size_limit = MPM_PACKET_SIZE_LIMIT; + } + } + + /* packet_buffers */ + if (packet_buffers == NULL || strcasecmp(packet_buffers, "") == 0) { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for " + "cuda.mpm.packet_buffers. 
Either NULL or empty"); + } else { + profile->packet_buffers = atoi(packet_buffers); + if (profile->packet_buffers <= 0) { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for " + "cuda.mpm.packet_buffers - %s", packet_buffers); + profile->packet_buffers = MPM_PACKET_BUFFERS; + } + } + + /* batching_timeout */ + if (batching_timeout == NULL || strcasecmp(batching_timeout, "") == 0) { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for " + "cuda.mpm.batching_timeout. Either NULL or empty"); + } else { + profile->batching_timeout = atoi(batching_timeout); + if (profile->batching_timeout <= 0) { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for " + "cuda.mpm.batching_timeout - %s", batching_timeout); + profile->batching_timeout = MPM_BATCHING_TIMEOUT; + } + } + + /* page_locked */ + if (page_locked == NULL || strcasecmp(page_locked, "") == 0) { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for " + "cuda.mpm.page_locked. Either NULL or empty"); + } else { + if (strcasecmp(page_locked, "enabled") == 0) { + profile->page_locked = MPM_PAGE_LOCKED; + } else if (strcasecmp(page_locked, "disabled") == 0) { + profile->page_locked = !MPM_PAGE_LOCKED; + } else { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for " + "cuda.mpm.page_locked - %s", page_locked); + } + } + + /* device_id */ + if (device_id == NULL || strcasecmp(device_id, "") == 0) { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for " + "cuda.mpm.device_id Either NULL or empty"); + profile->device_id = SC_CUDA_DEFAULT_DEVICE; + continue; + } else { + profile->device_id = atoi(device_id); + if (profile->device_id < 0) { + SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for " + "cuda.mpm.device_id - %s", device_id); + profile->device_id = SC_CUDA_DEFAULT_DEVICE; + continue; + } + } + } /* if (strcasecmp(seq_node->val, "mpm") == 0) */ + } /* TAILQ_FOREACH(seq_node, &cuda_node->head, next) */ + + end: + SCLogDebug("Configuration for 
\"cuda.mpm\"\n" + "packet_buffer_size: %u\n" + "packet_size_limit: %d\n" + "packet_buffers: %d\n" + "batching_timeout: %d\n" + "page_locked: %d\n" + "device_id: %d\n", + profile->packet_buffer_limit, profile->packet_size_limit, + profile->packet_buffers, profile->batching_timeout, + profile->page_locked, profile->device_id); + + return profile; +} + +/** + * \brief Cleanup the parsed "mpm" profile cuda conf. + */ +void MpmCudaConfCleanup(MpmCudaConf *conf) +{ + if (conf != NULL) + free(conf); + + return; +} + +#endif /* __SC_CUDA_SUPPORT */ + +/************************************Unittests*********************************/ + +static int MpmInitYamlConf(char *conf) +{ + ConfCreateContextBackup(); + ConfInit(); + return ConfYamlLoadString(conf, strlen(conf)); +} + +static void MpmDeInitYamlConf(void) +{ + ConfDeInit(); + ConfRestoreContextBackup(); + + return; +} + +static int MpmTest01(void) +{ + char *conf = + "%YAML 1.1\n" + "---\n" + "cuda:\n" + " - mpm:\n" + " packet_buffer_limit: 4000\n" + " packet_size_limit: 1500\n" + " packet_buffers: 10\n" + " batching_timeout: 1\n" + " page_locked: enabled\n" + " device_id: 0\n"; + + DetectEngineCtx *de_ctx = NULL; + int result = 0; + + if (MpmInitYamlConf(conf) == -1) + return 0; + + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) + goto end; + + SCCudaHlBackupRegisteredProfiles(); + SCCudaHlGetYamlConf(); + MpmCudaConf *profile = SCCudaHlGetProfile("mpm"); + if (profile == NULL) { + printf("Error retrieving mpm profile\n"); + goto end; + } + + result = (profile->packet_buffer_limit == 4000); + result &= (profile->packet_size_limit == 1500); + result &= (profile->packet_buffers == 10); + result &= (profile->batching_timeout == 1); + result &= (profile->page_locked == 1); + result &= (profile->device_id == 0); + + end: + SCCudaHlCleanProfiles(); + + if (de_ctx != NULL) + DetectEngineCtxFree(de_ctx); + MpmDeInitYamlConf(); + SCCudaHlRestoreBackupRegisteredProfiles(); + + return result; +} + +static int MpmTest02(void) 
+{ + char *conf = + "%YAML 1.1\n" + "---\n" + "cuda:\n" + " - mpm:\n" + " packet_buffer_limit: 4001\n" + " packet_size_limit: 1500\n" + " packet_buffers: 12\n" + " batching_timeout: 10\n" + " page_locked: disabled\n" + " device_id: 5\n"; + + DetectEngineCtx *de_ctx = NULL; + int result = 0; + + if (MpmInitYamlConf(conf) == -1) + return 0; + + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) + goto end; + + SCCudaHlBackupRegisteredProfiles(); + SCCudaHlGetYamlConf(); + MpmCudaConf *profile = SCCudaHlGetProfile("mpm"); + if (profile == NULL) { + printf("Error retrieving mpm profile\n"); + goto end; + } + + result = (profile->packet_buffer_limit == 4001); + result &= (profile->packet_size_limit == 1500); + result &= (profile->packet_buffers == 12); + result &= (profile->batching_timeout == 10); + result &= (profile->page_locked == 0); + result &= (profile->device_id == 5); + + end: + SCCudaHlCleanProfiles(); + + if (de_ctx != NULL) + DetectEngineCtxFree(de_ctx); + MpmDeInitYamlConf(); + SCCudaHlRestoreBackupRegisteredProfiles(); + + return result; +} + +static int MpmTest03(void) +{ + char *conf = + "%YAML 1.1\n" + "---\n" + "cuda:\n" + " - mpm:\n" + " packet_buffer_limit: 0\n" + " packet_size_limit: 0\n" + " packet_buffers: 0\n" + " batching_timeout: 0\n" + " page_locked: enbled\n" + " device_id: -1\n"; + + DetectEngineCtx *de_ctx = NULL; + int result = 0; + + if (MpmInitYamlConf(conf) == -1) + return 0; + + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) + goto end; + + SCCudaHlBackupRegisteredProfiles(); + SCCudaHlGetYamlConf(); + MpmCudaConf *profile = SCCudaHlGetProfile("mpm"); + if (profile == NULL) { + printf("Error retrieving mpm profile\n"); + goto end; + } + + result = (profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT); + result &= (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT); + result &= (profile->packet_buffers == MPM_PACKET_BUFFERS); + result &= (profile->batching_timeout == MPM_BATCHING_TIMEOUT); + result &= 
(profile->page_locked == MPM_PAGE_LOCKED); + result &= (profile->device_id == SC_CUDA_DEFAULT_DEVICE); + + end: + SCCudaHlCleanProfiles(); + + if (de_ctx != NULL) + DetectEngineCtxFree(de_ctx); + MpmDeInitYamlConf(); + SCCudaHlRestoreBackupRegisteredProfiles(); + + return result; +} + +static int MpmTest04(void) +{ + char *conf = + "%YAML 1.1\n" + "---\n" + "cuda:\n" + " - mpm:\n" + " packet_buffer_limit: -1\n" + " packet_size_limit: -1\n" + " packet_buffers: -1\n" + " batching_timeout: -1\n" + " page_locked: enbled\n" + " device_id: -1\n"; + + DetectEngineCtx *de_ctx = NULL; + int result = 0; + + if (MpmInitYamlConf(conf) == -1) + return 0; + + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) + goto end; + + SCCudaHlBackupRegisteredProfiles(); + SCCudaHlGetYamlConf(); + MpmCudaConf *profile = SCCudaHlGetProfile("mpm"); + if (profile == NULL) { + printf("Error retrieving mpm profile\n"); + goto end; + } + + result = (profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT); + result &= (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT); + result &= (profile->packet_buffers == MPM_PACKET_BUFFERS); + result &= (profile->batching_timeout == MPM_BATCHING_TIMEOUT); + result &= (profile->page_locked == MPM_PAGE_LOCKED); + result &= (profile->device_id == SC_CUDA_DEFAULT_DEVICE); + + end: + SCCudaHlCleanProfiles(); + + if (de_ctx != NULL) + DetectEngineCtxFree(de_ctx); + MpmDeInitYamlConf(); + SCCudaHlRestoreBackupRegisteredProfiles(); + + return result; +} + +static int MpmTest05(void) +{ + char *conf = + "%YAML 1.1\n" + "---\n" + "cuda:\n" + " - mpm:\n" + " packet_buffer_limit:\n" + " packet_size_limit:\n" + " packet_buffers:\n" + " batching_timeout: 2\n" + " page_locked: enabled\n" + " device_id: 1\n"; + + DetectEngineCtx *de_ctx = NULL; + int result = 0; + + if (MpmInitYamlConf(conf) == -1) + return 0; + + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) + goto end; + + SCCudaHlBackupRegisteredProfiles(); + SCCudaHlGetYamlConf(); + MpmCudaConf *profile = 
SCCudaHlGetProfile("mpm"); + if (profile == NULL) { + printf("Error retrieving mpm profile\n"); + goto end; + } + + result = (profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT); + result &= (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT); + result &= (profile->packet_buffers == MPM_PACKET_BUFFERS); + result &= (profile->batching_timeout == 2); + result &= (profile->page_locked == 1); + result &= (profile->device_id == 1); + + end: + SCCudaHlCleanProfiles(); + + if (de_ctx != NULL) + DetectEngineCtxFree(de_ctx); + MpmDeInitYamlConf(); + SCCudaHlRestoreBackupRegisteredProfiles(); + + return result; +} + +static int MpmTest06(void) +{ + char *conf = + "%YAML 1.1\n" + "---\n" + "cuda:\n" + " - mpm:\n" + " packet_buffer_limit: \n" + " packet_size_limit: \n" + " packet_buffers: \n" + " batching_timeout: \n" + " page_locked: \n" + " device_id: \n"; + + DetectEngineCtx *de_ctx = NULL; + int result = 0; + + if (MpmInitYamlConf(conf) == -1) + return 0; + + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) + goto end; + + SCCudaHlBackupRegisteredProfiles(); + SCCudaHlGetYamlConf(); + MpmCudaConf *profile = SCCudaHlGetProfile("mpm"); + if (profile == NULL) { + printf("Error retrieving mpm profile\n"); + goto end; + } + + result = (profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT); + result &= (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT); + result &= (profile->packet_buffers == MPM_PACKET_BUFFERS); + result &= (profile->batching_timeout == MPM_BATCHING_TIMEOUT); + result &= (profile->page_locked == MPM_PAGE_LOCKED); + result &= (profile->device_id == SC_CUDA_DEFAULT_DEVICE); + + end: + SCCudaHlCleanProfiles(); + + if (de_ctx != NULL) + DetectEngineCtxFree(de_ctx); + MpmDeInitYamlConf(); + SCCudaHlRestoreBackupRegisteredProfiles(); + + return result; +} + +static int MpmTest07(void) +{ + char *conf = + "%YAML 1.1\n" + "---\n" + "cuda:\n" + " - mpm:\n" + " packet_buffer_limit:\n" + " packet_size_limit:\n" + " packet_buffers:\n" + " 
batching_timeout:\n" + " page_locked:\n" + " device_id:\n"; + + DetectEngineCtx *de_ctx = NULL; + int result = 0; + + if (MpmInitYamlConf(conf) == -1) + return 0; + + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) + goto end; + + SCCudaHlBackupRegisteredProfiles(); + SCCudaHlGetYamlConf(); + MpmCudaConf *profile = SCCudaHlGetProfile("mpm"); + if (profile == NULL) { + printf("Error retrieving mpm profile\n"); + goto end; + } + + result = (profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT); + result &= (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT); + result &= (profile->packet_buffers == MPM_PACKET_BUFFERS); + result &= (profile->batching_timeout == MPM_BATCHING_TIMEOUT); + result &= (profile->page_locked == MPM_PAGE_LOCKED); + result &= (profile->device_id == SC_CUDA_DEFAULT_DEVICE); + + end: + SCCudaHlCleanProfiles(); + + if (de_ctx != NULL) + DetectEngineCtxFree(de_ctx); + MpmDeInitYamlConf(); + SCCudaHlRestoreBackupRegisteredProfiles(); + + return result; +} + +static int MpmTest08(void) +{ + char *conf = + "%YAML 1.1\n" + "---\n" + "cuda:\n" + " - mpm:\n" + " packet_size_limit: 2000\n" + " page_locked: disabled\n" + " device_id: 4\n"; + + DetectEngineCtx *de_ctx = NULL; + int result = 0; + + if (MpmInitYamlConf(conf) == -1) + return 0; + + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) + goto end; + + SCCudaHlBackupRegisteredProfiles(); + SCCudaHlGetYamlConf(); + MpmCudaConf *profile = SCCudaHlGetProfile("mpm"); + if (profile == NULL) { + printf("Error retrieving mpm profile\n"); + goto end; + } + + result = (profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT); + result &= (profile->packet_size_limit == 2000); + result &= (profile->packet_buffers == MPM_PACKET_BUFFERS); + result &= (profile->batching_timeout == MPM_BATCHING_TIMEOUT); + result &= (profile->page_locked == !MPM_PAGE_LOCKED); + result &= (profile->device_id == 4); + + end: + SCCudaHlCleanProfiles(); + + if (de_ctx != NULL) + DetectEngineCtxFree(de_ctx); + 
MpmDeInitYamlConf(); + SCCudaHlRestoreBackupRegisteredProfiles(); + + return result; +} + +static int MpmTest09(void) +{ + char *conf = + "%YAML 1.1\n" + "---\n" + "cuda:\n" + " - mpm:\n"; + + DetectEngineCtx *de_ctx = NULL; + int result = 0; + + if (MpmInitYamlConf(conf) == -1) + return 0; + + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) + goto end; + + SCCudaHlBackupRegisteredProfiles(); + SCCudaHlGetYamlConf(); + MpmCudaConf *profile = SCCudaHlGetProfile("mpm"); + if (profile == NULL) { + printf("Error retrieving mpm profile\n"); + goto end; + } + + result = (profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT); + result &= (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT); + result &= (profile->packet_buffers == MPM_PACKET_BUFFERS); + result &= (profile->batching_timeout == MPM_BATCHING_TIMEOUT); + result &= (profile->page_locked == MPM_PAGE_LOCKED); + result &= (profile->device_id == SC_CUDA_DEFAULT_DEVICE); + + end: + SCCudaHlCleanProfiles(); + + if (de_ctx != NULL) + DetectEngineCtxFree(de_ctx); + MpmDeInitYamlConf(); + SCCudaHlRestoreBackupRegisteredProfiles(); + + return result; +} + +static int MpmTest10(void) +{ + char *conf = + "%YAML 1.1\n" + "---\n" + "cuda:\n"; + + DetectEngineCtx *de_ctx = NULL; + int result = 0; + + if (MpmInitYamlConf(conf) == -1) + return 0; + + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) + goto end; + + SCCudaHlBackupRegisteredProfiles(); + SCCudaHlGetYamlConf(); + MpmCudaConf *profile = SCCudaHlGetProfile("mpm"); + if (profile == NULL) { + printf("Error retrieving mpm profile\n"); + goto end; + } + + result = (profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT); + result &= (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT); + result &= (profile->packet_buffers == MPM_PACKET_BUFFERS); + result &= (profile->batching_timeout == MPM_BATCHING_TIMEOUT); + result &= (profile->page_locked == MPM_PAGE_LOCKED); + result &= (profile->device_id == SC_CUDA_DEFAULT_DEVICE); + + end: + 
SCCudaHlCleanProfiles(); + + if (de_ctx != NULL) + DetectEngineCtxFree(de_ctx); + MpmDeInitYamlConf(); + SCCudaHlRestoreBackupRegisteredProfiles(); + + return result; +} + +static int MpmTest11(void) +{ + char *conf = + "%YAML 1.1\n" + "---\n"; + + DetectEngineCtx *de_ctx = NULL; + int result = 0; + + if (MpmInitYamlConf(conf) == -1) + return 0; + + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) + goto end; + + SCCudaHlBackupRegisteredProfiles(); + SCCudaHlGetYamlConf(); + MpmCudaConf *profile = SCCudaHlGetProfile("mpm"); + if (profile == NULL) { + printf("Error retrieving mpm profile\n"); + goto end; + } + + result = (profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT); + result &= (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT); + result &= (profile->packet_buffers == MPM_PACKET_BUFFERS); + result &= (profile->batching_timeout == MPM_BATCHING_TIMEOUT); + result &= (profile->page_locked == MPM_PAGE_LOCKED); + result &= (profile->device_id == SC_CUDA_DEFAULT_DEVICE); + + end: + SCCudaHlCleanProfiles(); + + if (de_ctx != NULL) + DetectEngineCtxFree(de_ctx); + MpmDeInitYamlConf(); + SCCudaHlRestoreBackupRegisteredProfiles(); + + return result; +} + void MpmRegisterTests(void) { #ifdef UNITTESTS uint16_t i; @@ -467,6 +1178,17 @@ void MpmRegisterTests(void) { printf("Warning: mpm %s has no unittest registration function...", mpm_table[i].name); } } + + UtRegisterTest("MpmTest01", MpmTest01, 1); + UtRegisterTest("MpmTest02", MpmTest02, 1); + UtRegisterTest("MpmTest03", MpmTest03, 1); + UtRegisterTest("MpmTest04", MpmTest04, 1); + UtRegisterTest("MpmTest05", MpmTest05, 1); + UtRegisterTest("MpmTest06", MpmTest06, 1); + UtRegisterTest("MpmTest07", MpmTest07, 1); + UtRegisterTest("MpmTest08", MpmTest08, 1); + UtRegisterTest("MpmTest09", MpmTest09, 1); + UtRegisterTest("MpmTest10", MpmTest10, 1); + UtRegisterTest("MpmTest11", MpmTest11, 1); #endif } - diff --git a/src/util-mpm.h b/src/util-mpm.h index 289c852020..14c2126fbd 100644 --- a/src/util-mpm.h +++ 
b/src/util-mpm.h
@@ -50,6 +50,13 @@
     pattern matcher algorithms */
 #define BLOOMSIZE_HIGH 2048 /**< High bloomfilter size for the multi
                                  pattern matcher algorithms */
+
+#define MPM_PACKET_BUFFER_LIMIT 2400
+#define MPM_PACKET_SIZE_LIMIT 1500
+#define MPM_PACKET_BUFFERS 10
+#define MPM_BATCHING_TIMEOUT 1
+#define MPM_PAGE_LOCKED 1
+
 enum {
     MPM_NOTSET = 0,
@@ -178,12 +185,35 @@
 MpmCtx *MpmFactoryGetMpmCtxForProfile(int32_t);
 void MpmFactoryDeRegisterAllMpmCtxProfiles(void);
 int32_t MpmFactoryIsMpmCtxAvailable(MpmCtx *);
+/* macro that decides if cuda is enabled for the platform or not */
+#ifdef __SC_CUDA_SUPPORT__
+
+/**
+ * \brief Cuda configuration for "mpm" profile. We can further extend this
+ *        to have conf for specific mpms. For now it's common for all mpms.
+ */
+typedef struct MpmCudaConf_ {
+    int32_t packet_buffer_limit;
+    int16_t packet_size_limit;
+    int8_t packet_buffers;
+    int8_t batching_timeout;
+    int8_t page_locked;
+    int8_t device_id;
+} MpmCudaConf;
+
+#endif /* __SC_CUDA_SUPPORT__ */
+
 int PmqSetup(PatternMatcherQueue *, uint32_t, uint32_t);
 void PmqMerge(PatternMatcherQueue *src, PatternMatcherQueue *dst);
 void PmqReset(PatternMatcherQueue *);
 void PmqCleanup(PatternMatcherQueue *);
 void PmqFree(PatternMatcherQueue *);
+#ifdef __SC_CUDA_SUPPORT__
+MpmCudaConf *MpmCudaConfParse(void);
+void MpmCudaConfCleanup(MpmCudaConf *);
+#endif /* __SC_CUDA_SUPPORT__ */
+
 void MpmTableSetup(void);
 void MpmRegisterTests(void);
@@ -197,4 +227,3 @@
 uint32_t MpmGetHashSize(const char *);
 uint32_t MpmGetBloomSize(const char *);
 #endif /* __UTIL_MPM_H__ */
-
diff --git a/suricata.yaml b/suricata.yaml
index 71d02e36b7..d72d2300b5 100644
--- a/suricata.yaml
+++ b/suricata.yaml
@@ -152,11 +152,32 @@ threading:
 #   detect_thread_ratio: 1.5
 
-# Select the cuda device to use. The device_id identifies the device to be used
-# if one has multiple devices on the system. To find out device_id associated
-# with the card(s) on the system run "suricata --list-cuda-cards".
+# Cuda configuration.
 cuda:
-  device_id: 0
+  # The "mpm" profile. On not specifying any of these parameters, the engine's
+  # internal default values are used, which are the same as the ones specified here.
+  - mpm:
+    # Threshold limit for the number of packets buffered to the GPU. Once we hit this
+    # limit, we pass the buffer to the gpu.
+    packet_buffer_limit: 2400
+    # The maximum length for a packet that we would buffer to the gpu.
+    # Anything over this is MPM'ed on the CPU. All entries > 0 are valid.
+    packet_size_limit: 1500
+    # Number of packet buffers we initialize. All entries > 0 are valid.
+    packet_buffers: 10
+    # The timeout limit for batching of packets in secs. If we don't fill the
+    # buffer within this timeout limit, we pass the currently filled buffer to the gpu.
+    # All entries > 0 are valid.
+    batching_timeout: 1
+    # Specifies whether to use page_locked memory wherever possible. Accepted values
+    # are "enabled" and "disabled".
+    page_locked: enabled
+    # The device to use for the mpm. Currently we don't support load balancing
+    # on multiple gpus. In case you have multiple devices on your system, you
+    # can specify the device to use, using this conf. By default we hold 0, to
+    # specify the first device cuda sees. To find out device_id associated with
+    # the card(s) on the system run "suricata --list-cuda-cards".
+    device_id: 0
 
 # Select the multi pattern algorithm you want to run for scan/search the
 # in the engine. The supported algorithms are b2g, b2gc, b2gm, b3g, wumanber,