make cuda mpm parameters configurable

remotes/origin/master-1.1.x
Anoop Saldanha 16 years ago committed by Victor Julien
parent 2c08aebec1
commit c734cd1bdd

@ -50,6 +50,7 @@
#include "detect-parse.h"
#include "tm-threads.h"
#include "tmqh-packetpool.h"
#include "util-mpm.h"
/* \todo Make this user configurable through our yaml file. Also provide options
* where this can be dynamically updated based on the traffic */
@ -80,7 +81,10 @@ static int run_batcher = 1;
* on the traffic
* \todo make this user configurable, as well allow dynamic update of this
* variable based on the traffic seen */
static uint32_t buffer_packet_threshhold = 2400;
static uint32_t buffer_packet_threshhold = 0;
/* the profile used by the cuda batcher */
static MpmCudaConf *profile = NULL;
/* flag used by the SIG_ALRM handler to indicate that the batcher TM should queue
* the buffer to be processed by the Cuda Mpm B2g Batcher Thread for further
@ -302,10 +306,10 @@ void *SCCudaPBTmThreadsSlot1(void *td)
* tm-threads.c and this custom Slot1 function is this call
* here. We need to make the call here, even if we don't
* receive a packet from the previous stage in the runmodes.
* This is needed in cases where we the SIG_ALRM handler
* This is needed in cases where the SIG_ALRM handler
* wants us to queue the buffer to the GPU and ends up waking
* the Batcher TM(which is waiting on a cond from the previous
* feeder TM). Please handler the NULL packet case in the
* feeder TM). Please handle the NULL packet case in the
* function that you now call */
r = s->s.SlotFunc(tv, p, s->s.slot_data, NULL, NULL);
} else {
@ -400,8 +404,8 @@ SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void)
}
/* the buffer for the packets to be sent over to the gpu. We allot space for
* a minimum of SC_CUDA_PB_MIN_NO_OF_PACKETS, i.e. if each packet buffered
* is full to the brim */
* profile->packet_buffer_limit packets, assuming a size of
* profile->packet_size_limit for each packet */
SCCudaHlModuleData *data = NULL;
data = SCCudaHlGetModuleData(SCCudaHlGetModuleHandle("SC_CUDA_PACKET_BATCHER"));
if (data == NULL) {
@ -412,7 +416,7 @@ SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void)
return NULL;
}
if (SCCudaHlGetCudaContext(&data->cuda_context, data->handle) == -1) {
if (SCCudaHlGetCudaContext(&data->cuda_context, "mpm", data->handle) == -1) {
SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Error getting cuda context");
return NULL;
}
@ -422,38 +426,67 @@ SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void)
"Error pushing cuda context to allocate memory");
}
if (SCCudaMemHostAlloc((void**)&pb->packets_buffer,
sizeof(SCCudaPBPacketDataForGPU) *
SC_CUDA_PB_MIN_NO_OF_PACKETS,
CU_MEMHOSTALLOC_PORTABLE |
CU_MEMHOSTALLOC_WRITECOMBINED) == -1) {
SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory");
exit(EXIT_FAILURE);
if (profile->page_locked) {
if (SCCudaMemHostAlloc((void**)&pb->packets_buffer,
profile->packet_buffer_limit *
(profile->packet_size_limit +
sizeof(SCCudaPBPacketDataForGPUNonPayload)),
CU_MEMHOSTALLOC_PORTABLE |
CU_MEMHOSTALLOC_WRITECOMBINED) == -1) {
SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory");
exit(EXIT_FAILURE);
}
} else {
pb->packets_buffer = malloc(profile->packet_buffer_limit *
(profile->packet_size_limit +
sizeof(SCCudaPBPacketDataForGPUNonPayload)));
if (pb->packets_buffer == NULL) {
SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
exit(EXIT_FAILURE);
}
}
memset(pb->packets_buffer, 0, sizeof(SCCudaPBPacketDataForGPU) *
SC_CUDA_PB_MIN_NO_OF_PACKETS);
/* used to hold the offsets of the buffered packets in the packets_buffer */
if (SCCudaMemHostAlloc((void**)&pb->packets_offset_buffer,
sizeof(uint32_t) * SC_CUDA_PB_MIN_NO_OF_PACKETS,
CU_MEMHOSTALLOC_PORTABLE |
CU_MEMHOSTALLOC_WRITECOMBINED) == -1) {
SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory");
exit(EXIT_FAILURE);
memset(pb->packets_buffer, 0, profile->packet_buffer_limit *
(profile->packet_size_limit + sizeof(SCCudaPBPacketDataForGPUNonPayload)));
if (profile->page_locked) {
/* used to hold the offsets of the buffered packets in the packets_buffer */
if (SCCudaMemHostAlloc((void**)&pb->packets_offset_buffer,
sizeof(uint32_t) * profile->packet_buffer_limit,
CU_MEMHOSTALLOC_PORTABLE |
CU_MEMHOSTALLOC_WRITECOMBINED) == -1) {
SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory");
exit(EXIT_FAILURE);
}
} else {
pb->packets_offset_buffer = malloc(sizeof(uint32_t) *
profile->packet_buffer_limit);
if (pb->packets_offset_buffer == NULL) {
SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
exit(EXIT_FAILURE);
}
}
memset(pb->packets_offset_buffer, 0, sizeof(uint32_t) *
SC_CUDA_PB_MIN_NO_OF_PACKETS);
/* used to hold the offsets of the packets payload */
if (SCCudaMemHostAlloc((void**)&pb->packets_payload_offset_buffer,
sizeof(uint32_t) * SC_CUDA_PB_MIN_NO_OF_PACKETS,
CU_MEMHOSTALLOC_PORTABLE |
CU_MEMHOSTALLOC_WRITECOMBINED) == -1) {
SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory");
exit(EXIT_FAILURE);
memset(pb->packets_offset_buffer, 0,
sizeof(uint32_t) * profile->packet_buffer_limit);
if (profile->page_locked) {
/* used to hold the offsets of the packets payload */
if (SCCudaMemHostAlloc((void**)&pb->packets_payload_offset_buffer,
sizeof(uint32_t) * profile->packet_buffer_limit,
CU_MEMHOSTALLOC_PORTABLE |
CU_MEMHOSTALLOC_WRITECOMBINED) == -1) {
SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory");
exit(EXIT_FAILURE);
}
} else {
pb->packets_payload_offset_buffer = malloc(sizeof(uint32_t) *
profile->packet_buffer_limit);
if (pb->packets_payload_offset_buffer == NULL) {
SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
exit(EXIT_FAILURE);
}
}
memset(pb->packets_payload_offset_buffer, 0, sizeof(uint32_t) *
SC_CUDA_PB_MIN_NO_OF_PACKETS);
memset(pb->packets_payload_offset_buffer, 0,
sizeof(uint32_t) * profile->packet_buffer_limit);
SCLogDebug("Allocated pagelocked CUDA memory");
if (SCCudaCtxPopCurrent(NULL) == -1) {
@ -463,13 +496,13 @@ SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void)
/* used to hold the packet addresses for all the packets buffered inside
* packets_buffer */
pb->packets_address_buffer = malloc(sizeof(Packet *) *
SC_CUDA_PB_MIN_NO_OF_PACKETS);
profile->packet_buffer_limit);
if (pb->packets_address_buffer == NULL) {
SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
exit(EXIT_FAILURE);
}
memset(pb->packets_address_buffer, 0, sizeof(Packet *) *
SC_CUDA_PB_MIN_NO_OF_PACKETS);
profile->packet_buffer_limit);
return pb;
}
@ -541,7 +574,7 @@ TmEcode SCCudaPBThreadInit(ThreadVars *tv, void *initdata, void **data)
if (!unittest_mode) {
/* Set the alarm time limit during which the batcher thread would
* buffer packets */
alarm(SC_CUDA_PB_BATCHER_ALARM_TIME);
alarm(profile->batching_timeout);
}
return TM_ECODE_OK;
@ -570,11 +603,13 @@ TmEcode SCCudaPBBatchPackets(ThreadVars *tv, Packet *p, void *data, PacketQueue
SCLogDebug("Cuda packet buffer TIME limit exceeded. Buffering packet "
"buffer and reseting the alarm");
queue_buffer = 0;
SCLogDebug("Cuda packet buffer TIME limit exceeded. Buffering packet "
"buffer and reseting the alarm");
SCCudaPBQueueBuffer(data);
/* if we are running unittests, don't set the alarm handler. It will only
* cause a seg fault if the tests take too long */
if (!unittest_mode) {
alarm(SC_CUDA_PB_BATCHER_ALARM_TIME);
alarm(profile->batching_timeout);
}
}
@ -730,7 +765,7 @@ TmEcode SCCudaPBBatchPackets(ThreadVars *tv, Packet *p, void *data, PacketQueue
* to queue the buffer */
if ( (pb->nop_in_buffer == buffer_packet_threshhold) || queue_buffer) {
queue_buffer = 0;
SCLogDebug("Either we have hit the threshold limit for packets(i.e.) we "
SCLogDebug("Either we have hit the threshold limit for packets(i.e. we "
"have %d packets limit) OR we have exceeded the buffering "
"time limit. Buffering the packet buffer and reseting the "
"alarm.", buffer_packet_threshhold);
@ -738,7 +773,7 @@ TmEcode SCCudaPBBatchPackets(ThreadVars *tv, Packet *p, void *data, PacketQueue
/* if we are running unittests, don't set the alarm handler. It will only
* cause a seg fault if the tests take too long */
if (!unittest_mode) {
alarm(SC_CUDA_PB_BATCHER_ALARM_TIME);
alarm(profile->batching_timeout);
}
}
@ -826,6 +861,8 @@ void SCCudaPBSetUpQueuesAndBuffers(void)
* page-locked memory */
SCCudaHlRegisterModule("SC_CUDA_PACKET_BATCHER");
profile = SCCudaHlGetProfile("mpm");
/* allocate the packet buffer */
/* \todo need to work out the right no of packet buffers that we need to
* queue. I doubt we will need more than 4(as long as we don't run it on
@ -833,7 +870,15 @@ void SCCudaPBSetUpQueuesAndBuffers(void)
* new ones, when we run out of buffers, since malloc for a huge chunk
* like this will take time. We need to figure out a value based on
* various other parameters like alarm time and buffer threshold value */
for (i = 0; i < 10; i++) {
for (i = 0; i < profile->packet_buffers; i++) {
if (profile->page_locked) {
SCLogDebug("Allocating \"%d\" page_locked cuda packet buffers",
profile->packet_buffers);
} else {
SCLogDebug("Allocating \"%d\" non-page_locked cuda packet buffers",
profile->packet_buffers);
}
SCCudaPBPacketsBuffer *pb = SCCudaPBAllocSCCudaPBPacketsBuffer();
/* dump the buffer into the inqueue for this batcher TM. the batcher
* thread would be the first consumer for these buffers */
@ -843,7 +888,7 @@ void SCCudaPBSetUpQueuesAndBuffers(void)
/* \todo This needs to be changed ASAP. This can't exceed max_pending_packets.
* Also we need to make this user configurable and allow dynamic updaes
* based on live traffic */
buffer_packet_threshhold = 2400;
buffer_packet_threshhold = profile->packet_buffer_limit;
return;
}
@ -870,21 +915,33 @@ void SCCudaPBCleanUpQueuesAndBuffers(void)
SCMutexLock(&dq->mutex_q);
while ( (pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq)) != NULL) {
if (pb->packets_buffer != NULL) {
if (SCCudaMemFreeHost(pb->packets_buffer) == -1) {
SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
"packets_buffer");
if (profile->page_locked) {
if (SCCudaMemFreeHost(pb->packets_buffer) == -1) {
SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
"packets_buffer");
}
} else {
free(pb->packets_buffer);
}
}
if (pb->packets_offset_buffer != NULL) {
if (SCCudaMemFreeHost(pb->packets_offset_buffer) == -1) {
SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
"packets_offset_buffer");
if (profile->page_locked) {
if (SCCudaMemFreeHost(pb->packets_offset_buffer) == -1) {
SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
"packets_offset_buffer");
}
} else {
free(pb->packets_offset_buffer);
}
}
if (pb->packets_payload_offset_buffer != NULL) {
if (SCCudaMemFreeHost(pb->packets_payload_offset_buffer) == -1) {
SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
"packets_payload_offset_buffer");
if (profile->page_locked) {
if (SCCudaMemFreeHost(pb->packets_payload_offset_buffer) == -1) {
SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
"packets_payload_offset_buffer");
}
} else {
free(pb->packets_payload_offset_buffer);
}
}
@ -939,6 +996,17 @@ void SCCudaPBSetBufferPacketThreshhold(uint32_t threshhold_override)
return;
}
/**
 * \brief Function used to set the profile for cuda packet batcher.  Used
 *        for unittests alone.
 *
 * \param profile_name Name of the registered cuda profile whose data
 *                     should back the batcher (e.g. "mpm").
 */
void SCCudaPBSetProfile(char *profile_name)
{
    /* bug fix: the argument was previously ignored and "mpm" was
     * hardcoded; honor the supplied profile name */
    profile = SCCudaHlGetProfile(profile_name);

    return;
}
/**
* \brief Used to inform the cuda packet batcher that packet batching shouldn't
* be done anymore and set the flag to indicate this. We also need to

@ -138,6 +138,7 @@ void TmModuleCudaPacketBatcherRegister(void);
void *SCCudaPBTmThreadsSlot1(void *);
void SCCudaPBRunningTests(int);
void SCCudaPBSetProfile(char *);
#endif /* __SC_CUDA_SUPPORT__ */

@ -3732,7 +3732,7 @@ int SigGroupBuild (DetectEngineCtx *de_ctx) {
de_ctx->cuda_rc_mod_handle = SCCudaHlRegisterModule("SC_RULES_CONTENT_B2G_CUDA");
if (de_ctx->mpm_matcher == MPM_B2G_CUDA) {
CUcontext dummy_context;
if (SCCudaHlGetCudaContext(&dummy_context,
if (SCCudaHlGetCudaContext(&dummy_context, "mpm",
de_ctx->cuda_rc_mod_handle) == -1) {
SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the "
"module SC_RULES_CONTENT_B2G_CUDA");

@ -792,6 +792,11 @@ int main(int argc, char **argv)
* logging module. */
SCLogLoadConfig();
#ifdef __SC_CUDA_SUPPORT__
/* load the cuda configuration */
SCCudaHlGetYamlConf();
#endif /* __SC_CUDA_SUPPORT__ */
/* Load the Host-OS lookup. */
SCHInfoLoadFromConfig();

@ -51,7 +51,7 @@
* cuda modules against a cuda_context, although it is highly unlikely we
* would need this feature.
*
* We also need to use a mutex for module_datas.
* We also need to use a mutex for module_data.
*/
#include "suricata-common.h"
@ -70,14 +70,123 @@
#include "util-debug.h"
#include "util-unittest.h"
#include "packet-queue.h"
#include "util-mpm.h"
/* macros decides if cuda is enabled for the platform or not */
#ifdef __SC_CUDA_SUPPORT__
static SCCudaHlModuleData *module_datas = NULL;
static SCCudaHlModuleData *module_data = NULL;
static uint8_t module_handle = 1;
/* holds the parsed cuda configuration from our yaml file */
static SCCudaHlCudaProfile *cuda_profiles = NULL;
/* used by unittests only */
static SCCudaHlCudaProfile *backup_cuda_profiles = NULL;
/**
 * \brief Needed by unittests.  Stashes the currently registered cuda
 *        profiles aside and leaves the live list empty, so tests can
 *        install their own profiles without clobbering the real ones.
 */
void SCCudaHlBackupRegisteredProfiles(void)
{
    backup_cuda_profiles = cuda_profiles;
    cuda_profiles = NULL;
}
/**
 * \brief Needed by unittests.  Reinstates the profile list previously
 *        saved by SCCudaHlBackupRegisteredProfiles().
 */
void SCCudaHlRestoreBackupRegisteredProfiles(void)
{
    cuda_profiles = backup_cuda_profiles;
}
/**
* \brief Parse the "cuda" subsection config from our conf file.
*/
void SCCudaHlGetYamlConf(void)
{
SCCudaHlCudaProfile *profile = NULL;
/* "mpm" profile, found under "cuda.mpm" in the conf file */
profile = malloc(sizeof(SCCudaHlCudaProfile));
if (profile == NULL) {
SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
exit(EXIT_FAILURE);
}
memset(profile, 0, sizeof(SCCudaHlCudaProfile));
profile->name = "mpm";
profile->data = MpmCudaConfParse();
if (cuda_profiles == NULL) {
cuda_profiles = profile;
} else {
profile->next = cuda_profiles;
cuda_profiles = profile;
}
return;
}
/**
 * \brief Get a particular cuda profile specified as arg.
 *
 * \param profile_name Name of the profile to retrieve (matched
 *                     case-insensitively).
 *
 * \retval Data associated with the profile, or NULL if no profiles are
 *         registered, the argument is NULL, or no name matches.
 */
void *SCCudaHlGetProfile(char *profile_name)
{
    SCCudaHlCudaProfile *p = NULL;

    if (cuda_profiles == NULL ) {
        SCLogInfo("No cuda profile registered");
        return NULL;
    }

    if (profile_name == NULL) {
        SCLogError(SC_ERR_INVALID_ARGUMENTS, "argument profile NULL");
        return NULL;
    }

    /* linear scan; the profile list is tiny (currently just "mpm") */
    for (p = cuda_profiles; p != NULL; p = p->next) {
        if (strcasecmp(p->name, profile_name) == 0)
            return p->data;
    }

    return NULL;
}
/**
 * \brief Clean the cuda profiles, held in cuda_profiles.
 *
 *        Frees every profile node and, for profiles whose payload we know
 *        how to release (currently only "mpm"), the payload as well.
 */
void SCCudaHlCleanProfiles(void)
{
    SCCudaHlCudaProfile *p = cuda_profiles;

    while (p != NULL) {
        SCCudaHlCudaProfile *next = p->next;

        /* only the "mpm" profile's data has a known cleanup routine */
        if (p->data != NULL && strcasecmp(p->name, "mpm") == 0)
            MpmCudaConfCleanup(p->data);

        free(p);
        p = next;
    }
    cuda_profiles = NULL;
}
/**
* \internal
* \brief Returns a SCCudaHlModuleData instance from the global data store
@ -89,7 +198,7 @@ static uint8_t module_handle = 1;
*/
SCCudaHlModuleData *SCCudaHlGetModuleData(uint8_t handle)
{
SCCudaHlModuleData *data = module_datas;
SCCudaHlModuleData *data = module_data;
if (data == NULL)
return NULL;
@ -189,15 +298,16 @@ static int SCCudaHlGetUniqueHandle(void)
* in the argument. If a cuda_context is already present for
* a handle, it is returned.
*
* \param p_context Pointer to a cuda context instance that should be updated
* with a cuda context.
* \param handle A unique handle which identifies a module. Obtained from
* a call to SCCudaHlGetUniqueHandle().
* \param p_context Pointer to a cuda context instance that should be updated
* with a cuda context.
* \param cuda_profile The cuda profile, supplied as a string.
* \param handle A unique handle which identifies a module. Obtained from
* a call to SCCudaHlGetUniqueHandle().
*
* \retval 0 On success.
* \retval -1 On failure.
*/
int SCCudaHlGetCudaContext(CUcontext *p_context, int handle)
int SCCudaHlGetCudaContext(CUcontext *p_context, char *cuda_profile, int handle)
{
SCCudaHlModuleData *data = NULL;
SCCudaDevices *devices = NULL;
@ -227,23 +337,23 @@ int SCCudaHlGetCudaContext(CUcontext *p_context, int handle)
return 0;
}
/* Get default log level and format. */
char *cuda_device_id_str = NULL;
int cuda_device_id = SC_CUDA_DEFAULT_DEVICE;
if (ConfGet("cuda.device_id", &cuda_device_id_str) == 1) {
cuda_device_id = atoi(cuda_device_id_str);
if (!SCCudaIsCudaDeviceIdValid(cuda_device_id)) {
SCLogError(SC_ERR_CUDA_ERROR, "Invalid device id \"%s\" supplied "
"in the conf file", cuda_device_id_str);
cuda_device_id = SC_CUDA_DEFAULT_DEVICE;
int device_id = SC_CUDA_DEFAULT_DEVICE;
if (cuda_profile != NULL) {
/* Get default log level and format. */
MpmCudaConf *profile = SCCudaHlGetProfile(cuda_profile);
if (profile != NULL) {
if (SCCudaIsCudaDeviceIdValid(profile->device_id)) {
device_id = profile->device_id;
} else {
SCLogError(SC_ERR_CUDA_ERROR, "Invalid device id \"%d\" supplied. "
"Using the first device.", profile->device_id);
}
}
} else {
cuda_device_id = SC_CUDA_DEFAULT_DEVICE;
}
/* Get the device list for this CUDA platform and create a new cuda context */
devices = SCCudaGetDeviceList();
if (SCCudaCtxCreate(p_context, 0, devices->devices[cuda_device_id]->device) == -1)
if (SCCudaCtxCreate(p_context, 0, devices->devices[device_id]->device) == -1)
goto error;
data->cuda_context = p_context[0];
@ -565,7 +675,7 @@ int SCCudaHlRegisterDispatcherFunc(void *(*SCCudaHlDispFunc)(void *), int handle
*/
const char *SCCudaHlGetModuleName(int handle)
{
SCCudaHlModuleData *data = module_datas;
SCCudaHlModuleData *data = module_data;
while (data != NULL && data->handle != handle) {
data = data->next;
@ -587,7 +697,7 @@ const char *SCCudaHlGetModuleName(int handle)
*/
int SCCudaHlGetModuleHandle(const char *name)
{
SCCudaHlModuleData *data = module_datas;
SCCudaHlModuleData *data = module_data;
while (data != NULL &&
strcmp(data->name, name) != 0) {
@ -615,7 +725,7 @@ int SCCudaHlGetModuleHandle(const char *name)
*/
int SCCudaHlRegisterModule(const char *name)
{
SCCudaHlModuleData *data = module_datas;
SCCudaHlModuleData *data = module_data;
SCCudaHlModuleData *new_data = NULL;
while (data != NULL &&
@ -624,9 +734,8 @@ int SCCudaHlRegisterModule(const char *name)
}
if (data != NULL) {
SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Module \"%s\" already "
"registered. Returning the handle for the already "
"registered module", name);
SCLogInfo("Module \"%s\" already registered. Returning the handle "
"for the already registered module", name);
return data->handle;
}
@ -646,13 +755,13 @@ int SCCudaHlRegisterModule(const char *name)
new_data->handle = SCCudaHlGetUniqueHandle();
/* first module to be registered */
if (module_datas == NULL) {
module_datas = new_data;
if (module_data == NULL) {
module_data = new_data;
return new_data->handle;
}
/* add this new module_data instance to the global module_data list */
data = module_datas;
data = module_data;
while (data->next != NULL)
data = data->next;
data->next = new_data;
@ -723,10 +832,10 @@ int SCCudaHlDeRegisterModule(const char *name)
}
/* find the previous module data instance */
if (module_datas == data) {
module_datas = module_datas->next;
if (module_data == data) {
module_data = module_data->next;
} else {
prev_data = module_datas;
prev_data = module_data;
while (prev_data->next != data)
prev_data = prev_data->next;
prev_data->next = data->next;
@ -746,7 +855,7 @@ int SCCudaHlDeRegisterModule(const char *name)
*/
void SCCudaHlDeRegisterAllRegisteredModules(void)
{
SCCudaHlModuleData *data = module_datas;
SCCudaHlModuleData *data = module_data;
SCCudaHlModuleData *next_data = NULL;
next_data = data;
@ -759,7 +868,7 @@ void SCCudaHlDeRegisterAllRegisteredModules(void)
data = next_data;
}
module_datas = NULL;
module_data = NULL;
return;
}
@ -805,7 +914,7 @@ int SCCudaHlTestEnvCudaContextInit(void)
{
CUcontext context;
int module_handle = SCCudaHlRegisterModule("SC_RULES_CONTENT_B2G_CUDA");
if (SCCudaHlGetCudaContext(&context, module_handle) == -1) {
if (SCCudaHlGetCudaContext(&context, NULL, module_handle) == -1) {
printf("Error getting a cuda context");
}
if (SCCudaHlPushCudaContextFromModule("SC_RULES_CONTENT_B2G_CUDA") == -1) {

@ -61,7 +61,25 @@ typedef struct SCCudaHlModuleData_ {
struct SCCudaHlModuleData_ *next;
} SCCudaHlModuleData;
int SCCudaHlGetCudaContext(CUcontext *, int);
/**
 * \brief Used to hold the cuda configuration from our conf yaml file.
 *        Profiles are kept in a singly linked list headed by the
 *        handler's static cuda_profiles pointer.
 */
typedef struct SCCudaHlCudaProfile_ {
    /* profile name. Should be unique */
    char *name;
    /* the data associated with this profile.  For the "mpm" profile this
     * is an MpmCudaConf instance produced by MpmCudaConfParse() */
    void *data;
    /* next profile in the list, NULL for the last entry */
    struct SCCudaHlCudaProfile_ *next;
} SCCudaHlCudaProfile;
void SCCudaHlGetYamlConf(void);
void *SCCudaHlGetProfile(char *);
void SCCudaHlCleanProfiles(void);
void SCCudaHlBackupRegisteredProfiles(void);
void SCCudaHlRestoreBackupRegisteredProfiles(void);
int SCCudaHlGetCudaContext(CUcontext *, char *, int);
int SCCudaHlGetCudaModule(CUmodule *, const char *, int);
int SCCudaHlGetCudaModuleFromFile(CUmodule *, const char *, int);
int SCCudaHlGetCudaDevicePtr(CUdeviceptr *, const char *, size_t, void *, int);

@ -1232,7 +1232,7 @@ void B2gCudaDestroyCtx(MpmCtx *mpm_ctx)
"module_data if we are having a module_handle");
goto error;
}
if (SCCudaHlGetCudaContext(&dummy_context, ctx->module_handle) == -1) {
if (SCCudaHlGetCudaContext(&dummy_context, "mpm", ctx->module_handle) == -1) {
SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the "
"module %s", module_data->name);
goto error;
@ -1700,6 +1700,7 @@ typedef struct B2gCudaMpmThreadCtxData_ {
*/
TmEcode B2gCudaMpmDispThreadInit(ThreadVars *tv, void *initdata, void **data)
{
MpmCudaConf *profile = NULL;
SCCudaHlModuleData *module_data = (SCCudaHlModuleData *)initdata;
if (PatternMatchDefaultMatcher() != MPM_B2G_CUDA)
@ -1718,7 +1719,7 @@ TmEcode B2gCudaMpmDispThreadInit(ThreadVars *tv, void *initdata, void **data)
tctx->b2g_cuda_module_handle = module_data->handle;
if (SCCudaHlGetCudaContext(&tctx->b2g_cuda_context, module_data->handle) == -1) {
if (SCCudaHlGetCudaContext(&tctx->b2g_cuda_context, "mpm", module_data->handle) == -1) {
SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context");
goto error;
}
@ -1777,19 +1778,35 @@ TmEcode B2gCudaMpmDispThreadInit(ThreadVars *tv, void *initdata, void **data)
tctx->b2g_cuda_search_kernel_arg_total = offset;
profile = SCCudaHlGetProfile("mpm");
/* buffer to hold the b2g cuda mpm match results for 4000 packets. The
* extra 2 bytes(the 1 in 1481 instead of 1480) is to hold the no of
* matches for the payload. The remaining 1480 positions in the buffer
* is to hold the match offsets */
if (SCCudaMemHostAlloc((void**)&tctx->results_buffer, sizeof(uint16_t) * 1481 *
SC_CUDA_PB_MIN_NO_OF_PACKETS, CU_MEMHOSTALLOC_PORTABLE) == -1){
SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory\n");
exit(EXIT_FAILURE);
* extra 2 bytes(the extra + 1 ) is to hold the no of
* matches for the payload. The remaining profile->packet_size_limit
* positions in the buffer is to hold the match offsets */
if (profile->page_locked) {
if (SCCudaMemHostAlloc((void**)&tctx->results_buffer,
sizeof(uint16_t) * (profile->packet_size_limit + 1) *
profile->packet_buffer_limit,
CU_MEMHOSTALLOC_PORTABLE) == -1){
SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory\n");
exit(EXIT_FAILURE);
}
} else {
tctx->results_buffer = malloc(sizeof(uint16_t) *
(profile->packet_size_limit + 1) *
profile->packet_buffer_limit);
if (tctx->results_buffer == NULL) {
SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
exit(EXIT_FAILURE);
}
}
if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_results_buffer,
"MPM_B2G_RESULTS",
sizeof(uint16_t) * 1481 * SC_CUDA_PB_MIN_NO_OF_PACKETS,
sizeof(uint16_t) *
(profile->packet_size_limit + 1) *
profile->packet_buffer_limit,
NULL, module_data->handle) == -1) {
goto error;
}
@ -1802,22 +1819,23 @@ TmEcode B2gCudaMpmDispThreadInit(ThreadVars *tv, void *initdata, void **data)
if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_packets_buffer,
"MPM_B2G_PACKETS_BUFFER",
(sizeof(SCCudaPBPacketDataForGPU) *
SC_CUDA_PB_MIN_NO_OF_PACKETS),
profile->packet_buffer_limit *
(profile->packet_size_limit +
sizeof(SCCudaPBPacketDataForGPUNonPayload)),
NULL, module_data->handle) == -1) {
goto error;
}
if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_packets_offset_buffer,
"MPM_B2G_PACKETS_BUFFER_OFFSETS",
sizeof(uint32_t) * SC_CUDA_PB_MIN_NO_OF_PACKETS,
sizeof(uint32_t) * profile->packet_buffer_limit,
NULL, module_data->handle) == -1) {
goto error;
}
if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_packets_payload_offset_buffer,
"MPM_B2G_PACKETS_PAYLOAD_BUFFER_OFFSETS",
sizeof(uint32_t) * SC_CUDA_PB_MIN_NO_OF_PACKETS,
sizeof(uint32_t) * profile->packet_buffer_limit,
NULL, module_data->handle) == -1) {
goto error;
}
@ -1882,6 +1900,7 @@ TmEcode B2gCudaMpmDispThreadInit(ThreadVars *tv, void *initdata, void **data)
TmEcode B2gCudaMpmDispThreadDeInit(ThreadVars *tv, void *data)
{
B2gCudaMpmThreadCtxData *tctx = data;
MpmCudaConf *profile = NULL;
if (tctx == NULL) {
SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid arguments. data NULL\n");
@ -1898,16 +1917,22 @@ TmEcode B2gCudaMpmDispThreadDeInit(ThreadVars *tv, void *data)
"module_data if we are having a module_handle");
goto error;
}
if (SCCudaHlGetCudaContext(&dummy_context, tctx->b2g_cuda_module_handle) == -1) {
if (SCCudaHlGetCudaContext(&dummy_context, "mpm", tctx->b2g_cuda_module_handle) == -1) {
SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the "
"module %s", module_data->name);
goto error;
}
SCCudaCtxPushCurrent(dummy_context);
if (SCCudaMemFreeHost(tctx->results_buffer) == -1)
SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
"results_buffer\n");
profile = SCCudaHlGetProfile("mpm");
if (profile->page_locked) {
if (SCCudaMemFreeHost(tctx->results_buffer) == -1) {
SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
"results_buffer\n");
}
} else {
free(tctx->results_buffer);
}
SCCudaHlFreeCudaDevicePtr("MPM_B2G_RESULTS", tctx->b2g_cuda_module_handle);
SCCudaHlFreeCudaDevicePtr("MPM_B2G_PACKETS_BUFFER", tctx->b2g_cuda_module_handle);
SCCudaHlFreeCudaDevicePtr("MPM_B2G_PACKETS_BUFFER_OFFSETS",
@ -2291,7 +2316,7 @@ static int B2gCudaTest01(void)
/* get the cuda context and push it */
CUcontext dummy_context;
if (SCCudaHlGetCudaContext(&dummy_context, module_handle) == -1) {
if (SCCudaHlGetCudaContext(&dummy_context, "mpm", module_handle) == -1) {
SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the "
"module SC_RULES_CONTENT_B2G_CUDA");
}
@ -2323,6 +2348,7 @@ static int B2gCudaTest01(void)
result = 1;
SCCudaPBSetProfile("mpm");
pb = SCCudaPBAllocSCCudaPBPacketsBuffer();
SCCudaPBPacketDataForGPU *curr_packet = (SCCudaPBPacketDataForGPU *)pb->packets_buffer;
@ -2500,6 +2526,7 @@ static int B2gCudaTest02(void)
}
SigGroupBuild(de_ctx);
SCCudaPBSetProfile("mpm");
SCCudaPBSetUpQueuesAndBuffers();
/* get the queues used by the batcher thread */
@ -2800,6 +2827,7 @@ static int B2gCudaTest03(void)
SigGroupBuild(de_ctx);
DetectEngineThreadCtxInit(&de_tv, (void *)de_ctx, (void *)&det_ctx);
SCCudaPBSetProfile("mpm");
SCCudaPBSetUpQueuesAndBuffers();
/* get the queues used by the batcher thread */

@ -456,6 +456,717 @@ uint32_t MpmGetBloomSize(const char *conf_val)
SCReturnInt(bloom_value);
}
#ifdef __SC_CUDA_SUPPORT__
/**
 * \brief Parse the "mpm" profile under the cuda subsection of our conf file.
 *
 *        Every entry that is missing or invalid falls back to its MPM_*
 *        compile-time default.  The caller owns the returned struct and
 *        must release it with MpmCudaConfCleanup().
 *
 * \retval profile Pointer to a struct containing the parsed data.
 */
MpmCudaConf *MpmCudaConfParse(void)
{
    ConfNode *cuda_node = NULL;
    ConfNode *seq_node = NULL;

    MpmCudaConf *profile = NULL;

    const char *packet_buffer_limit = NULL;
    const char *packet_size_limit = NULL;
    const char *packet_buffers = NULL;
    const char *batching_timeout = NULL;
    const char *page_locked = NULL;
    const char *device_id = NULL;

    if ((profile = malloc(sizeof(MpmCudaConf))) == NULL) {
        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
        exit(EXIT_FAILURE);
    }
    memset(profile, 0, sizeof(MpmCudaConf));

    /* pre-load the defaults; each successfully parsed entry below
     * overwrites its default */
    profile->packet_buffer_limit = MPM_PACKET_BUFFER_LIMIT;
    profile->packet_size_limit = MPM_PACKET_SIZE_LIMIT;
    profile->packet_buffers = MPM_PACKET_BUFFERS;
    profile->batching_timeout = MPM_BATCHING_TIMEOUT;
    profile->page_locked = MPM_PAGE_LOCKED;
    profile->device_id = SC_CUDA_DEFAULT_DEVICE;

    cuda_node = ConfGetNode("cuda");
    if (cuda_node == NULL) {
        SCLogInfo("No conf found for \"cuda\" in yaml file. Use default conf");
        goto end;
    }

    /* "cuda" is a sequence of profiles; we only handle the "mpm" one */
    TAILQ_FOREACH(seq_node, &cuda_node->head, next) {
        if (strcasecmp(seq_node->val, "mpm") == 0) {
            packet_buffer_limit = ConfNodeLookupChildValue
                (seq_node->head.tqh_first, "packet_buffer_limit");
            packet_size_limit = ConfNodeLookupChildValue
                (seq_node->head.tqh_first, "packet_size_limit");
            packet_buffers = ConfNodeLookupChildValue
                (seq_node->head.tqh_first, "packet_buffers");
            batching_timeout = ConfNodeLookupChildValue
                (seq_node->head.tqh_first, "batching_timeout");
            page_locked = ConfNodeLookupChildValue
                (seq_node->head.tqh_first, "page_locked");
            device_id = ConfNodeLookupChildValue
                (seq_node->head.tqh_first, "device_id");

            /* packet_buffer_limit */
            if (packet_buffer_limit == NULL || strcasecmp(packet_buffer_limit, "") == 0) {
                SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for "
                           "cuda.mpm.packet_buffer_limit. Either NULL or empty");
            } else {
                profile->packet_buffer_limit = atoi(packet_buffer_limit);
                if (profile->packet_buffer_limit <= 0) {
                    SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for "
                               "cuda.mpm.packet_buffer_limit - %s", packet_buffer_limit);
                    profile->packet_buffer_limit = MPM_PACKET_BUFFER_LIMIT;
                }
            }

            /* packet_size_limit */
            if (packet_size_limit == NULL || strcasecmp(packet_size_limit, "") == 0) {
                SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for "
                           "cuda.mpm.packet_size_limit. Either NULL or empty");
            } else {
                profile->packet_size_limit = atoi(packet_size_limit);
                if (profile->packet_size_limit <= 0) {
                    SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for "
                               "cuda.mpm.packet_size_limit - %s", packet_size_limit);
                    profile->packet_size_limit = MPM_PACKET_SIZE_LIMIT;
                }
            }

            /* packet_buffers */
            if (packet_buffers == NULL || strcasecmp(packet_buffers, "") == 0) {
                SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for "
                           "cuda.mpm.packet_buffers. Either NULL or empty");
            } else {
                profile->packet_buffers = atoi(packet_buffers);
                if (profile->packet_buffers <= 0) {
                    SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for "
                               "cuda.mpm.packet_buffers - %s", packet_buffers);
                    profile->packet_buffers = MPM_PACKET_BUFFERS;
                }
            }

            /* batching_timeout */
            if (batching_timeout == NULL || strcasecmp(batching_timeout, "") == 0) {
                SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for "
                           "cuda.mpm.batching_timeout. Either NULL or empty");
            } else {
                profile->batching_timeout = atoi(batching_timeout);
                if (profile->batching_timeout <= 0) {
                    SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for "
                               "cuda.mpm.batching_timeout - %s", batching_timeout);
                    profile->batching_timeout = MPM_BATCHING_TIMEOUT;
                }
            }

            /* page_locked: "enabled" or "disabled" */
            if (page_locked == NULL || strcasecmp(page_locked, "") == 0) {
                SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for "
                           "cuda.mpm.page_locked. Either NULL or empty");
            } else {
                if (strcasecmp(page_locked, "enabled") == 0) {
                    profile->page_locked = MPM_PAGE_LOCKED;
                } else if (strcasecmp(page_locked, "disabled") == 0) {
                    profile->page_locked = !MPM_PAGE_LOCKED;
                } else {
                    SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for "
                               "cuda.mpm.page_locked - %s", page_locked);
                }
            }

            /* device_id.  Note: the yaml value is only range-checked here;
             * validity against the installed devices is checked later by
             * SCCudaHlGetCudaContext() */
            if (device_id == NULL || strcasecmp(device_id, "") == 0) {
                /* bug fix: added the missing period after "device_id" in
                 * the message; also dropped the no-op trailing continues */
                SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for "
                           "cuda.mpm.device_id. Either NULL or empty");
                profile->device_id = SC_CUDA_DEFAULT_DEVICE;
            } else {
                profile->device_id = atoi(device_id);
                if (profile->device_id < 0) {
                    SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for "
                               "cuda.mpm.device_id - %s", device_id);
                    profile->device_id = SC_CUDA_DEFAULT_DEVICE;
                }
            }
        } /* if (strcasecmp(seq_node->val, "mpm") == 0) */
    } /* TAILQ_FOREACH(seq_node, &cuda_node->head, next) */

 end:
    /* bug fix: the first label previously read "packet_buffer_size",
     * which is not the name of any conf entry or struct field */
    SCLogDebug("Configuration for \"cuda.mpm\"\n"
               "packet_buffer_limit: %u\n"
               "packet_size_limit: %d\n"
               "packet_buffers: %d\n"
               "batching_timeout: %d\n"
               "page_locked: %d\n"
               "device_id: %d\n",
               profile->packet_buffer_limit, profile->packet_size_limit,
               profile->packet_buffers, profile->batching_timeout,
               profile->page_locked, profile->device_id);

    return profile;
}
/**
 * \brief Cleanup the parsed "mpm" profile cuda conf.
 *
 * \param conf Profile returned by MpmCudaConfParse().  May be NULL.
 */
void MpmCudaConfCleanup(MpmCudaConf *conf)
{
    /* free(NULL) is defined as a no-op, so no explicit check is needed */
    free(conf);
}
#endif /* __SC_CUDA_SUPPORT */
/************************************Unittests*********************************/
/**
 * \brief Back up the current conf context and load a yaml string in
 *        its place, so a unittest can run against a known conf.
 *
 * \param conf NUL-terminated yaml document to load.
 *
 * \retval Return value of ConfYamlLoadString() (-1 on failure).
 */
static int MpmInitYamlConf(char *conf)
{
    size_t conf_len = strlen(conf);

    ConfCreateContextBackup();
    ConfInit();

    return ConfYamlLoadString(conf, conf_len);
}
/**
 * \brief Tear down the unittest conf context and restore the conf
 *        context that MpmInitYamlConf() backed up.
 */
static void MpmDeInitYamlConf(void)
{
    ConfDeInit();
    ConfRestoreContextBackup();

    return;
}
/**
 * \brief Test that a fully specified, valid "cuda.mpm" yaml conf is
 *        parsed verbatim into the mpm cuda profile.
 *
 * \retval 1 on success, 0 on failure.
 */
static int MpmTest01(void)
{
    int result = 0;
    DetectEngineCtx *de_ctx = NULL;
    char *conf =
        "%YAML 1.1\n"
        "---\n"
        "cuda:\n"
        " - mpm:\n"
        " packet_buffer_limit: 4000\n"
        " packet_size_limit: 1500\n"
        " packet_buffers: 10\n"
        " batching_timeout: 1\n"
        " page_locked: enabled\n"
        " device_id: 0\n";

    if (MpmInitYamlConf(conf) == -1)
        return 0;

    de_ctx = DetectEngineCtxInit();
    if (de_ctx == NULL)
        goto end;

    /* stash the currently registered profiles and parse the test conf */
    SCCudaHlBackupRegisteredProfiles();
    SCCudaHlGetYamlConf();

    MpmCudaConf *profile = SCCudaHlGetProfile("mpm");
    if (profile == NULL) {
        printf("Error retrieving mpm profile\n");
        goto end;
    }

    /* every field should hold exactly the value from the yaml conf */
    result = ((profile->packet_buffer_limit == 4000) &&
              (profile->packet_size_limit == 1500) &&
              (profile->packet_buffers == 10) &&
              (profile->batching_timeout == 1) &&
              (profile->page_locked == 1) &&
              (profile->device_id == 0));

 end:
    SCCudaHlCleanProfiles();
    if (de_ctx != NULL)
        DetectEngineCtxFree(de_ctx);
    MpmDeInitYamlConf();
    SCCudaHlRestoreBackupRegisteredProfiles();
    return result;
}
/**
 * \brief Test a second set of valid "cuda.mpm" yaml entries, including
 *        page_locked "disabled" and a non-zero device_id.
 *
 * \retval 1 on success, 0 on failure.
 */
static int MpmTest02(void)
{
    int result = 0;
    DetectEngineCtx *de_ctx = NULL;
    char *conf =
        "%YAML 1.1\n"
        "---\n"
        "cuda:\n"
        " - mpm:\n"
        " packet_buffer_limit: 4001\n"
        " packet_size_limit: 1500\n"
        " packet_buffers: 12\n"
        " batching_timeout: 10\n"
        " page_locked: disabled\n"
        " device_id: 5\n";

    if (MpmInitYamlConf(conf) == -1)
        return 0;

    de_ctx = DetectEngineCtxInit();
    if (de_ctx == NULL)
        goto end;

    /* stash the currently registered profiles and parse the test conf */
    SCCudaHlBackupRegisteredProfiles();
    SCCudaHlGetYamlConf();

    MpmCudaConf *profile = SCCudaHlGetProfile("mpm");
    if (profile == NULL) {
        printf("Error retrieving mpm profile\n");
        goto end;
    }

    /* every field should hold exactly the value from the yaml conf */
    result = ((profile->packet_buffer_limit == 4001) &&
              (profile->packet_size_limit == 1500) &&
              (profile->packet_buffers == 12) &&
              (profile->batching_timeout == 10) &&
              (profile->page_locked == 0) &&
              (profile->device_id == 5));

 end:
    SCCudaHlCleanProfiles();
    if (de_ctx != NULL)
        DetectEngineCtxFree(de_ctx);
    MpmDeInitYamlConf();
    SCCudaHlRestoreBackupRegisteredProfiles();
    return result;
}
/**
 * \brief Test that invalid "cuda.mpm" entries (zeroes, a misspelled
 *        page_locked value, a negative device_id) all fall back to the
 *        engine's internal defaults.
 *
 * \retval 1 on success, 0 on failure.
 */
static int MpmTest03(void)
{
    int result = 0;
    DetectEngineCtx *de_ctx = NULL;
    char *conf =
        "%YAML 1.1\n"
        "---\n"
        "cuda:\n"
        " - mpm:\n"
        " packet_buffer_limit: 0\n"
        " packet_size_limit: 0\n"
        " packet_buffers: 0\n"
        " batching_timeout: 0\n"
        " page_locked: enbled\n"
        " device_id: -1\n";

    if (MpmInitYamlConf(conf) == -1)
        return 0;

    de_ctx = DetectEngineCtxInit();
    if (de_ctx == NULL)
        goto end;

    /* stash the currently registered profiles and parse the test conf */
    SCCudaHlBackupRegisteredProfiles();
    SCCudaHlGetYamlConf();

    MpmCudaConf *profile = SCCudaHlGetProfile("mpm");
    if (profile == NULL) {
        printf("Error retrieving mpm profile\n");
        goto end;
    }

    /* all the invalid entries should have been replaced by defaults */
    result = ((profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT) &&
              (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT) &&
              (profile->packet_buffers == MPM_PACKET_BUFFERS) &&
              (profile->batching_timeout == MPM_BATCHING_TIMEOUT) &&
              (profile->page_locked == MPM_PAGE_LOCKED) &&
              (profile->device_id == SC_CUDA_DEFAULT_DEVICE));

 end:
    SCCudaHlCleanProfiles();
    if (de_ctx != NULL)
        DetectEngineCtxFree(de_ctx);
    MpmDeInitYamlConf();
    SCCudaHlRestoreBackupRegisteredProfiles();
    return result;
}
/**
 * \brief Test that negative "cuda.mpm" entries fall back to the
 *        engine's internal defaults.
 *
 * \retval 1 on success, 0 on failure.
 */
static int MpmTest04(void)
{
    int result = 0;
    DetectEngineCtx *de_ctx = NULL;
    char *conf =
        "%YAML 1.1\n"
        "---\n"
        "cuda:\n"
        " - mpm:\n"
        " packet_buffer_limit: -1\n"
        " packet_size_limit: -1\n"
        " packet_buffers: -1\n"
        " batching_timeout: -1\n"
        " page_locked: enbled\n"
        " device_id: -1\n";

    if (MpmInitYamlConf(conf) == -1)
        return 0;

    de_ctx = DetectEngineCtxInit();
    if (de_ctx == NULL)
        goto end;

    /* stash the currently registered profiles and parse the test conf */
    SCCudaHlBackupRegisteredProfiles();
    SCCudaHlGetYamlConf();

    MpmCudaConf *profile = SCCudaHlGetProfile("mpm");
    if (profile == NULL) {
        printf("Error retrieving mpm profile\n");
        goto end;
    }

    /* all the negative entries should have been replaced by defaults */
    result = ((profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT) &&
              (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT) &&
              (profile->packet_buffers == MPM_PACKET_BUFFERS) &&
              (profile->batching_timeout == MPM_BATCHING_TIMEOUT) &&
              (profile->page_locked == MPM_PAGE_LOCKED) &&
              (profile->device_id == SC_CUDA_DEFAULT_DEVICE));

 end:
    SCCudaHlCleanProfiles();
    if (de_ctx != NULL)
        DetectEngineCtxFree(de_ctx);
    MpmDeInitYamlConf();
    SCCudaHlRestoreBackupRegisteredProfiles();
    return result;
}
/**
 * \brief Test a mixed conf: value-less entries fall back to the
 *        defaults while the valid entries are picked up verbatim.
 *
 * \retval 1 on success, 0 on failure.
 */
static int MpmTest05(void)
{
    int result = 0;
    DetectEngineCtx *de_ctx = NULL;
    char *conf =
        "%YAML 1.1\n"
        "---\n"
        "cuda:\n"
        " - mpm:\n"
        " packet_buffer_limit:\n"
        " packet_size_limit:\n"
        " packet_buffers:\n"
        " batching_timeout: 2\n"
        " page_locked: enabled\n"
        " device_id: 1\n";

    if (MpmInitYamlConf(conf) == -1)
        return 0;

    de_ctx = DetectEngineCtxInit();
    if (de_ctx == NULL)
        goto end;

    /* stash the currently registered profiles and parse the test conf */
    SCCudaHlBackupRegisteredProfiles();
    SCCudaHlGetYamlConf();

    MpmCudaConf *profile = SCCudaHlGetProfile("mpm");
    if (profile == NULL) {
        printf("Error retrieving mpm profile\n");
        goto end;
    }

    /* empty entries -> defaults; specified entries -> yaml values */
    result = ((profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT) &&
              (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT) &&
              (profile->packet_buffers == MPM_PACKET_BUFFERS) &&
              (profile->batching_timeout == 2) &&
              (profile->page_locked == 1) &&
              (profile->device_id == 1));

 end:
    SCCudaHlCleanProfiles();
    if (de_ctx != NULL)
        DetectEngineCtxFree(de_ctx);
    MpmDeInitYamlConf();
    SCCudaHlRestoreBackupRegisteredProfiles();
    return result;
}
/**
 * \brief Test that entries whose value is just trailing whitespace all
 *        fall back to the engine's internal defaults.
 *
 * \retval 1 on success, 0 on failure.
 */
static int MpmTest06(void)
{
    int result = 0;
    DetectEngineCtx *de_ctx = NULL;
    char *conf =
        "%YAML 1.1\n"
        "---\n"
        "cuda:\n"
        " - mpm:\n"
        " packet_buffer_limit: \n"
        " packet_size_limit: \n"
        " packet_buffers: \n"
        " batching_timeout: \n"
        " page_locked: \n"
        " device_id: \n";

    if (MpmInitYamlConf(conf) == -1)
        return 0;

    de_ctx = DetectEngineCtxInit();
    if (de_ctx == NULL)
        goto end;

    /* stash the currently registered profiles and parse the test conf */
    SCCudaHlBackupRegisteredProfiles();
    SCCudaHlGetYamlConf();

    MpmCudaConf *profile = SCCudaHlGetProfile("mpm");
    if (profile == NULL) {
        printf("Error retrieving mpm profile\n");
        goto end;
    }

    /* all entries are effectively empty -> every field gets its default */
    result = ((profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT) &&
              (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT) &&
              (profile->packet_buffers == MPM_PACKET_BUFFERS) &&
              (profile->batching_timeout == MPM_BATCHING_TIMEOUT) &&
              (profile->page_locked == MPM_PAGE_LOCKED) &&
              (profile->device_id == SC_CUDA_DEFAULT_DEVICE));

 end:
    SCCudaHlCleanProfiles();
    if (de_ctx != NULL)
        DetectEngineCtxFree(de_ctx);
    MpmDeInitYamlConf();
    SCCudaHlRestoreBackupRegisteredProfiles();
    return result;
}
/**
 * \brief Test that entries with no value at all fall back to the
 *        engine's internal defaults.
 *
 * \retval 1 on success, 0 on failure.
 */
static int MpmTest07(void)
{
    int result = 0;
    DetectEngineCtx *de_ctx = NULL;
    char *conf =
        "%YAML 1.1\n"
        "---\n"
        "cuda:\n"
        " - mpm:\n"
        " packet_buffer_limit:\n"
        " packet_size_limit:\n"
        " packet_buffers:\n"
        " batching_timeout:\n"
        " page_locked:\n"
        " device_id:\n";

    if (MpmInitYamlConf(conf) == -1)
        return 0;

    de_ctx = DetectEngineCtxInit();
    if (de_ctx == NULL)
        goto end;

    /* stash the currently registered profiles and parse the test conf */
    SCCudaHlBackupRegisteredProfiles();
    SCCudaHlGetYamlConf();

    MpmCudaConf *profile = SCCudaHlGetProfile("mpm");
    if (profile == NULL) {
        printf("Error retrieving mpm profile\n");
        goto end;
    }

    /* all entries are empty -> every field gets its default */
    result = ((profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT) &&
              (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT) &&
              (profile->packet_buffers == MPM_PACKET_BUFFERS) &&
              (profile->batching_timeout == MPM_BATCHING_TIMEOUT) &&
              (profile->page_locked == MPM_PAGE_LOCKED) &&
              (profile->device_id == SC_CUDA_DEFAULT_DEVICE));

 end:
    SCCudaHlCleanProfiles();
    if (de_ctx != NULL)
        DetectEngineCtxFree(de_ctx);
    MpmDeInitYamlConf();
    SCCudaHlRestoreBackupRegisteredProfiles();
    return result;
}
/**
 * \brief Test a partial conf: the omitted entries fall back to the
 *        defaults while the ones that are specified are picked up.
 *
 * \retval 1 on success, 0 on failure.
 */
static int MpmTest08(void)
{
    int result = 0;
    DetectEngineCtx *de_ctx = NULL;
    char *conf =
        "%YAML 1.1\n"
        "---\n"
        "cuda:\n"
        " - mpm:\n"
        " packet_size_limit: 2000\n"
        " page_locked: disabled\n"
        " device_id: 4\n";

    if (MpmInitYamlConf(conf) == -1)
        return 0;

    de_ctx = DetectEngineCtxInit();
    if (de_ctx == NULL)
        goto end;

    /* stash the currently registered profiles and parse the test conf */
    SCCudaHlBackupRegisteredProfiles();
    SCCudaHlGetYamlConf();

    MpmCudaConf *profile = SCCudaHlGetProfile("mpm");
    if (profile == NULL) {
        printf("Error retrieving mpm profile\n");
        goto end;
    }

    /* missing entries -> defaults; specified entries -> yaml values */
    result = ((profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT) &&
              (profile->packet_size_limit == 2000) &&
              (profile->packet_buffers == MPM_PACKET_BUFFERS) &&
              (profile->batching_timeout == MPM_BATCHING_TIMEOUT) &&
              (profile->page_locked == !MPM_PAGE_LOCKED) &&
              (profile->device_id == 4));

 end:
    SCCudaHlCleanProfiles();
    if (de_ctx != NULL)
        DetectEngineCtxFree(de_ctx);
    MpmDeInitYamlConf();
    SCCudaHlRestoreBackupRegisteredProfiles();
    return result;
}
/**
 * \brief Test an "mpm" node with no entries at all: the profile should
 *        still be available, holding nothing but the defaults.
 *
 * \retval 1 on success, 0 on failure.
 */
static int MpmTest09(void)
{
    int result = 0;
    DetectEngineCtx *de_ctx = NULL;
    char *conf =
        "%YAML 1.1\n"
        "---\n"
        "cuda:\n"
        " - mpm:\n";

    if (MpmInitYamlConf(conf) == -1)
        return 0;

    de_ctx = DetectEngineCtxInit();
    if (de_ctx == NULL)
        goto end;

    /* stash the currently registered profiles and parse the test conf */
    SCCudaHlBackupRegisteredProfiles();
    SCCudaHlGetYamlConf();

    MpmCudaConf *profile = SCCudaHlGetProfile("mpm");
    if (profile == NULL) {
        printf("Error retrieving mpm profile\n");
        goto end;
    }

    /* nothing specified -> every field gets its default */
    result = ((profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT) &&
              (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT) &&
              (profile->packet_buffers == MPM_PACKET_BUFFERS) &&
              (profile->batching_timeout == MPM_BATCHING_TIMEOUT) &&
              (profile->page_locked == MPM_PAGE_LOCKED) &&
              (profile->device_id == SC_CUDA_DEFAULT_DEVICE));

 end:
    SCCudaHlCleanProfiles();
    if (de_ctx != NULL)
        DetectEngineCtxFree(de_ctx);
    MpmDeInitYamlConf();
    SCCudaHlRestoreBackupRegisteredProfiles();
    return result;
}
/**
 * \brief Test a "cuda" node with no profiles under it: the mpm profile
 *        should still be available, holding nothing but the defaults.
 *
 * \retval 1 on success, 0 on failure.
 */
static int MpmTest10(void)
{
    int result = 0;
    DetectEngineCtx *de_ctx = NULL;
    char *conf =
        "%YAML 1.1\n"
        "---\n"
        "cuda:\n";

    if (MpmInitYamlConf(conf) == -1)
        return 0;

    de_ctx = DetectEngineCtxInit();
    if (de_ctx == NULL)
        goto end;

    /* stash the currently registered profiles and parse the test conf */
    SCCudaHlBackupRegisteredProfiles();
    SCCudaHlGetYamlConf();

    MpmCudaConf *profile = SCCudaHlGetProfile("mpm");
    if (profile == NULL) {
        printf("Error retrieving mpm profile\n");
        goto end;
    }

    /* no mpm node -> every field gets its default */
    result = ((profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT) &&
              (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT) &&
              (profile->packet_buffers == MPM_PACKET_BUFFERS) &&
              (profile->batching_timeout == MPM_BATCHING_TIMEOUT) &&
              (profile->page_locked == MPM_PAGE_LOCKED) &&
              (profile->device_id == SC_CUDA_DEFAULT_DEVICE));

 end:
    SCCudaHlCleanProfiles();
    if (de_ctx != NULL)
        DetectEngineCtxFree(de_ctx);
    MpmDeInitYamlConf();
    SCCudaHlRestoreBackupRegisteredProfiles();
    return result;
}
/**
 * \brief Test a conf with no "cuda" node at all: the mpm profile should
 *        still be available, holding nothing but the defaults.
 *
 * \retval 1 on success, 0 on failure.
 */
static int MpmTest11(void)
{
    int result = 0;
    DetectEngineCtx *de_ctx = NULL;
    char *conf =
        "%YAML 1.1\n"
        "---\n";

    if (MpmInitYamlConf(conf) == -1)
        return 0;

    de_ctx = DetectEngineCtxInit();
    if (de_ctx == NULL)
        goto end;

    /* stash the currently registered profiles and parse the test conf */
    SCCudaHlBackupRegisteredProfiles();
    SCCudaHlGetYamlConf();

    MpmCudaConf *profile = SCCudaHlGetProfile("mpm");
    if (profile == NULL) {
        printf("Error retrieving mpm profile\n");
        goto end;
    }

    /* no cuda node -> every field gets its default */
    result = ((profile->packet_buffer_limit == MPM_PACKET_BUFFER_LIMIT) &&
              (profile->packet_size_limit == MPM_PACKET_SIZE_LIMIT) &&
              (profile->packet_buffers == MPM_PACKET_BUFFERS) &&
              (profile->batching_timeout == MPM_BATCHING_TIMEOUT) &&
              (profile->page_locked == MPM_PAGE_LOCKED) &&
              (profile->device_id == SC_CUDA_DEFAULT_DEVICE));

 end:
    SCCudaHlCleanProfiles();
    if (de_ctx != NULL)
        DetectEngineCtxFree(de_ctx);
    MpmDeInitYamlConf();
    SCCudaHlRestoreBackupRegisteredProfiles();
    return result;
}
void MpmRegisterTests(void) {
#ifdef UNITTESTS
uint16_t i;
@ -467,6 +1178,17 @@ void MpmRegisterTests(void) {
printf("Warning: mpm %s has no unittest registration function...", mpm_table[i].name);
}
}
UtRegisterTest("MpmTest01", MpmTest01, 1);
UtRegisterTest("MpmTest02", MpmTest02, 1);
UtRegisterTest("MpmTest03", MpmTest03, 1);
UtRegisterTest("MpmTest04", MpmTest04, 1);
UtRegisterTest("MpmTest05", MpmTest05, 1);
UtRegisterTest("MpmTest06", MpmTest06, 1);
UtRegisterTest("MpmTest07", MpmTest07, 1);
UtRegisterTest("MpmTest08", MpmTest08, 1);
UtRegisterTest("MpmTest09", MpmTest09, 1);
UtRegisterTest("MpmTest10", MpmTest10, 1);
UtRegisterTest("MpmTest11", MpmTest11, 1);
#endif
}

@ -50,6 +50,13 @@
pattern matcher algorithms */
#define BLOOMSIZE_HIGH 2048 /**< High bloomfilter size for the multi
pattern matcher algorithms */
/* Engine-internal defaults for the cuda "mpm" profile.  These are the
 * fallback values used when the corresponding "cuda.mpm" yaml entry is
 * missing, empty or invalid (see MpmCudaConfParse()). */
#define MPM_PACKET_BUFFER_LIMIT 2400 /**< default no of packets buffered before the buffer is handed to the gpu */
#define MPM_PACKET_SIZE_LIMIT 1500   /**< default max length of a packet buffered for the gpu */
#define MPM_PACKET_BUFFERS 10        /**< default no of packet buffers initialized */
#define MPM_BATCHING_TIMEOUT 1       /**< default batching timeout, in seconds */
#define MPM_PAGE_LOCKED 1            /**< default page_locked flag: 1 == "enabled" */
enum {
MPM_NOTSET = 0,
@ -178,12 +185,35 @@ MpmCtx *MpmFactoryGetMpmCtxForProfile(int32_t);
void MpmFactoryDeRegisterAllMpmCtxProfiles(void);
int32_t MpmFactoryIsMpmCtxAvailable(MpmCtx *);
/* macros decides if cuda is enabled for the platform or not */
#ifdef __SC_CUDA_SUPPORT__
/**
* \brief Cuda configuration for "mpm" profile. We can further extend this
* to have conf for specific mpms. For now its common for all mpms.
*/
typedef struct MpmCudaConf_ {
    int32_t packet_buffer_limit; /**< no of packets to buffer before the buffer is passed to the gpu (default MPM_PACKET_BUFFER_LIMIT) */
    int16_t packet_size_limit;   /**< max packet length buffered for the gpu; larger packets are mpm'ed on the cpu (default MPM_PACKET_SIZE_LIMIT) */
    int8_t packet_buffers;       /**< no of packet buffers to initialize (default MPM_PACKET_BUFFERS) */
    int8_t batching_timeout;     /**< secs after which a partially filled buffer is passed to the gpu anyway (default MPM_BATCHING_TIMEOUT) */
    int8_t page_locked;          /**< use page-locked memory where possible: 1 == "enabled", 0 == "disabled" */
    int8_t device_id;            /**< cuda device to use; see "suricata --list-cuda-cards" (default SC_CUDA_DEFAULT_DEVICE) */
} MpmCudaConf;
#endif /* __SC_CUDA_SUPPORT__ */
int PmqSetup(PatternMatcherQueue *, uint32_t, uint32_t);
void PmqMerge(PatternMatcherQueue *src, PatternMatcherQueue *dst);
void PmqReset(PatternMatcherQueue *);
void PmqCleanup(PatternMatcherQueue *);
void PmqFree(PatternMatcherQueue *);
#ifdef __SC_CUDA_SUPPORT__
MpmCudaConf *MpmCudaConfParse(void);
void MpmCudaConfCleanup(MpmCudaConf *);
#endif /* __SC_CUDA_SUPPORT */
void MpmTableSetup(void);
void MpmRegisterTests(void);
@ -197,4 +227,3 @@ uint32_t MpmGetHashSize(const char *);
uint32_t MpmGetBloomSize(const char *);
#endif /* __UTIL_MPM_H__ */

@ -152,11 +152,32 @@ threading:
#
detect_thread_ratio: 1.5
# Select the cuda device to use. The device_id identifies the device to be used
# if one has multiple devices on the system. To find out device_id associated
# with the card(s) on the system run "suricata --list-cuda-cards".
# Cuda configuration.
cuda:
device_id: 0
# The "mpm" profile. If any of these parameters is left unspecified, the
# engine's internal default value is used, which is the same as the one
# specified here.
- mpm:
# Threshold limit for the number of packets buffered to the GPU. Once we hit
# this limit, we pass the buffer to the gpu.
packet_buffer_limit: 2400
# The maximum length for a packet that we would buffer to the gpu.
# Anything over this is MPM'ed on the CPU. All entries > 0 are valid.
packet_size_limit: 1500
# Number of packet buffers we initialize. All entries > 0 are valid.
packet_buffers: 10
# The timeout limit for batching of packets in secs. If we don't fill the
# buffer within this timeout limit, we pass the currently filled buffer to the gpu.
# All entries > 0 are valid.
batching_timeout: 1
# Specifies whether to use page_locked memory wherever possible. Accepted values
# are "enabled" and "disabled".
page_locked: enabled
# The device to use for the mpm. Currently we don't support load balancing
# on multiple gpus. In case you have multiple devices on your system, you
# can specify the device to use with this conf. The default is 0, i.e. the
# first device cuda sees. To find out the device_id associated with the
# card(s) on the system, run "suricata --list-cuda-cards".
device_id: 0
# Select the multi pattern algorithm you want to run for scan/search in
# the engine. The supported algorithms are b2g, b2gc, b2gm, b3g, wumanber,

Loading…
Cancel
Save