/**
 * Copyright (c) 2010 Open Information Security Foundation.
 *
 * \author Anoop Saldanha <poonaatsoc@gmail.com>
 *
 * \todo
 *       1 Implement a gpu version of aho-corasick.  That should get rid of a
 *         lot of post processing and pattern_chopping, and we don't have to
 *         deal with one or two byte patterns. (currently in process)
 *       2 Use texture/shared memory.  This should be handled along with 1 and 6.
 *       3 Currently a lot of packets(~17k) are getting stuck on the detection
 *         thread, which is a major bottleneck.  Introduce bypass detection
 *         threads for these 15k non buffered packets and check how the alerts
 *         are affected by this(out of sequence handling by detection threads).
 *       4 Test the use of mapped memory(if possible anywhere).
 *       5 Check parallelising memcopies with kernel execution.
 *       6 Test this feature - Rearrange the packet stream(either on cpu or gpu),
 *         where each block in the gpu can access the packet with non-coalesced
 *         reads.
 *         2 packets p1 -> aabb ccdd
 *                   p2 -> eeff gghh
 *
 *         stream -> aabbeeffccddgghh.
 *
 *         Modify the block size to 16 threads for CC < 2.0 devices and 32 for
 *         >= 2.0.
 *
 *         The rearrangement of packet stream can be done on the gpu, with no
 *         perf degradation, using coalesced reads.  Padding packets need to
 *         be addressed though.
 *         (Need to give more thought to this task).
 *
 *         -- Feel free to pick any task from the agenda, but please
 *         drop a mail to dev mailing list(or directly to the dev team).  Better
 *         yet, open a feature request on our bug/feature tracker
 *         (https://redmine.openinfosecfoundation.org/issues).  Will be a mess if
 *         2 or more devs end up working on the same task or related tasks.
 */

/* compile this file in only if CUDA support is enabled for this build */
#ifdef __SC_CUDA_SUPPORT__

#include "suricata-common.h"
#include "suricata.h"

#include "detect.h"
#include "decode.h"
#include "flow.h"
#include "data-queue.h"

#include "threads.h"
#include "threadvars.h"
#include "tm-queuehandlers.h"
#include "tm-modules.h"

#include "cuda-packet-batcher.h"
#include "conf.h"

#include "util-error.h"
#include "util-debug.h"
#include "util-unittest.h"

#include "util-mpm-b2g-cuda.h"
#include "util-cuda-handlers.h"
#include "detect-engine-address.h"
#include "detect-engine-port.h"
#include "detect-engine.h"
#include "detect-parse.h"
#include "tm-threads.h"
#include "tmqh-packetpool.h"
#include "util-mpm.h"

/* \todo Make this user configurable through our yaml file.  Also provide options
 * where this can be dynamically updated based on the traffic */
#define SC_CUDA_PB_BATCHER_ALARM_TIME 1

/* hold the inq and outq between the cuda-packet-batcher TM and the cuda b2g mpm
 * dispatcher thread */
static Tmq *tmq_inq = NULL;
static Tmq *tmq_outq = NULL;

/* holds the packet inq between the batcher TM and the TM feeding it packets
 * in the runmode sequence.  We need this to implement the alarm.  A SIGALRM is
 * delivered once the batching timeout expires; its handler sets a flag
 * informing the batcher TM to queue the buffer to the GPU and wakes the
 * batcher thread, in case it is waiting on a conditional for a packet from the
 * previous TM in the runmode */
static Tmq *tmq_batcher_inq = NULL;

/* used to indicate that we want to stop buffering packets.  We need this
 * when we want to shut the engine down
 * \todo give a better description */
static int run_batcher = 1;

/* indicates the maximum no of packets we are ready to buffer.  Theoretically the
 * maximum value held by this var can't exceed the value held by
 * "max_pending_packets".  Either way we should make this user configurable like
 * SC_CUDA_PB_BATCHER_ALARM_TIME.  Also allow dynamic updates to this value based
 * on the traffic
 * \todo make this user configurable, as well allow dynamic update of this
 * variable based on the traffic seen */
static uint32_t buffer_packet_threshhold = 0;

/* the profile used by the cuda batcher */
static MpmCudaConf *profile = NULL;

/* flag set by the SIGALRM handler to indicate that the batcher TM should queue
 * the buffer to be processed by the Cuda Mpm B2g Batcher Thread for further
 * processing on the GPU.  Since it is written from a signal handler and read
 * from normal code it has to be a volatile sig_atomic_t; a plain int gives no
 * guarantee that the store is seen (or is even performed atomically) */
static volatile sig_atomic_t queue_buffer = 0;

/* struct to configure the SIGALRM frequency. */
static struct itimerval itimer = {{0, 0}, {0, 0}};

/* when set, the batcher alarm timer is never armed */
static int unittest_mode = 0;

/**
 * \internal
 * \brief The SIGALRM handler.  Sets the "queue_buffer" flag, thus informing
 *        the batcher TM that it needs to queue the buffer, and signals the
 *        cond var of the batcher TM inq (the one it receives packets from),
 *        in case the batcher is waiting on that conditional for a new packet
 *        from the previous TM in the runmodes list.
 *
 *        No logging is done here on purpose:  the handler runs in signal
 *        context and formatted logging is not async-signal-safe.
 *
 *        NOTE(review): SCCondSignal presumably wraps pthread_cond_signal(),
 *        which POSIX does not list as async-signal-safe either.  Left in
 *        place because the batcher wake-up depends on it -- confirm/redesign.
 *
 * \param signum The signal number that this function just woke up to.  In
 *               our case it is SIGALRM.  Unused.
 */
static void SCCudaPBSetQueueBufferFlag(int signum)
{
    (void)signum;

    queue_buffer = 1;

    /* the inq is only known once the batcher thread init has run; without it
     * there is nobody to wake up, and dereferencing it would crash */
    if (tmq_batcher_inq != NULL) {
        SCCondSignal(&((&trans_q[tmq_batcher_inq->id])->cond_q));
    }

    return;
}

/**
 * \internal.
 * \brief Set the SIG_ALRM handler
 */
static void SCCudaPBSetBatcherAlarmTimeHandler()
{
    struct sigaction action;

    SCLogDebug("Setting the SIGALRM handler for the Cuda Batcher TM");
    action.sa_handler = SCCudaPBSetQueueBufferFlag;
    sigemptyset(&(action.sa_mask));
    sigaddset(&(action.sa_mask), SIGALRM);
    action.sa_flags = 0;
    sigaction(SIGALRM, &action, 0);

    itimer.it_value.tv_sec = profile->batching_timeout;
    itimer.it_value.tv_usec = (profile->batching_timeout
                               - (int32_t) profile->batching_timeout) * 1000000;

    return;
}

/**
 * \internal
 * \brief Clears the "queue_buffer" flag and re-arms the batcher timer.
 */
static inline void SCCudaPBResetBatcherAlarm()
{
    queue_buffer = 0;

    /* never arm the timer while running unittests.  It will only cause a
     * seg fault if the tests take too long */
    if (unittest_mode)
        return;

    /* \todo We could update itimer dynamically based on the traffic */
    setitimer(ITIMER_REAL, &itimer, NULL);
}

/**
 * \internal
 * \brief Looks up the Signature Group Head for a packet by walking the
 *        source address group, the destination address group and, when the
 *        address group carries ports, the source and destination port groups.
 *
 * \param de_ctx Pointer to the detection engine context holding the flow
 *               group heads to search.
 * \param p      Pointer to the incoming packet for which a sgh is needed.
 *
 * \retval sgh Pointer to the matching sgh for the Packet, or NULL if any
 *             step of the lookup finds no group.
 */
static SigGroupHead *SCCudaPBGetSgh(DetectEngineCtx *de_ctx, Packet *p)
{
    /* select the flow_gh:  0 for packets going to the client, 1 otherwise */
    const int dir = (p->flowflags & FLOW_PKT_TOCLIENT) ? 0 : 1;

    /* find the right mpm instance, starting with the source address group */
    DetectAddress *src_ag = DetectAddressLookupInHead(de_ctx->flow_gh[dir].src_gh[p->proto], &p->src);
    if (src_ag == NULL) {
        SCLogDebug("no src address group found for the packet");
        return NULL;
    }

    /* source group found, lets try a dst group */
    DetectAddress *dst_ag = DetectAddressLookupInHead(src_ag->dst_gh,&p->dst);
    if (dst_ag == NULL) {
        SCLogDebug("no dst address group found for the packet");
        return NULL;
    }

    if (dst_ag->port == NULL) {
        SCLogDebug("we don't have ports");
        return dst_ag->sh;
    }

    SCLogDebug("we have ports");

    DetectPort *sport = DetectPortLookupGroup(dst_ag->port,p->sp);
    if (sport == NULL) {
        SCLogDebug("no src port group found for the packet with sp %"PRIu16, p->sp);
        return NULL;
    }

    DetectPort *dport = DetectPortLookupGroup(sport->dst_ph, p->dp);
    if (dport == NULL) {
        SCLogDebug("no dst port group found for the packet with dp %"PRIu16, p->dp);
        return NULL;
    }

    return dport->sh;
}

/**
 * \internal
 * \brief Handles the queuing of the buffer from this batcher TM to the cuda
 *        mpm b2g dispatcher TM, and then picks up a fresh buffer from the
 *        batcher's data inq to continue batching into.
 *
 *        If the current buffer holds no packets, nothing is queued and the
 *        current buffer is kept.
 *
 *        NOTE(review): if run_batcher is cleared while waiting for a fresh
 *        buffer, tctx->curr_pb still points at the buffer that was just handed
 *        to the dispatcher queue -- confirm who owns it during shutdown.
 *
 * \param tctx The batcher thread context that holds the current operational
 *             buffer which has to be queued by this function.
 */
static void SCCudaPBQueueBuffer(SCCudaPBThreadCtx *tctx)
{
    SCCudaPBPacketsBuffer *pb = (SCCudaPBPacketsBuffer *)tctx->curr_pb;

    /* if we have no packets buffered in so far, get out.  This has to be
     * checked before the last packet is looked up, since index
     * "nop_in_buffer - 1" doesn't exist for an empty buffer */
    if (pb->nop_in_buffer == 0) {
        SCLogDebug("No packets buffered in so far in the cuda buffer.  Returning");
        return;
    }

    uint32_t last_idx = pb->nop_in_buffer - 1;
    uint32_t last_offset = pb->packets_offset_buffer[last_idx];
    SCCudaPBPacketDataForGPU *last_packet =
        (SCCudaPBPacketDataForGPU *)(pb->packets_buffer + last_offset);

    /* calculate the total length of all the packets buffered in */
    pb->packets_buffer_len = last_offset +
        sizeof(SCCudaPBPacketDataForGPUNonPayload) +
        last_packet->payload_len;

    pb->packets_total_payload_len = pb->packets_payload_offset_buffer[last_idx] +
        last_packet->payload_len;

    /* enqueue the buffer in the outq to be consumed by the dispatcher TM */
    SCDQDataQueue *dq_outq = &data_queues[tmq_outq->id];
    SCMutexLock(&dq_outq->mutex_q);
    SCDQDataEnqueue(dq_outq, (SCDQGenericQData *)tctx->curr_pb);
    SCCondSignal(&dq_outq->cond_q);
    SCMutexUnlock(&dq_outq->mutex_q);

    while (run_batcher) {
        /* dequeue a new buffer */
        SCDQDataQueue *dq_inq = &data_queues[tmq_inq->id];
        SCMutexLock(&dq_inq->mutex_q);
        if (dq_inq->len == 0) {
            /* if we have no data in queue, wait... */
            SCCondWait(&dq_inq->cond_q, &dq_inq->mutex_q);
        }

        if (run_batcher == 0) {
            /* we are shutting down.  Don't leave with the queue mutex held */
            SCMutexUnlock(&dq_inq->mutex_q);
            break;
        }

        if (dq_inq->len > 0) {
            tctx->curr_pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq_inq);
            tctx->curr_pb->nop_in_buffer = 0;
            tctx->curr_pb->packets_buffer_len = 0;
            tctx->curr_pb->packets_total_payload_len = 0;
            SCMutexUnlock(&dq_inq->mutex_q);
            SCLogDebug("Dequeued a new packet buffer for the cuda batcher TM");
            break;
        } else {
            /* Should only happen on signals. */
            SCMutexUnlock(&dq_inq->mutex_q);
            SCLogDebug("Unable to Relooping in the quest to dequeue new buffer");
        }
    } /* while (run_batcher) */

    return;
}

/**
 * \brief Custom single-slot thread function used by the Batcher TM.  Unlike
 *        the stock Slot1 function in tm-threads.c, the slot function is also
 *        invoked when no packet was received from the input queue.
 *
 *        Does not return to its caller:  the thread ends through
 *        pthread_exit(), with -1 if the slot's init or deinit callback fails
 *        and 0 otherwise.
 *
 * \param td Pointer to the ThreadVars instance.  In this case the batcher TM's
 *           ThreadVars instance.
 */
void *SCCudaPBTmThreadsSlot1(void *td)
{
    ThreadVars *tv = (ThreadVars *)td;
    Tm1Slot *slot = (Tm1Slot *)tv->tm_slots;
    Packet *pkt = NULL;
    TmEcode rc = TM_ECODE_OK;

    /* Set the thread name */
    SCSetThreadName(tv->name);

    if (tv->thread_setup_flags != 0) {
        TmThreadSetupOptions(tv);
    }

    SCLogDebug("%s starting", tv->name);

    if (slot->s.SlotThreadInit != NULL) {
        rc = slot->s.SlotThreadInit(tv, slot->s.slot_initdata, &slot->s.slot_data);
        if (rc != TM_ECODE_OK) {
            EngineKill();

            TmThreadsSetFlag(tv, THV_CLOSED);
            pthread_exit((void *) -1);
        }
    }
    memset(&slot->s.slot_pre_pq, 0, sizeof(PacketQueue));
    memset(&slot->s.slot_post_pq, 0, sizeof(PacketQueue));

    TmThreadsSetFlag(tv, THV_INIT_DONE);

    for (;;) {
        TmThreadTestThreadUnPaused(tv);

        /* input a packet */
        pkt = tv->tmqh_in(tv);

        if (pkt == NULL) {
            SCLogDebug("packet is NULL for TM: %s", tv->name);
        }

        /* the slot function is called even without a packet.  The SIGALRM
         * handler may have woken us (while we were waiting on the cond from
         * the feeder TM) only to get the buffer queued to the GPU, so the
         * slot function has to cope with a NULL packet */
        rc = slot->s.SlotFunc(tv, pkt, slot->s.slot_data, NULL, NULL);

        if (pkt != NULL) {
            /* handle error */
            if (rc == TM_ECODE_FAILED) {
                TmqhOutputPacketpool(tv, pkt);
                TmThreadsSetFlag(tv, THV_FAILED);
                break;
            }

            /* output the packet */
            tv->tmqh_out(tv, pkt);
        }

        if (TmThreadsCheckFlag(tv, THV_KILL)) {
            SCPerfUpdateCounterArray(tv->sc_perf_pca, &tv->sc_perf_pctx, 0);
            break;
        }
    }

    if (slot->s.SlotThreadExitPrintStats != NULL) {
        slot->s.SlotThreadExitPrintStats(tv, slot->s.slot_data);
    }

    if (slot->s.SlotThreadDeinit != NULL) {
        rc = slot->s.SlotThreadDeinit(tv, slot->s.slot_data);
        if (rc != TM_ECODE_OK) {
            TmThreadsSetFlag(tv, THV_CLOSED);
            pthread_exit((void *) -1);
        }
    }

    SCLogDebug("%s ending", tv->name);
    TmThreadsSetFlag(tv, THV_CLOSED);
    pthread_exit((void *) 0);
}

/**
 * \brief Used to de-allocate an instance of SCCudaPBPacketsBuffer.
 *
 *        The packets, offset and payload-offset buffers are obtained either
 *        from SCCudaMemHostAlloc() (profile->page_locked set) or from plain
 *        malloc() (see SCCudaPBAllocSCCudaPBPacketsBuffer()), so they are
 *        released with the matching call.  If no profile is available the
 *        buffers are released through SCCudaMemFreeHost().
 *
 * \param pb Pointer to the SCCudaPacketsBuffer instance to be de-alloced.
 *           NULL is accepted and ignored.
 */
void SCCudaPBDeAllocSCCudaPBPacketsBuffer(SCCudaPBPacketsBuffer *pb)
{
    if (pb == NULL)
        return;

    const int page_locked = (profile == NULL) || profile->page_locked;

    if (pb->packets_buffer != NULL) {
        if (!page_locked) {
            free(pb->packets_buffer);
        } else if (SCCudaMemFreeHost(pb->packets_buffer) == -1) {
            SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
                       "packets_buffer");
        }
    }
    if (pb->packets_offset_buffer != NULL) {
        if (!page_locked) {
            free(pb->packets_offset_buffer);
        } else if (SCCudaMemFreeHost(pb->packets_offset_buffer) == -1) {
            SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
                       "packets_offset_buffer");
        }
    }
    if (pb->packets_payload_offset_buffer != NULL) {
        if (!page_locked) {
            free(pb->packets_payload_offset_buffer);
        } else if (SCCudaMemFreeHost(pb->packets_payload_offset_buffer) == -1) {
            SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory "
                       "packets_payload_offset_buffer");
        }
    }

    /* the address buffer always comes from malloc(); free(NULL) is a no-op */
    free(pb->packets_address_buffer);

    free(pb);

    return;
}

/**
 * \brief Allocates a new instance of SCCudaPBPacketsBuffer.
 *
 * \param pb The newly created instance of SCCudaPBPacketsBuffer.
 */
SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void)
{
    SCCudaPBPacketsBuffer *pb = malloc(sizeof(SCCudaPBPacketsBuffer));
    if (pb == NULL) {
        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
        exit(EXIT_FAILURE);
    }
    memset(pb, 0, sizeof(SCCudaPBPacketsBuffer));

    /* Register new module, needed for some unit tests */
    if (SCCudaHlGetModuleHandle("SC_CUDA_PACKET_BATCHER") == -1) {
        SCCudaHlRegisterModule("SC_CUDA_PACKET_BATCHER");
    }

    /* the buffer for the packets to be sent over to the gpu.  We allot space for
     * profile->packet_buffer_limit packets, assuming a size of
     * profile->packet_size_limit for each packet */
    SCCudaHlModuleData *data = NULL;
    data = SCCudaHlGetModuleData(SCCudaHlGetModuleHandle("SC_CUDA_PACKET_BATCHER"));
    if (data == NULL) {
        SCLogDebug("Module not registered.  To avail the benefits of this "
                   "registration facility, first register a module using "
                   "context using SCCudaHlRegisterModule(), after which you "
                   "can call this function");
        return NULL;
    }

    if (SCCudaHlGetCudaContext(&data->cuda_context, "mpm", data->handle) == -1) {
        SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Error getting cuda context");
        return NULL;
    }

    if (SCCudaCtxPushCurrent(data->cuda_context) == -1) {
        SCLogError(SC_ERR_CUDA_HANDLER_ERROR,
                   "Error pushing cuda context to allocate memory");
    }

    if (profile->page_locked) {
        if (SCCudaMemHostAlloc((void**)&pb->packets_buffer,
                               profile->packet_buffer_limit *
                               (profile->packet_size_limit +
                                sizeof(SCCudaPBPacketDataForGPUNonPayload)),
                               CU_MEMHOSTALLOC_PORTABLE |
                               CU_MEMHOSTALLOC_WRITECOMBINED) == -1) {
            SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory");
            exit(EXIT_FAILURE);
        }
    } else {
        pb->packets_buffer = malloc(profile->packet_buffer_limit *
                                    (profile->packet_size_limit +
                                     sizeof(SCCudaPBPacketDataForGPUNonPayload)));
        if (pb->packets_buffer == NULL) {
            SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
            exit(EXIT_FAILURE);
        }
    }
    memset(pb->packets_buffer, 0, profile->packet_buffer_limit *
           (profile->packet_size_limit + sizeof(SCCudaPBPacketDataForGPUNonPayload)));

    if (profile->page_locked) {
        /* used to hold the offsets of the buffered packets in the packets_buffer */
        if (SCCudaMemHostAlloc((void**)&pb->packets_offset_buffer,
                               sizeof(uint32_t) * profile->packet_buffer_limit,
                               CU_MEMHOSTALLOC_PORTABLE |
                               CU_MEMHOSTALLOC_WRITECOMBINED) == -1) {
            SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory");
            exit(EXIT_FAILURE);
        }
    } else {
        pb->packets_offset_buffer = malloc(sizeof(uint32_t) *
                                           profile->packet_buffer_limit);
        if (pb->packets_offset_buffer == NULL) {
            SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
            exit(EXIT_FAILURE);
        }
    }
    memset(pb->packets_offset_buffer, 0,
           sizeof(uint32_t) * profile->packet_buffer_limit);

    if (profile->page_locked) {
        /* used to hold the offsets of the packets payload */
        if (SCCudaMemHostAlloc((void**)&pb->packets_payload_offset_buffer,
                               sizeof(uint32_t) * profile->packet_buffer_limit,
                               CU_MEMHOSTALLOC_PORTABLE |
                               CU_MEMHOSTALLOC_WRITECOMBINED) == -1) {
            SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory");
            exit(EXIT_FAILURE);
        }
    } else {
        pb->packets_payload_offset_buffer = malloc(sizeof(uint32_t) *
                                                   profile->packet_buffer_limit);
        if (pb->packets_payload_offset_buffer == NULL) {
            SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
            exit(EXIT_FAILURE);
        }
    }
    memset(pb->packets_payload_offset_buffer, 0,
           sizeof(uint32_t) * profile->packet_buffer_limit);

    SCLogDebug("Allocated pagelocked CUDA memory");
    if (SCCudaCtxPopCurrent(NULL) == -1) {
        SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Could not pop cuda context");
    }

    /* used to hold the packet addresses for all the packets buffered inside
     * packets_buffer */
    pb->packets_address_buffer = malloc(sizeof(Packet *) *
                                        profile->packet_buffer_limit);
    if (pb->packets_address_buffer == NULL) {
        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
        exit(EXIT_FAILURE);
    }
    memset(pb->packets_address_buffer, 0, sizeof(Packet *) *
           profile->packet_buffer_limit);

    return pb;
}

/**
 * \brief Fills in the TMM_CUDA_PACKET_BATCHER entry of the thread module
 *        table with the Cuda Packet Batcher TM's name and callbacks.
 */
void TmModuleCudaPacketBatcherRegister(void)
{
    tmm_modules[TMM_CUDA_PACKET_BATCHER].name = "CudaPacketBatcher";

    /* thread lifecycle callbacks */
    tmm_modules[TMM_CUDA_PACKET_BATCHER].ThreadInit = SCCudaPBThreadInit;
    tmm_modules[TMM_CUDA_PACKET_BATCHER].ThreadDeinit = SCCudaPBThreadDeInit;
    tmm_modules[TMM_CUDA_PACKET_BATCHER].ThreadExitPrintStats = SCCudaPBThreadExitStats;

    /* per packet callback */
    tmm_modules[TMM_CUDA_PACKET_BATCHER].Func = SCCudaPBBatchPackets;

    /* unittests */
    tmm_modules[TMM_CUDA_PACKET_BATCHER].RegisterTests = SCCudaPBRegisterTests;
}

/**
 * \brief The cuda batcher TM init function.  Creates the thread ctx, takes
 *        the first packet buffer off the batcher's data inq, installs the
 *        SIGALRM handler and arms the batching alarm.
 *
 * \param tv       The cuda packet batcher TM ThreadVars instance.
 * \param initdata The initialization data needed by this cuda batcher TM.
 *                 It is stored as the detection engine context.
 * \param data     Pointer to a pointer memory location that would be updated
 *                 with the newly created thread ctx instance.
 *
 * \retval TM_ECODE_OK     On success.
 * \retval TM_ECODE_FAILED On failure, i.e. if initdata is NULL or no packet
 *                         buffer is available to batch into.
 */
TmEcode SCCudaPBThreadInit(ThreadVars *tv, void *initdata, void **data)
{
    SCCudaPBThreadCtx *tctx = NULL;

    if (initdata == NULL) {
        SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument.  initdata NULL "
                   "for the cuda batcher TM init thread function");
        return TM_ECODE_FAILED;
    }

    /* without the data inq there is nowhere to get a packet buffer from */
    if (tmq_inq == NULL) {
        SCLogError(SC_ERR_INVALID_ARGUMENTS, "The cuda batcher queues haven't "
                   "been set up for the cuda batcher TM init thread function");
        return TM_ECODE_FAILED;
    }

    tctx = malloc(sizeof(SCCudaPBThreadCtx));
    if (tctx == NULL) {
        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
        exit(EXIT_FAILURE);
    }
    memset(tctx, 0, sizeof(SCCudaPBThreadCtx));

    /* the detection engine context.  We will need it to retrieve the sgh,
     * when we start receiving and batching the packets */
    tctx->de_ctx = initdata;

    /* the first packet buffer from the queue.  The batching and queuing
     * functions dereference curr_pb unconditionally, so a thread that didn't
     * get a buffer must not be allowed to start */
    tctx->curr_pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(&data_queues[tmq_inq->id]);
    if (tctx->curr_pb == NULL) {
        SCLogError(SC_ERR_INVALID_ARGUMENTS, "No packet buffer available for "
                   "the cuda batcher TM init thread function");
        free(tctx);
        return TM_ECODE_FAILED;
    }

    /* register new module */
    SCCudaHlRegisterModule("SC_CUDA_PACKET_BATCHER");

    *data = tctx;

    /* we will need the cuda packet batcher TM's inq for further use later.  Read
     * the comments associated with this var definition, for its use */
    tmq_batcher_inq = tv->inq;

    /* set the SIGALRM handler */
    SCCudaPBSetBatcherAlarmTimeHandler();

    /* Set the alarm time limit during which the batcher thread would
     * buffer packets */
    SCCudaPBResetBatcherAlarm();

    return TM_ECODE_OK;
}

/**
 * \brief Batches packets into the packets buffer.
 *
 *        A packet is copied into the current buffer, and flagged with
 *        cuda_mpm_enabled, only if it has a payload no bigger than
 *        SC_CUDA_PB_MAX_PAYLOAD_SIZE, belongs to a sgh with an mpm ctx whose
 *        search function is not one of the 1 (or 2) byte ones, and payload
 *        inspection hasn't been disabled for it.  Every other packet is left
 *        untouched, with cuda_mpm_enabled cleared.
 *
 *        The buffer is queued to the dispatcher once the packet threshold is
 *        hit, once the per-packet arrays are full, or once the alarm handler
 *        has set the queue_buffer flag.
 *
 *        NOTE(review): only the no of packets is bounded here.  Nothing
 *        checks that the bytes written fit inside packets_buffer -- confirm
 *        that SC_CUDA_PB_MAX_PAYLOAD_SIZE can't exceed the per packet size
 *        the buffer was allocated with.
 *
 * \param tv      Pointer to the ThreadVars instance, in this case the cuda
 *                packet batcher TM's TV instance.
 * \param p       Pointer to the packet to be buffered.  Can be NULL, in which
 *                case only a pending queue_buffer request is served.
 * \param data    Pointer to the batcher TM thread ctx.
 * \param pq      Pointer to the packetqueue.  We don't need this.
 * \param post_pq Pointer to the post packetqueue.  We don't need this either.
 *
 * \retval TM_ECODE_OK     On success.
 * \retval TM_ECODE_FAILED On failure.  Not returned by the current code.
 */
TmEcode SCCudaPBBatchPackets(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, PacketQueue *post_pq)
{
#define ALIGN_UP(offset, alignment) \
    (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

    SCCudaPBThreadCtx *tctx = data;

    /* ah.  we have been signalled that we crossed the time limit within which we
     * need to buffer packets.  Let us queue the buffer to the GPU */
    if (queue_buffer) {
        SCLogDebug("Cuda packet buffer TIME limit exceeded.  Buffering packet "
                   "buffer and reseting the alarm");
        SCCudaPBQueueBuffer(tctx);
        SCCudaPBResetBatcherAlarm();
    }

    /* this is possible, since we are using a custom slot function that calls this
     * function, even if it receives no packet from the packet queue */
    if (p == NULL) {
        SCLogDebug("packet NULL inside Cuda batcher TM");
        return TM_ECODE_OK;
    }

    /* we reset it for every incoming packet.  It is set again further down,
     * only if we end up buffering the packet */
    p->cuda_mpm_enabled = 0;

    /* packets that are too big are handled by the cpu */
    if (p->payload_len > SC_CUDA_PB_MAX_PAYLOAD_SIZE) {
        SCLogDebug("p->payload_len %"PRIu16" > %d, inspecting on the CPU.",
            p->payload_len, SC_CUDA_PB_MAX_PAYLOAD_SIZE);
        return TM_ECODE_OK;
    }

    /* the packets buffer.  Has to be read after the queuing above, which
     * may have replaced the current buffer */
    SCCudaPBPacketsBuffer *pb = (SCCudaPBPacketsBuffer *)tctx->curr_pb;
    /* the previous packet which has been buffered into the packets_buffer */
    SCCudaPBPacketDataForGPU *prev_buff_packet = NULL;
    /* holds the position in the packets_buffer where the curr packet would
     * be buffered in */
    SCCudaPBPacketDataForGPU *curr_packet = NULL;
    /* the sgh to which the incoming packet belongs */
    SigGroupHead *sgh = NULL;

    if (p->flow != NULL) {
        /* Get the stored sgh from the flow (if any). Make sure we're not using
         * the sgh for icmp error packets part of the same stream. */
        if (p->proto == p->flow->proto) { /* filter out icmp */
            if (p->flowflags & FLOW_PKT_TOSERVER && p->flow->flags & FLOW_SGH_TOSERVER) {
                sgh = p->flow->sgh_toserver;
            } else if (p->flowflags & FLOW_PKT_TOCLIENT && p->flow->flags & FLOW_SGH_TOCLIENT) {
                sgh = p->flow->sgh_toclient;
            }
        }
    }

    if (sgh == NULL) {
        /* get the signature group head to which this packet belongs.  If it belongs
         * to no sgh, we don't need to buffer this packet.
         * \todo Get rid of this, once we get the sgh from the flow */
        sgh = SCCudaPBGetSgh(tctx->de_ctx, p);
        if (sgh == NULL) {
            SCLogDebug("No SigGroupHead match for this packet");
            return TM_ECODE_OK;
        }
    }

    /* if the payload is less than the maximum content length in this sgh we
     * don't need to run the PM on this packet.  Chuck the packet out */
    if (sgh->mpm_content_maxlen > p->payload_len) {
        SCLogDebug("not mpm-inspecting as pkt payload is smaller than "
                   "the largest content length we need to match");
        return TM_ECODE_OK;
    }

    /* if one of these conditions fail we don't have to run the mpm on this
     * packet.  Firstly if the payload_len is == 0, we don't have a payload
     * to match against.  Next if we don't have a mpm_context against this
     * sgh, indicating we don't have any patterns in this sgh, again we don't
     * have anything to run the PM against.  Finally if the flow doesn't want
     * to analyze packets for this flow, we can chuck this packet out as well */
    if ( !(p->payload_len > 0 && sgh->mpm_ctx != NULL &&
           !(p->flags & PKT_NOPAYLOAD_INSPECTION)) ) {
        SCLogDebug("Either p->payload_len <= 0 or mpm_ctx for the packet is NULL "
                   "or PKT_NOPAYLOAD_INSPECTION set for this packet");
        return TM_ECODE_OK;
    }

    /* the cuda b2g context */
    B2gCudaCtx *ctx = sgh->mpm_ctx->ctx;

    /* if we have a 1 byte search kernel set we don't buffer this packet for
     * cuda matching and instead leave the non-cuda mpm function to be run on
     * the packet */
    if (ctx->Search == B2gCudaSearch1) {
        SCLogDebug("The packet has a one byte patterns.  run mpm "
                   "separately");
        return TM_ECODE_OK;
    }

#ifdef B2G_CUDA_SEARCH2
    /* if we have a 2 byte search kernel set we don't buffer this packet for
     * cuda matching and instead leave the non-cuda mpm function to be run on
     * the packet */
    if (ctx->Search == B2gCudaSearch2) {
        SCLogDebug("The packet has two byte patterns.  run mpm "
                   "separately");
        return TM_ECODE_OK;
    }
#endif

    /* we have passed all the criterions for buffering the packet.  Set the
     * flag indicating that the packet goes through cuda mpm */
    p->cuda_mpm_enabled = 1;

    /* first packet to be buffered in */
    if (pb->nop_in_buffer == 0) {
        curr_packet = (SCCudaPBPacketDataForGPU *)pb->packets_buffer;

    /* buffer is not empty.  The new packet goes right behind the previous
     * one, with its start rounded up to a CUdeviceptr boundary */
    } else {
        prev_buff_packet = (SCCudaPBPacketDataForGPU *)(pb->packets_buffer +
                                                        pb->packets_offset_buffer[pb->nop_in_buffer - 1]);
        curr_packet = (SCCudaPBPacketDataForGPU *)((uint8_t *)prev_buff_packet +
                                                   sizeof(SCCudaPBPacketDataForGPUNonPayload) +
                                                   prev_buff_packet->payload_len) ;
        int diff = (int)((uint8_t *)curr_packet - pb->packets_buffer);
        /* \todo Feel it is the wrong option taken by nvidia by setting CUdeviceptr
         * to unsigned int.  Keep this option for now.  We will get back to this
         * once nvidia responds to the filed bug */
        ALIGN_UP(diff, sizeof(CUdeviceptr));
        curr_packet = (SCCudaPBPacketDataForGPU *)(pb->packets_buffer + diff);
    }

    /* store the data in the packets_buffer for this packet, which would be passed
     * over to the GPU for processing */
    curr_packet->m = ctx->m;
    curr_packet->table = ctx->cuda_B2G;
    curr_packet->payload_len = p->payload_len;
    memcpy(curr_packet->payload, p->payload, p->payload_len);

    /* store the address of the packet just buffered at the same index.  The
     * dispatcher thread will need this address to communicate the results back
     * to the packet */
    pb->packets_address_buffer[pb->nop_in_buffer] = p;

    /* if it is the first packet to be buffered, the offset is 0.  If it is not,
     * then take the offset for the buffer from curr_packet */
    if (pb->nop_in_buffer == 0) {
        pb->packets_offset_buffer[pb->nop_in_buffer] = 0;
        pb->packets_payload_offset_buffer[pb->nop_in_buffer] = 0;
    } else {
        pb->packets_offset_buffer[pb->nop_in_buffer] = (uint8_t *)curr_packet - pb->packets_buffer;
        pb->packets_payload_offset_buffer[pb->nop_in_buffer] =
            pb->packets_payload_offset_buffer[pb->nop_in_buffer - 1] +
            prev_buff_packet->payload_len;
    }

    /* indicates the no of packets added so far into the buffer */
    pb->nop_in_buffer++;

    /* we have hit the threshhold for the total no of packets held in the buffer.
     * We will change this in the future, instead relying on the remaining space
     * left in the buffer or we have been informed that we have hit the time limit
     * to queue the buffer.
     *
     * The offset and address arrays hold profile->packet_buffer_limit entries
     * each, so the buffer also has to go out once that many packets are in,
     * whatever the threshold says.  Otherwise a threshold that is unset or
     * bigger than the limit would let the next packet write past them */
    if ( (pb->nop_in_buffer == buffer_packet_threshhold) ||
         (profile != NULL &&
          pb->nop_in_buffer >= (uint32_t)profile->packet_buffer_limit) ||
         queue_buffer) {
        SCLogDebug("Either we have hit the threshold limit for packets(i.e. we "
                   "have %d packets limit) OR we have exceeded the buffering "
                   "time limit.  Buffering the packet buffer and reseting the "
                   "alarm.", buffer_packet_threshhold);
        SCCudaPBQueueBuffer(tctx);
        SCCudaPBResetBatcherAlarm();
    }

    return TM_ECODE_OK;
}

/**
 * \brief Exit stats function for the cuda packet batcher TM.  The batcher
 *        has nothing to print, so this does nothing.
 *
 * \param tv   Pointer to the cuda packet batcher TM ThreadVars instance.
 * \param data Pointer to the Thread ctx for the cuda packet batcher TM.
 */
void SCCudaPBThreadExitStats(ThreadVars *tv, void *data)
{
    /* nothing to report */
    (void)tv;
    (void)data;
}

/**
 * \brief The thread de-init function for the cuda packet batcher TM.  Frees
 *        the packet buffer the thread currently holds (if any), deregisters
 *        the batcher's cuda module and frees the thread ctx.
 *
 * \param tv   Pointer to the cuda packet batcher TM ThreadVars instance.
 * \param data Pointer to the Thread ctx for the cuda packet batcher TM.
 *             May be NULL.
 *
 * \retval TM_ECODE_OK     Always.  Failures of the cuda calls are only logged.
 * \retval TM_ECODE_FAILED On failure.  Although we won't be returning this here.
 */
TmEcode SCCudaPBThreadDeInit(ThreadVars *tv, void *data)
{
    SCCudaPBThreadCtx *ctx = data;

    /* no thread ctx, nothing to tear down */
    if (ctx == NULL)
        return TM_ECODE_OK;

    if (ctx->curr_pb != NULL) {
        /* the buffer is released with the module's cuda context pushed */
        if (SCCudaHlPushCudaContextFromModule("SC_CUDA_PACKET_BATCHER") == -1){
            SCLogError(SC_ERR_CUDA_HANDLER_ERROR,
                       "Failed to push cuda context from module");
        }

        SCCudaPBDeAllocSCCudaPBPacketsBuffer(ctx->curr_pb);
        ctx->curr_pb = NULL;

        if (SCCudaCtxPopCurrent(NULL) == -1){
            SCLogError(SC_ERR_CUDA_ERROR, "Failed to pop cuda context");
        }

        if (SCCudaHlDeRegisterModule("SC_CUDA_PACKET_BATCHER") == -1){
            SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Failed to deregister module");
        }
    }

    free(ctx);

    return TM_ECODE_OK;
}

/**
 * \brief Sets up the queues and buffers needed by the cuda batcher TM function.
 */
void SCCudaPBSetUpQueuesAndBuffers(void)
{
    /* the b2g dispatcher thread would have to use the reverse for incoming
     * and outgoing queues */
    char *inq_name = "cuda_batcher_mpm_inqueue";
    char *outq_name = "cuda_batcher_mpm_outqueue";
    int i = 0;

    /* set the incoming queue for the cuda_packet_batcher TM and the cuda B2g
     * dispatcher */
    tmq_inq = TmqGetQueueByName(inq_name);
    if (tmq_inq == NULL) {
        tmq_inq = TmqCreateQueue(inq_name);
        if (tmq_inq == NULL) {
            return;
        }
    }
    tmq_inq->reader_cnt++;
    tmq_inq->writer_cnt++;

    /* set the outgoing queue from the cuda_packet_batcher TM and the cuda B2g
     * dispatcher */
    tmq_outq = TmqGetQueueByName(outq_name);
    if (tmq_outq == NULL) {
        tmq_outq = TmqCreateQueue(outq_name);
        if (tmq_outq == NULL) {
            return;
        }
    }
    tmq_outq->reader_cnt++;
    tmq_outq->writer_cnt++;

    /* Register a new module to be used by the packet batcher to allocate
     * page-locked memory */
    SCCudaHlRegisterModule("SC_CUDA_PACKET_BATCHER");

    profile = SCCudaHlGetProfile("mpm");

    /* allocate the packet buffer */
    /* \todo need to work out the right no of packet buffers that we need to
     * queue.  I doubt we will need more than 4(as long as we don't run it on
     * low traffic line).  We don't want to get into the business of creating
     * new ones, when we run out of buffers, since malloc for a huge chunk
     * like this will take time.  We need to figure out a value based on
     * various other parameters like alarm time and buffer threshold value */
    for (i = 0; i < profile->packet_buffers; i++) {
        if (profile->page_locked) {
            SCLogDebug("Allocating \"%d\" page_locked cuda packet buffers",
                       profile->packet_buffers);
        } else {
            SCLogDebug("Allocating \"%d\" non-page_locked cuda packet buffers",
                       profile->packet_buffers);
        }

        SCCudaPBPacketsBuffer *pb = SCCudaPBAllocSCCudaPBPacketsBuffer();
        /* dump the buffer into the inqueue for this batcher TM.  the batcher
         * thread would be the first consumer for these buffers */
        SCDQDataEnqueue(&data_queues[tmq_inq->id], (SCDQGenericQData *)pb);
    }

    /* \todo This needs to be changed ASAP.  This can't exceed max_pending_packets.
     * Also we need to make this user configurable and allow dynamic updaes
     * based on live traffic */
    buffer_packet_threshhold = profile->packet_buffer_limit;

    return;
}

/**
 * \brief Releases one of the host buffers hanging off a packets buffer.
 *
 *        The de-allocator is picked based on profile->page_locked, so that the
 *        buffer is released the same way irrespective of the queue it was
 *        found in.
 *
 * \param buffer      The buffer to be released.  Can be NULL.
 * \param buffer_name Name of the buffer, used in the error message.
 */
static void SCCudaPBCleanUpHostBuffer(void *buffer, const char *buffer_name)
{
    if (buffer == NULL)
        return;

    if (profile->page_locked) {
        if (SCCudaMemFreeHost(buffer) == -1) {
            SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
                       "%s", buffer_name);
        }
    } else {
        free(buffer);
    }

    return;
}

/**
 * \brief Dequeues and frees every packets buffer present in the data queue
 *        sent as the argument.  The queue mutex is held while draining and
 *        the queue condition is signalled once done.
 *
 * \param dq Pointer to the data queue to be drained.
 */
static void SCCudaPBCleanUpDataQueue(SCDQDataQueue *dq)
{
    SCCudaPBPacketsBuffer *pb = NULL;

    SCMutexLock(&dq->mutex_q);
    while ( (pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq)) != NULL) {
        SCCudaPBCleanUpHostBuffer(pb->packets_buffer, "packets_buffer");
        SCCudaPBCleanUpHostBuffer(pb->packets_offset_buffer,
                                  "packets_offset_buffer");
        SCCudaPBCleanUpHostBuffer(pb->packets_payload_offset_buffer,
                                  "packets_payload_offset_buffer");

        free(pb);
    }
    SCMutexUnlock(&dq->mutex_q);
    SCCondSignal(&dq->cond_q);

    return;
}

/**
 * \brief Clean up all the buffers queued in the batcher inq and outq.
 *
 *        The buffers are released with the "SC_CUDA_PACKET_BATCHER" module's
 *        cuda context pushed.  A failure to push or pop the context is
 *        logged, but the cleanup is still carried out.  Both the queues are
 *        cleaned the same way, i.e. page-locked buffers are released through
 *        SCCudaMemFreeHost() and the rest through free(), as indicated by
 *        profile->page_locked.
 *
 *        Does nothing apart from logging an error, if the queues haven't
 *        been set up.
 */
void SCCudaPBCleanUpQueuesAndBuffers(void)
{
    if (tmq_inq == NULL || tmq_outq == NULL) {
        SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid arguments.  tmq_inq or "
                   "tmq_outq NULL");
        return;
    }
    if (SCCudaHlPushCudaContextFromModule("SC_CUDA_PACKET_BATCHER") == -1){
        SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Could not push cuda context from module");
    }

    /* clean all the buffers present in the inq */
    SCCudaPBCleanUpDataQueue(&data_queues[tmq_inq->id]);

    /* clean all the buffers present in the outq */
    SCCudaPBCleanUpDataQueue(&data_queues[tmq_outq->id]);

    if (SCCudaCtxPopCurrent(NULL) == -1){
        SCLogError(SC_ERR_CUDA_ERROR, "Could not pop cuda context");
    }

    return;
}

/**
 * \brief Overrides the packet threshhold limit for the packets buffer, i.e.
 *        the no of packets in a buffer at which the batcher queues the buffer.
 *
 * \param threshhold_override The new threshhold limit for the packets_buffer.
 */
void SCCudaPBSetBufferPacketThreshhold(uint32_t threshhold_override)
{
    /* plain overwrite; the value sent is not validated here */
    buffer_packet_threshhold = threshhold_override;
}

/**
 * \brief Function used to set the profile for cuda packet batcher.  Used
 *        for unittests alone.
 *
 * \param profile_name Name of the profile.  NOTE: the argument is currently
 *                     ignored and the "mpm" profile is always the one that
 *                     is loaded.
 */
void SCCudaPBSetProfile(char *profile_name)
{
    /* the profile name is hard-coded; profile_name is not consulted */
    profile = SCCudaHlGetProfile("mpm");
}

/**
 * \brief Used to inform the cuda packet batcher that packet batching shouldn't
 *        be done anymore and set the flag to indicate this.  We also need to
 *        signal the cuda batcher data inq, in case it is waiting on the inq
 *        for a new free packet buffer.
 *
 *        If the inq hasn't been set up(tmq_inq is NULL), only the flag is
 *        reset, since there is no queue to signal.
 */
void SCCudaPBKillBatchingPackets(void)
{
    SCDQDataQueue *dq = NULL;

    run_batcher = 0;

    /* the queue set up can bail out before tmq_inq gets assigned */
    if (tmq_inq == NULL)
        return;

    dq = &data_queues[tmq_inq->id];
    SCCondSignal(&dq->cond_q);

    return;
}

/**
 * \brief Sets the unittest_mode flag for the cuda packet batcher.
 *
 * \param status The value to be stored in unittest_mode.  The unittests in
 *               this file send 1.
 */
void SCCudaPBRunningTests(int status)
{
    unittest_mode = status;

    return;
}

/***********************************Unittests**********************************/

#ifdef UNITTESTS

int SCCudaPBTest01(void)
{
#define ALIGN_UP(offset, alignment) \
    (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

    uint8_t raw_eth[] = {
        0x00, 0x25, 0x00, 0x9e, 0xfa, 0xfe, 0x00, 0x02,
        0xcf, 0x74, 0xfe, 0xe1, 0x08, 0x00, 0x45, 0x00,
        0x01, 0xcc, 0xcb, 0x91, 0x00, 0x00, 0x34, 0x06,
        0xdf, 0xa8, 0xd1, 0x55, 0xe3, 0x67, 0xc0, 0xa8,
        0x64, 0x8c, 0x00, 0x50, 0xc0, 0xb7, 0xd1, 0x11,
        0xed, 0x63, 0x81, 0xa9, 0x9a, 0x05, 0x80, 0x18,
        0x00, 0x75, 0x0a, 0xdd, 0x00, 0x00, 0x01, 0x01,
        0x08, 0x0a, 0x09, 0x8a, 0x06, 0xd0, 0x12, 0x21,
        0x2a, 0x3b, 0x48, 0x54, 0x54, 0x50, 0x2f, 0x31,
        0x2e, 0x31, 0x20, 0x33, 0x30, 0x32, 0x20, 0x46,
        0x6f, 0x75, 0x6e, 0x64, 0x0d, 0x0a, 0x4c, 0x6f,
        0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20,
        0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77,
        0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c,
        0x65, 0x2e, 0x65, 0x73, 0x2f, 0x0d, 0x0a, 0x43,
        0x61, 0x63, 0x68, 0x65, 0x2d, 0x43, 0x6f, 0x6e,
        0x74, 0x72, 0x6f, 0x6c, 0x3a, 0x20, 0x70, 0x72,
        0x69, 0x76, 0x61, 0x74, 0x65, 0x0d, 0x0a, 0x43,
        0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54,
        0x79, 0x70, 0x65, 0x3a, 0x20, 0x74, 0x65, 0x78,
        0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x20,
        0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d,
        0x55, 0x54, 0x46, 0x2d, 0x38, 0x0d, 0x0a, 0x44,
        0x61, 0x74, 0x65, 0x3a, 0x20, 0x4d, 0x6f, 0x6e,
        0x2c, 0x20, 0x31, 0x34, 0x20, 0x53, 0x65, 0x70,
        0x20, 0x32, 0x30, 0x30, 0x39, 0x20, 0x30, 0x38,
        0x3a, 0x34, 0x38, 0x3a, 0x33, 0x31, 0x20, 0x47,
        0x4d, 0x54, 0x0d, 0x0a, 0x53, 0x65, 0x72, 0x76,
        0x65, 0x72, 0x3a, 0x20, 0x67, 0x77, 0x73, 0x0d,
        0x0a, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
        0x2d, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3a,
        0x20, 0x32, 0x31, 0x38, 0x0d, 0x0a, 0x0d, 0x0a,
        0x3c, 0x48, 0x54, 0x4d, 0x4c, 0x3e, 0x3c, 0x48,
        0x45, 0x41, 0x44, 0x3e, 0x3c, 0x6d, 0x65, 0x74,
        0x61, 0x20, 0x68, 0x74, 0x74, 0x70, 0x2d, 0x65,
        0x71, 0x75, 0x69, 0x76, 0x3d, 0x22, 0x63, 0x6f,
        0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x74, 0x79,
        0x70, 0x65, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74,
        0x65, 0x6e, 0x74, 0x3d, 0x22, 0x74, 0x65, 0x78,
        0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x63,
        0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x75,
        0x74, 0x66, 0x2d, 0x38, 0x22, 0x3e, 0x0a, 0x3c,
        0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x33, 0x30,
        0x32, 0x20, 0x4d, 0x6f, 0x76, 0x65, 0x64, 0x3c,
        0x2f, 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x3c,
        0x2f, 0x48, 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x42,
        0x4f, 0x44, 0x59, 0x3e, 0x0a, 0x3c, 0x48, 0x31,
        0x3e, 0x33, 0x30, 0x32, 0x20, 0x4d, 0x6f, 0x76,
        0x65, 0x64, 0x3c, 0x2f, 0x48, 0x31, 0x3e, 0x0a,
        0x54, 0x68, 0x65, 0x20, 0x64, 0x6f, 0x63, 0x75,
        0x6d, 0x65, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x73,
        0x20, 0x6d, 0x6f, 0x76, 0x65, 0x64, 0x0a, 0x3c,
        0x41, 0x20, 0x48, 0x52, 0x45, 0x46, 0x3d, 0x22,
        0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77,
        0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c,
        0x65, 0x2e, 0x65, 0x73, 0x2f, 0x22, 0x3e, 0x68,
        0x65, 0x72, 0x65, 0x3c, 0x2f, 0x41, 0x3e, 0x2e,
        0x0d, 0x0a, 0x3c, 0x2f, 0x42, 0x4f, 0x44, 0x59,
        0x3e, 0x3c, 0x2f, 0x48, 0x54, 0x4d, 0x4c, 0x3e,
        0x0d, 0x0a };

    int result = 0;
    SCCudaPBThreadCtx *tctx = NULL;

    Packet *p = SCMalloc(SIZE_OF_PACKET);
    if (p == NULL)
    return 0;
    DecodeThreadVars dtv;
    ThreadVars tv;
    ThreadVars tv_cuda_PB;
    DetectEngineCtx *de_ctx = NULL;

    SCCudaPBPacketsBuffer *pb = NULL;
    SCCudaPBPacketDataForGPU *buff_packet = NULL;
    SCDQDataQueue *dq = NULL;

    uint32_t i = 0;

    char *strings[] = {"test_one",
                       "test_two",
                       "test_three",
                       "test_four",
                       "test_five",
                       "test_six",
                       "test_seven",
                       "test_eight",
                       "test_nine",
                       "test_ten"};

    uint32_t packets_payload_offset_buffer[sizeof(strings)/sizeof(char *)];
    memset(packets_payload_offset_buffer, 0, sizeof(packets_payload_offset_buffer));
    uint32_t packets_offset_buffer[sizeof(strings)/sizeof(char *)];
    memset(packets_offset_buffer, 0, sizeof(packets_offset_buffer));

    uint32_t packets_total_payload_len = 0;
    uint32_t packets_buffer_len = 0;

    for (i = 0; i < sizeof(strings)/sizeof(char *); i++) {
        packets_total_payload_len += strlen(strings[i]);
    }

    for (i = 1; i < sizeof(strings)/sizeof(char *); i++) {
        packets_payload_offset_buffer[i] = packets_payload_offset_buffer[i - 1] + strlen(strings[i - 1]);
        packets_offset_buffer[i] = packets_offset_buffer[i - 1] +
            sizeof(SCCudaPBPacketDataForGPUNonPayload) + strlen(strings[i - 1]);
        ALIGN_UP(packets_offset_buffer[i], sizeof(CUdeviceptr));
    }
    packets_buffer_len += packets_offset_buffer[(sizeof(strings)/sizeof(char *)) - 1] +
        sizeof(SCCudaPBPacketDataForGPUNonPayload) + strlen(strings[(sizeof(strings)/sizeof(char *)) - 1]);

    memset(p, 0, SIZE_OF_PACKET);
    p->pkt = (uint8_t *)(p + 1);
    memset(&dtv, 0, sizeof(DecodeThreadVars));
    memset(&tv, 0, sizeof(ThreadVars));
    memset(&tv_cuda_PB, 0, sizeof(ThreadVars));

    FlowInitConfig(FLOW_QUIET);
    DecodeEthernet(&tv, &dtv, p, raw_eth, sizeof(raw_eth), NULL);

    de_ctx = DetectEngineCtxInit();
    if (de_ctx == NULL) {
        goto end;
    }

    de_ctx->mpm_matcher = MPM_B2G_CUDA;
    de_ctx->flags |= DE_QUIET;

    de_ctx->sig_list = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
                               "content:test; sid:1;)");
    if (de_ctx->sig_list == NULL) {
        printf("signature parsing failed\n");
        goto end;
    }
    SigGroupBuild(de_ctx);

    result = 1;

    SCCudaPBSetUpQueuesAndBuffers();
    dq = &data_queues[tmq_outq->id];
    result &= (dq->len == 0);
    dq = &data_queues[tmq_inq->id];
    result &= (dq->len == 10);
    SCCudaPBRunningTests(1);
    SCCudaPBThreadInit(&tv_cuda_PB, de_ctx, (void *)&tctx);
    SCCudaPBSetBufferPacketThreshhold(sizeof(strings)/sizeof(char *));

    p->payload = (uint8_t *)strings[0];
    p->payload_len = strlen(strings[0]);
    SCCudaPBBatchPackets(NULL, p, tctx, NULL, NULL);
    dq = &data_queues[tmq_outq->id];
    result &= (dq->len == 0);
    dq = &data_queues[tmq_inq->id];
    result &= (dq->len == 9);

    p->payload = (uint8_t *)strings[1];
    p->payload_len = strlen(strings[1]);
    SCCudaPBBatchPackets(NULL, p, tctx, NULL, NULL);
    dq = &data_queues[tmq_outq->id];
    result &= (dq->len == 0);
    dq = &data_queues[tmq_inq->id];
    result &= (dq->len == 9);

    p->payload = (uint8_t *)strings[2];
    p->payload_len = strlen(strings[2]);
    SCCudaPBBatchPackets(NULL, p, tctx, NULL, NULL);
    dq = &data_queues[tmq_outq->id];
    result &= (dq->len == 0);
    dq = &data_queues[tmq_inq->id];
    result &= (dq->len == 9);

    p->payload = (uint8_t *)strings[3];
    p->payload_len = strlen(strings[3]);
    SCCudaPBBatchPackets(NULL, p, tctx, NULL, NULL);
    dq = &data_queues[tmq_outq->id];
    result &= (dq->len == 0);
    dq = &data_queues[tmq_inq->id];
    result &= (dq->len == 9);

    p->payload = (uint8_t *)strings[4];
    p->payload_len = strlen(strings[4]);
    SCCudaPBBatchPackets(NULL, p, tctx, NULL, NULL);
    dq = &data_queues[tmq_outq->id];
    result &= (dq->len == 0);
    dq = &data_queues[tmq_inq->id];
    result &= (dq->len == 9);

    p->payload = (uint8_t *)strings[5];
    p->payload_len = strlen(strings[5]);
    SCCudaPBBatchPackets(NULL, p, tctx, NULL, NULL);
    dq = &data_queues[tmq_outq->id];
    result &= (dq->len == 0);
    dq = &data_queues[tmq_inq->id];
    result &= (dq->len == 9);

    p->payload = (uint8_t *)strings[6];
    p->payload_len = strlen(strings[6]);
    SCCudaPBBatchPackets(NULL, p, tctx, NULL, NULL);
    dq = &data_queues[tmq_outq->id];
    result &= (dq->len == 0);
    dq = &data_queues[tmq_inq->id];
    result &= (dq->len == 9);

    p->payload = (uint8_t *)strings[7];
    p->payload_len = strlen(strings[7]);
    SCCudaPBBatchPackets(NULL, p, tctx, NULL, NULL);
    dq = &data_queues[tmq_outq->id];
    result &= (dq->len == 0);
    dq = &data_queues[tmq_inq->id];
    result &= (dq->len == 9);

    p->payload = (uint8_t *)strings[8];
    p->payload_len = strlen(strings[8]);
    SCCudaPBBatchPackets(NULL, p, tctx, NULL, NULL);
    dq = &data_queues[tmq_outq->id];
    result &= (dq->len == 0);
    dq = &data_queues[tmq_inq->id];
    result &= (dq->len == 9);

    p->payload = (uint8_t *)strings[9];
    p->payload_len = strlen(strings[9]);
    SCCudaPBBatchPackets(NULL, p, tctx, NULL, NULL);
    dq = &data_queues[tmq_outq->id];
    result &= (dq->len == 1);
    dq = &data_queues[tmq_inq->id];
    result &= (dq->len == 8);

    dq = &data_queues[tmq_outq->id];
    pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq);
    if (pb == NULL) {
        result = 0;
        goto end;
    }
    result &= (dq->len == 0);
    result &= (pb->nop_in_buffer == 10);
    if (result == 0)
        goto end;

    for (i = 0; i < pb->nop_in_buffer; i++) {
        buff_packet = (SCCudaPBPacketDataForGPU *)(pb->packets_buffer + pb->packets_offset_buffer[i]);
        result &= (strlen(strings[i]) == buff_packet->payload_len);
        result &= (memcmp(strings[i], buff_packet->payload, buff_packet->payload_len) == 0);
        if (result == 0)
            goto end;
        result &= (packets_payload_offset_buffer[i] == pb->packets_payload_offset_buffer[i]);
        result &= (packets_offset_buffer[i] == pb->packets_offset_buffer[i]);
    }
    result &= (packets_total_payload_len == pb->packets_total_payload_len);
    result &= (packets_buffer_len == pb->packets_buffer_len);

 end:
    SCCudaPBCleanUpQueuesAndBuffers();
    if (de_ctx) {
        SigGroupCleanup(de_ctx);
        SigCleanSignatures(de_ctx);
        DetectEngineCtxFree(de_ctx);
    }

    SCCudaPBThreadDeInit(NULL, tctx);
    SCFree(p);
    return result;
}

/**
 * \test Batches a single payload against a signature with source port 5555
 *       and checks that the packet isn't added to the current packet
 *       buffer(nop_in_buffer stays 0) and that nothing is queued to the outq.
 *       Presumably the packet is skipped because it doesn't match the port
 *       of the signature; the test only asserts the resulting counters.
 *
 *       The queue lengths asserted(10 and 9) assume that the "mpm" profile
 *       supplies 10 packet buffers.
 *
 * \retval 1 On success.
 * \retval 0 On failure.
 */
int SCCudaPBTest02(void)
{
    uint8_t raw_eth[] = {
        0x00, 0x25, 0x00, 0x9e, 0xfa, 0xfe, 0x00, 0x02,
        0xcf, 0x74, 0xfe, 0xe1, 0x08, 0x00, 0x45, 0x00,
        0x01, 0xcc, 0xcb, 0x91, 0x00, 0x00, 0x34, 0x06,
        0xdf, 0xa8, 0xd1, 0x55, 0xe3, 0x67, 0xc0, 0xa8,
        0x64, 0x8c, 0x00, 0x50, 0xc0, 0xb7, 0xd1, 0x11,
        0xed, 0x63, 0x81, 0xa9, 0x9a, 0x05, 0x80, 0x18,
        0x00, 0x75, 0x0a, 0xdd, 0x00, 0x00, 0x01, 0x01,
        0x08, 0x0a, 0x09, 0x8a, 0x06, 0xd0, 0x12, 0x21,
        0x2a, 0x3b, 0x48, 0x54, 0x54, 0x50, 0x2f, 0x31,
        0x2e, 0x31, 0x20, 0x33, 0x30, 0x32, 0x20, 0x46,
        0x6f, 0x75, 0x6e, 0x64, 0x0d, 0x0a, 0x4c, 0x6f,
        0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20,
        0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77,
        0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c,
        0x65, 0x2e, 0x65, 0x73, 0x2f, 0x0d, 0x0a, 0x43,
        0x61, 0x63, 0x68, 0x65, 0x2d, 0x43, 0x6f, 0x6e,
        0x74, 0x72, 0x6f, 0x6c, 0x3a, 0x20, 0x70, 0x72,
        0x69, 0x76, 0x61, 0x74, 0x65, 0x0d, 0x0a, 0x43,
        0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54,
        0x79, 0x70, 0x65, 0x3a, 0x20, 0x74, 0x65, 0x78,
        0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x20,
        0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d,
        0x55, 0x54, 0x46, 0x2d, 0x38, 0x0d, 0x0a, 0x44,
        0x61, 0x74, 0x65, 0x3a, 0x20, 0x4d, 0x6f, 0x6e,
        0x2c, 0x20, 0x31, 0x34, 0x20, 0x53, 0x65, 0x70,
        0x20, 0x32, 0x30, 0x30, 0x39, 0x20, 0x30, 0x38,
        0x3a, 0x34, 0x38, 0x3a, 0x33, 0x31, 0x20, 0x47,
        0x4d, 0x54, 0x0d, 0x0a, 0x53, 0x65, 0x72, 0x76,
        0x65, 0x72, 0x3a, 0x20, 0x67, 0x77, 0x73, 0x0d,
        0x0a, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
        0x2d, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3a,
        0x20, 0x32, 0x31, 0x38, 0x0d, 0x0a, 0x0d, 0x0a,
        0x3c, 0x48, 0x54, 0x4d, 0x4c, 0x3e, 0x3c, 0x48,
        0x45, 0x41, 0x44, 0x3e, 0x3c, 0x6d, 0x65, 0x74,
        0x61, 0x20, 0x68, 0x74, 0x74, 0x70, 0x2d, 0x65,
        0x71, 0x75, 0x69, 0x76, 0x3d, 0x22, 0x63, 0x6f,
        0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x74, 0x79,
        0x70, 0x65, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74,
        0x65, 0x6e, 0x74, 0x3d, 0x22, 0x74, 0x65, 0x78,
        0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x63,
        0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x75,
        0x74, 0x66, 0x2d, 0x38, 0x22, 0x3e, 0x0a, 0x3c,
        0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x33, 0x30,
        0x32, 0x20, 0x4d, 0x6f, 0x76, 0x65, 0x64, 0x3c,
        0x2f, 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x3c,
        0x2f, 0x48, 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x42,
        0x4f, 0x44, 0x59, 0x3e, 0x0a, 0x3c, 0x48, 0x31,
        0x3e, 0x33, 0x30, 0x32, 0x20, 0x4d, 0x6f, 0x76,
        0x65, 0x64, 0x3c, 0x2f, 0x48, 0x31, 0x3e, 0x0a,
        0x54, 0x68, 0x65, 0x20, 0x64, 0x6f, 0x63, 0x75,
        0x6d, 0x65, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x73,
        0x20, 0x6d, 0x6f, 0x76, 0x65, 0x64, 0x0a, 0x3c,
        0x41, 0x20, 0x48, 0x52, 0x45, 0x46, 0x3d, 0x22,
        0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77,
        0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c,
        0x65, 0x2e, 0x65, 0x73, 0x2f, 0x22, 0x3e, 0x68,
        0x65, 0x72, 0x65, 0x3c, 0x2f, 0x41, 0x3e, 0x2e,
        0x0d, 0x0a, 0x3c, 0x2f, 0x42, 0x4f, 0x44, 0x59,
        0x3e, 0x3c, 0x2f, 0x48, 0x54, 0x4d, 0x4c, 0x3e,
        0x0d, 0x0a };

    int result = 0;
    const char *string = NULL;
    SCCudaPBThreadCtx *tctx = NULL;

    Packet *p = SCMalloc(SIZE_OF_PACKET);
    if (p == NULL)
        return 0;
    DecodeThreadVars dtv;
    ThreadVars tv;
    ThreadVars tv_cuda_PB;
    DetectEngineCtx *de_ctx = NULL;

    SCCudaPBPacketsBuffer *pb = NULL;
    SCDQDataQueue *dq = NULL;


    memset(p, 0, SIZE_OF_PACKET);
    p->pkt = (uint8_t *)(p + 1);
    memset(&dtv, 0, sizeof(DecodeThreadVars));
    memset(&tv, 0, sizeof(ThreadVars));
    memset(&tv_cuda_PB, 0, sizeof(ThreadVars));

    FlowInitConfig(FLOW_QUIET);
    DecodeEthernet(&tv, &dtv, p, raw_eth, sizeof(raw_eth), NULL);

    de_ctx = DetectEngineCtxInit();
    if (de_ctx == NULL) {
        goto end;
    }

    de_ctx->mpm_matcher = MPM_B2G_CUDA;
    de_ctx->flags |= DE_QUIET;

    de_ctx->sig_list = SigInit(de_ctx, "alert tcp any 5555 -> any any (msg:\"Bamboo\"; "
                               "content:test; sid:1;)");
    if (de_ctx->sig_list == NULL) {
        printf("signature parsing failed\n");
        goto end;
    }
    SigGroupBuild(de_ctx);

    /* has to be set before the first check.  With result still at 0, the
     * "&=" checks on the initial queue lengths can never fail the test */
    result = 1;

    SCCudaPBSetUpQueuesAndBuffers();
    dq = &data_queues[tmq_outq->id];
    result &= (dq->len == 0);
    dq = &data_queues[tmq_inq->id];
    result &= (dq->len == 10);
    SCCudaPBRunningTests(1);
    SCCudaPBThreadInit(&tv_cuda_PB, de_ctx, (void *)&tctx);
    if (tctx == NULL) {
        /* can't batch without a thread ctx */
        printf("cuda packet batcher thread init failed\n");
        result = 0;
        goto end;
    }

    string = "test_one";
    p->payload = (uint8_t *)string;
    p->payload_len = strlen(string);
    SCCudaPBBatchPackets(NULL, p, tctx, NULL, NULL);
    dq = &data_queues[tmq_outq->id];
    result &= (dq->len == 0);
    dq = &data_queues[tmq_inq->id];
    result &= (dq->len == 9);

    pb = tctx->curr_pb;
    if (pb == NULL) {
        result = 0;
        goto end;
    }
    result &= (pb->nop_in_buffer == 0);

 end:
    SCCudaPBCleanUpQueuesAndBuffers();
    if (de_ctx) {
        SigGroupCleanup(de_ctx);
        SigCleanSignatures(de_ctx);
        DetectEngineCtxFree(de_ctx);
    }

    SCCudaPBThreadDeInit(NULL, tctx);
    SCFree(p);
    return result;
}

#endif /* UNITTESTS */

/**
 * \brief Registers the unittests for the cuda packet batcher.  Nothing is
 *        registered unless UNITTESTS is defined.
 */
void SCCudaPBRegisterTests(void)
{
#ifdef UNITTESTS
    /* the tests, in the order in which they are registered */
    static const struct {
        char *name;
        int (*test_fn)(void);
    } pb_tests[] = {
        { "SCCudaPBTest01", SCCudaPBTest01 },
        { "SCCudaPBTest02", SCCudaPBTest02 },
    };
    size_t t = 0;

    for (t = 0; t < sizeof(pb_tests) / sizeof(pb_tests[0]); t++) {
        UtRegisterTest(pb_tests[t].name, pb_tests[t].test_fn, 1);
    }
#endif
}

#endif /* __SC_CUDA_SUPPORT__ */