CUDA: Update the inspection engine to inform the cuda module that it

doesn't need the gpu results and to release the packet for the next run. Previously the inspection engine wouldn't inform the cuda module if it didn't need the results. As a consequence, when the packet was next taken for re-use while still being processed by the cuda module, the engine would wait until the cuda module freed the packet. This commit updates this functionality to inform the cuda module to release the packet in the aforementioned case.
11 years ago · b334b8a6e9
parent 60c46170b0
commit b334b8a6e9
3 changed files with 25 additions and 1 deletions
--- a/src/detect.c
+++ b/src/detect.c
@ -1572,6 +1572,10 @@ next:
    PACKET_PROFILING_DETECT_END(p, PROF_DETECT_RULES);

 end:
+#ifdef __SC_CUDA_SUPPORT__
+    CudaReleasePacket(p);
+#endif
+
    /* see if we need to increment the inspect_id and reset the de_state */
    if (has_state && AppLayerParserProtocolSupportsTxs(p->proto, alproto)) {
        PACKET_PROFILING_DETECT_START(p, PROF_DETECT_STATEFUL);
--- a/src/util-mpm-ac.c
+++ b/src/util-mpm-ac.c
@ -1691,6 +1691,18 @@ void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx)

 }

+/**
+ * \brief Inform the cuda module that the gpu results for this packet are
+ *        not needed, so the packet can be released for the next run.
+ *
+ *        If cuda mpm is flagged as enabled on the packet, the flag is
+ *        cleared and cuda_done is reset to 0.  Both stores are made while
+ *        holding the packet's cuda_mutex, because SCACCudaDispatcher()
+ *        inspects cuda_mpm_enabled under that same mutex; writing the flag
+ *        outside the lock would be an unsynchronized cross-thread access.
+ *
+ *        NOTE(review): the unlocked read in the condition assumes only the
+ *        thread that owns the packet ever writes cuda_mpm_enabled - confirm
+ *        against the code that sets the flag.
+ *
+ * \param p Packet whose cuda state should be released.
+ */
+void CudaReleasePacket(Packet *p)
+{
+    if (p->cuda_pkt_vars.cuda_mpm_enabled == 1) {
+        SCMutexLock(&p->cuda_pkt_vars.cuda_mutex);
+        /* publish "results not wanted" under the lock the dispatcher holds
+         * when it checks this flag */
+        p->cuda_pkt_vars.cuda_mpm_enabled = 0;
+        p->cuda_pkt_vars.cuda_done = 0;
+        SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex);
+    }
+
+    return;
+}
+
 /* \todos
 * - Use texture memory - Can we fit all the arrays into a 3d texture.
 *   Texture memory definitely offers slightly better performance even
@ -1890,6 +1902,13 @@ static void *SCACCudaDispatcher(void *arg)
        for (uint32_t i = 0; i < no_of_items; i++, i_op_start_offset++) {
            Packet *p = (Packet *)cb_data->p_buffer[i_op_start_offset];

+            SCMutexLock(&p->cuda_pkt_vars.cuda_mutex);
+            if (p->cuda_pkt_vars.cuda_mpm_enabled == 0) {
+                p->cuda_pkt_vars.cuda_done = 0;
+                SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex);
+                continue;
+            }
+
            p->cuda_pkt_vars.cuda_gpu_matches =
                cuda_results_buffer_h[((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2)];
            if (p->cuda_pkt_vars.cuda_gpu_matches != 0) {
@ -1900,7 +1919,6 @@ static void *SCACCudaDispatcher(void *arg)
                                                d_buffer_start_offset) * 2)] * sizeof(uint32_t)) + 4);
            }

-            SCMutexLock(&p->cuda_pkt_vars.cuda_mutex);
            p->cuda_pkt_vars.cuda_done = 1;
            SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex);
            SCCondSignal(&p->cuda_pkt_vars.cuda_cond);
--- a/src/util-mpm-ac.h
+++ b/src/util-mpm-ac.h
@ -205,6 +205,8 @@ uint32_t  SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx,
                                          PatternMatcherQueue *pmq);
 void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx);

+void CudaReleasePacket(Packet *p);
+
 #endif /* __SC_CUDA_SUPPORT__ */