From b963890de16f4f27604e9b59d5a842349df89fc4 Mon Sep 17 00:00:00 2001
From: Victor Julien
Date: Wed, 2 Feb 2011 10:57:03 +0100
Subject: [PATCH] Reenable SSE3 memcmp and switch AC memcmp to use the SCMemcmp wrapper.

The SSE3 routines only issue 16 byte loads for full 16 byte chunks. A
shorter tail is compared bytewise so that nothing beyond len is read.

---
 src/util-memcmp.h | 211 +++++++++++++++++++++++++---------------------
 src/util-mpm-ac.c |   5 +-
 2 files changed, 119 insertions(+), 97 deletions(-)

diff --git a/src/util-memcmp.h b/src/util-memcmp.h
index 37078e4d19..ba44e078cd 100644
--- a/src/util-memcmp.h
+++ b/src/util-memcmp.h
@@ -206,101 +206,122 @@ static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len) {
 
 
 
-//#elif defined(__SSE3__)
-//
-//#include <pmmintrin.h> /* for SSE3 */
-//
-//#define SCMEMCMP_BYTES 16
-//
-//static inline int SCMemcmp(void *s1, void *s2, size_t len) {
-//    size_t offset = 0;
-//    __m128i b1, b2, c;
-//
-//    do {
-//        /* do unaligned loads using _mm_loadu_si128. On my Core2 E6600 using
-//         * _mm_lddqu_si128 was about 2% slower even though it's supposed to
-//         * be faster. */
-//        b1 = _mm_loadu_si128((const __m128i *) s1);
-//        b2 = _mm_loadu_si128((const __m128i *) s2);
-//        c = _mm_cmpeq_epi8(b1, b2);
-//
-//        int diff = len - offset;
-//        if (diff < 16) {
-//            int rmask = ~(0xFFFFFFFF << diff);
-//
-//            if ((_mm_movemask_epi8(c) & rmask) != rmask) {
-//                return 1;
-//            }
-//        } else {
-//            if (_mm_movemask_epi8(c) != 0x0000FFFF) {
-//                return 1;
-//            }
-//        }
-//
-//        offset += SCMEMCMP_BYTES;
-//        s1 += SCMEMCMP_BYTES;
-//        s2 += SCMEMCMP_BYTES;
-//    } while (len > offset);
-//
-//    return 0;
-//}
-//
-//#define UPPER_LOW 0x40 /* "A" - 1 */
-//#define UPPER_HIGH 0x5B /* "Z" + 1 */
-//#define UPPER_DELTA 0xDF /* 0xFF - 0x20 */
-//
-//static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len) {
-//    size_t offset = 0;
-//    __m128i b1, b2, mask1, mask2, upper1, upper2, delta;
-//
-//    /* setup registers for upper to lower conversion */
-//    upper1 = _mm_set1_epi8(UPPER_LOW);
-//    upper2 = _mm_set1_epi8(UPPER_HIGH);
-//    delta = _mm_set1_epi8(UPPER_DELTA);
-//
-//    do {
-//        /* unaligned loading of the bytes to compare */
-//        b1 = _mm_loadu_si128((const __m128i *) s1);
-//        b2 = _mm_loadu_si128((const __m128i *) s2);
-//
-//        /* mark all chars bigger than upper1 */
-//        mask1 = _mm_cmpgt_epi8(b2, upper1);
-//        /* mark all chars lower than upper2 */
-//        mask2 = _mm_cmplt_epi8(b2, upper2);
-//        /* merge the two, leaving only those that are true in both */
-//        mask1 = _mm_cmpeq_epi8(mask1, mask2);
-//
-//        /* sub delta leaves 0x20 only for uppercase positions, the
-//           rest is 0x00 due to the saturation (reuse mask1 reg)*/
-//        mask1 = _mm_subs_epu8(mask1, delta);
-//
-//        /* add to b2, converting uppercase to lowercase */
-//        b2 = _mm_add_epi8(b2, mask1);
-//
-//        /* now all is lowercase, let's do the actual compare (reuse mask1 reg) */
-//        mask1 = _mm_cmpeq_epi8(b1, b2);
-//
-//        int diff = len - offset;
-//        if (diff < 16) {
-//            int rmask = ~(0xFFFFFFFF << diff);
-//
-//            if ((_mm_movemask_epi8(mask1) & rmask) != rmask) {
-//                return 1;
-//            }
-//        } else {
-//            if (_mm_movemask_epi8(mask1) != 0x0000FFFF) {
-//                return 1;
-//            }
-//        }
-//
-//        offset += SCMEMCMP_BYTES;
-//        s1 += SCMEMCMP_BYTES;
-//        s2 += SCMEMCMP_BYTES;
-//    } while (len > offset);
-//
-//    return 0;
-//}
-//
+#elif defined(__SSE3__)
+
+#include <string.h> /* for memcmp */
+#include <pmmintrin.h> /* for SSE3 */
+
+#define SCMEMCMP_BYTES 16
+
+/**
+ * Compare the first len bytes of s1 and s2 for equality.
+ *
+ * Full 16 byte chunks are compared with SSE; a tail shorter than 16 bytes
+ * is handed to memcmp so that no byte beyond len is read from either
+ * buffer. A len of 0 compares equal without touching the buffers.
+ *
+ * Returns 0 if the buffers are equal, 1 if they differ. Unlike memcmp the
+ * result carries no ordering information.
+ */
+static inline int SCMemcmp(void *s1, void *s2, size_t len) {
+    const unsigned char *p1 = s1;
+    const unsigned char *p2 = s2;
+    size_t offset = 0;
+    __m128i b1, b2, c;
+
+    while (len - offset >= SCMEMCMP_BYTES) {
+        /* do unaligned loads using _mm_loadu_si128. On my Core2 E6600 using
+         * _mm_lddqu_si128 was about 2% slower even though it's supposed to
+         * be faster. */
+        b1 = _mm_loadu_si128((const __m128i *)(p1 + offset));
+        b2 = _mm_loadu_si128((const __m128i *)(p2 + offset));
+        c = _mm_cmpeq_epi8(b1, b2);
+
+        /* one mask bit per byte: all 16 bits set means the chunk is equal */
+        if (_mm_movemask_epi8(c) != 0x0000FFFF) {
+            return 1;
+        }
+
+        offset += SCMEMCMP_BYTES;
+    }
+
+    /* tail of less than 16 bytes: compare without reading past len */
+    if (offset < len && memcmp(p1 + offset, p2 + offset, len - offset) != 0) {
+        return 1;
+    }
+
+    return 0;
+}
+
+#define UPPER_LOW 0x40 /* "A" - 1 */
+#define UPPER_HIGH 0x5B /* "Z" + 1 */
+#define UPPER_DELTA 0xDF /* 0xFF - 0x20 */
+
+/**
+ * Compare the first len bytes of s1 and s2 for equality, folding the ASCII
+ * uppercase letters in s2 to lowercase first.
+ *
+ * s1 is compared as is, so uppercase letters in s1 never match.
+ * As in SCMemcmp, a tail shorter than 16 bytes is handled one byte at a
+ * time so that no byte beyond len is read.
+ *
+ * Returns 0 if the buffers are equal after folding s2, 1 if they differ.
+ */
+static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len) {
+    const unsigned char *p1 = s1;
+    const unsigned char *p2 = s2;
+    size_t offset = 0;
+    __m128i b1, b2, mask1, mask2, upper1, upper2, delta;
+
+    /* setup registers for upper to lower conversion */
+    upper1 = _mm_set1_epi8(UPPER_LOW);
+    upper2 = _mm_set1_epi8(UPPER_HIGH);
+    delta = _mm_set1_epi8(UPPER_DELTA);
+
+    while (len - offset >= SCMEMCMP_BYTES) {
+        /* unaligned loading of the bytes to compare */
+        b1 = _mm_loadu_si128((const __m128i *)(p1 + offset));
+        b2 = _mm_loadu_si128((const __m128i *)(p2 + offset));
+
+        /* mark all chars bigger than upper1 */
+        mask1 = _mm_cmpgt_epi8(b2, upper1);
+        /* mark all chars lower than upper2 */
+        mask2 = _mm_cmplt_epi8(b2, upper2);
+        /* merge the two, leaving only those that are true in both */
+        mask1 = _mm_cmpeq_epi8(mask1, mask2);
+
+        /* sub delta leaves 0x20 only for uppercase positions, the
+           rest is 0x00 due to the saturation (reuse mask1 reg)*/
+        mask1 = _mm_subs_epu8(mask1, delta);
+
+        /* add to b2, converting uppercase to lowercase */
+        b2 = _mm_add_epi8(b2, mask1);
+
+        /* now all is lowercase, let's do the actual compare (reuse mask1 reg) */
+        mask1 = _mm_cmpeq_epi8(b1, b2);
+
+        if (_mm_movemask_epi8(mask1) != 0x0000FFFF) {
+            return 1;
+        }
+
+        offset += SCMEMCMP_BYTES;
+    }
+
+    /* tail of less than 16 bytes: same fold, one byte at a time */
+    for (; offset < len; offset++) {
+        unsigned char ch = p2[offset];
+
+        if (ch > UPPER_LOW && ch < UPPER_HIGH) {
+            ch = (unsigned char)(ch + 0x20);
+        }
+        if (p1[offset] != ch) {
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
 #else
 
 /* No SIMD support, fall back to plain memcmp and a home grown lowercase one */
diff --git a/src/util-mpm-ac.c b/src/util-mpm-ac.c
index 5af8ffaef6..a723cf9bfe 100644
--- a/src/util-mpm-ac.c
+++ b/src/util-mpm-ac.c
@@ -54,6 +54,7 @@
 #include "conf.h"
 #include "util-debug.h"
 #include "util-unittest.h"
+#include "util-memcmp.h"
 
 void SCACInitCtx(MpmCtx *, int);
 void SCACInitThreadCtx(MpmCtx *, MpmThreadCtx *, uint32_t);
@@ -1206,7 +1207,7 @@ uint32_t SCACSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
                 uint32_t *pids = ctx->output_table[state].pids;
                 for (k = 0; k < no_of_entries; k++) {
                     if (pids[k] & 0xFFFF0000) {
-                        if (memcmp(ctx->pid_pat_list[pids[k] & 0x0000FFFF].cs,
+                        if (SCMemcmp(ctx->pid_pat_list[pids[k] & 0x0000FFFF].cs,
                                    buf + i - ctx->pid_pat_list[pids[k] & 0x0000FFFF].patlen + 1,
                                    ctx->pid_pat_list[pids[k] & 0x0000FFFF].patlen) != 0) {
                             /* inside loop */
@@ -1237,7 +1238,7 @@ uint32_t SCACSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
                 uint32_t k;
                 for (k = 0; k < no_of_entries; k++) {
                     if (pids[k] & 0xFFFF0000) {
-                        if (memcmp(pid_pat_list[pids[k] & 0x0000FFFF].cs,
+                        if (SCMemcmp(pid_pat_list[pids[k] & 0x0000FFFF].cs,
                                    buf + i - pid_pat_list[pids[k] & 0x0000FFFF].patlen + 1,
                                    pid_pat_list[pids[k] & 0x0000FFFF].patlen) != 0) {
                             /* inside loop */