From a75556dfe5b773b2a3a6317ddeb9198ae19d9b95 Mon Sep 17 00:00:00 2001
From: Victor Julien
Date: Mon, 20 Sep 2010 12:38:37 +0200
Subject: [PATCH] Add memcmp functions for SSE4.1 and SSE4.2.

---
Review notes (v2):
 - The SIMD loops now only handle full 16 byte blocks. What is left (less
   than 16 bytes) is compared byte-wise, so a compare never loads memory
   past the end of the buffers it was given.
 - scmemcmp_uppercase is a full 16 bytes: it is read with a 128 bit load.
 - The buffers are walked through unsigned char pointers instead of doing
   arithmetic on void pointers.
 - The header names of the three intrinsics includes are spelled out.

 src/util-memcmp.h   | 249 +++++++++++++++++++++++++++++++++++++++++---
 src/util-mpm-b2gc.c |   5 +-
 2 files changed, 245 insertions(+), 9 deletions(-)

diff --git a/src/util-memcmp.h b/src/util-memcmp.h
index efbf976619..6883c44c60 100644
--- a/src/util-memcmp.h
+++ b/src/util-memcmp.h
@@ -20,22 +20,257 @@
  *
  * \author Victor Julien
  *
- * Memcmp implementations.
+ * Memcmp implementations for SSE3, SSE4.1 and SSE4.2.
  */
 
 #ifndef __UTIL_MEMCMP_H__
 #define __UTIL_MEMCMP_H__
 
+/** \brief compare two patterns, converting the 2nd to lowercase
+ *  \warning *ONLY* the 2nd pattern is converted to lowercase
+ */
+static inline int SCMemcmpLowercase(void *, void *, size_t);
+
+/** \internal
+ *  \brief byte-wise compare of the final (less than 16) bytes of two buffers
+ *
+ *  The SSE4.x code below uses this for the tail so that a 16 byte load
+ *  never reads past the end of the caller's buffers.
+ *
+ *  \retval 0 the first n bytes are equal
+ *  \retval 1 the buffers differ
+ */
+static inline int SCMemcmpTail(const unsigned char *s1,
+        const unsigned char *s2, size_t n)
+{
+    size_t i;
+
+    for (i = 0; i < n; i++) {
+        if (s1[i] != s2[i]) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/** \internal
+ *  \brief like SCMemcmpTail, but ASCII 'A'-'Z' in s2 is lowercased first
+ *
+ *  \retval 0 the first n bytes are equal after lowercasing s2
+ *  \retval 1 the buffers differ
+ */
+static inline int SCMemcmpLowercaseTail(const unsigned char *s1,
+        const unsigned char *s2, size_t n)
+{
+    size_t i;
+
+    for (i = 0; i < n; i++) {
+        unsigned char c = s2[i];
+
+        if (c >= 'A' && c <= 'Z') {
+            c += 0x20;
+        }
+        if (s1[i] != c) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
 void MemcmpRegisterTests(void);
 
-#if defined(__SSE3__)
+#if defined(__SSE4_2__)
 
-#include <pmmintrin.h> /* for SSE3 */
+#include <nmmintrin.h> /* for SSE4.2 */
+
+#define SCMEMCMP_BYTES 16
+
+/** \brief compare two buffers
+ *  \param s1 first buffer
+ *  \param s2 second buffer
+ *  \param n number of bytes to compare
+ *  \retval 0 the buffers are equal
+ *  \retval 1 the buffers differ
+ */
+static inline int SCMemcmp(void *s1, void *s2, size_t n)
+{
+    const unsigned char *p1 = s1;
+    const unsigned char *p2 = s2;
+
+    while (n >= SCMEMCMP_BYTES) {
+        /* load the buffers into the 128bit vars */
+        __m128i b1 = _mm_loadu_si128((const __m128i *) p1);
+        __m128i b2 = _mm_loadu_si128((const __m128i *) p2);
+
+        /* do the actual compare: the result is the index of the first
+         * byte that differs, or 16 if the whole block matches */
+        if (_mm_cmpestri(b1, SCMEMCMP_BYTES, b2, SCMEMCMP_BYTES,
+                    _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY)
+                != SCMEMCMP_BYTES) {
+            return 1;
+        }
+
+        p1 += SCMEMCMP_BYTES;
+        p2 += SCMEMCMP_BYTES;
+        n -= SCMEMCMP_BYTES;
+    }
+
+    /* less than a full block left: compare it without over-reading */
+    return SCMemcmpTail(p1, p2, n);
+}
+
+/* Range of values of uppercase characters. Only the first two bytes are
+ * used as the range, but the array is a full 16 bytes because it is
+ * loaded with a 128 bit aligned load. */
+static char scmemcmp_uppercase[16] __attribute__((aligned(16))) = {
+    'A', 'Z' };
+
+/** \brief compare two buffers in a case insensitive way
+ *  \param s1 buffer already in lowercase
+ *  \param s2 buffer with mixed upper and lowercase
+ *  \param n number of bytes to compare
+ *  \retval 0 the buffers are equal after lowercasing s2
+ *  \retval 1 the buffers differ
+ */
+static inline int SCMemcmpLowercase(void *s1, void *s2, size_t n)
+{
+    const unsigned char *p1 = s1;
+    const unsigned char *p2 = s2;
+
+    const __m128i ucase = _mm_load_si128((const __m128i *) scmemcmp_uppercase);
+    const __m128i nulls = _mm_setzero_si128();
+    const __m128i uplow = _mm_set1_epi8(0x20);
+
+    while (n >= SCMEMCMP_BYTES) {
+        __m128i b1 = _mm_loadu_si128((const __m128i *) p1);
+        __m128i b2 = _mm_loadu_si128((const __m128i *) p2);
+
+        /* The first step is creating a mask that is FF for all uppercase
+         * characters, 00 for all others */
+        __m128i mask = _mm_cmpestrm(ucase, 2, b2, SCMEMCMP_BYTES,
+                _SIDD_CMP_RANGES | _SIDD_UNIT_MASK);
+        /* Next we use that mask to create a new one: this one has 0x20
+         * for the uppercase chars, 00 for all others. */
+        mask = _mm_blendv_epi8(nulls, uplow, mask);
+        /* finally, merge the mask and the buffer converting the
+         * uppercase to lowercase */
+        b2 = _mm_add_epi8(b2, mask);
+
+        /* search using our converted buffer */
+        if (_mm_cmpestri(b1, SCMEMCMP_BYTES, b2, SCMEMCMP_BYTES,
+                    _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY)
+                != SCMEMCMP_BYTES) {
+            return 1;
+        }
+
+        p1 += SCMEMCMP_BYTES;
+        p2 += SCMEMCMP_BYTES;
+        n -= SCMEMCMP_BYTES;
+    }
+
+    /* less than a full block left: compare it without over-reading */
+    return SCMemcmpLowercaseTail(p1, p2, n);
+}
+
+#elif defined(__SSE4_1__)
+
+#include <smmintrin.h> /* for SSE4.1 */
 
 #define SCMEMCMP_BYTES 16
 
-static inline int SCMemcmp(void *, void *, size_t);
-static inline int SCMemcmpLowercase(void *, void *, size_t);
+/** \brief compare two buffers
+ *  \param s1 first buffer
+ *  \param s2 second buffer
+ *  \param len number of bytes to compare
+ *  \retval 0 the buffers are equal
+ *  \retval 1 the buffers differ
+ */
+static inline int SCMemcmp(void *s1, void *s2, size_t len) {
+    const unsigned char *p1 = s1;
+    const unsigned char *p2 = s2;
+
+    while (len >= SCMEMCMP_BYTES) {
+        /* do unaligned loads using _mm_loadu_si128. On my Core2 E6600 using
+         * _mm_lddqu_si128 was about 2% slower even though it's supposed to
+         * be faster. */
+        __m128i b1 = _mm_loadu_si128((const __m128i *) p1);
+        __m128i b2 = _mm_loadu_si128((const __m128i *) p2);
+        __m128i c = _mm_cmpeq_epi8(b1, b2);
+
+        /* one bit per byte: all 16 set means the whole block matches */
+        if (_mm_movemask_epi8(c) != 0x0000FFFF) {
+            return 1;
+        }
+
+        p1 += SCMEMCMP_BYTES;
+        p2 += SCMEMCMP_BYTES;
+        len -= SCMEMCMP_BYTES;
+    }
+
+    /* less than a full block left: compare it without over-reading */
+    return SCMemcmpTail(p1, p2, len);
+}
+
+#define UPPER_LOW   0x40 /* "A" - 1 */
+#define UPPER_HIGH  0x5B /* "Z" + 1 */
+
+/** \brief compare two buffers in a case insensitive way
+ *  \param s1 buffer already in lowercase
+ *  \param s2 buffer with mixed upper and lowercase
+ *  \param len number of bytes to compare
+ *  \retval 0 the buffers are equal after lowercasing s2
+ *  \retval 1 the buffers differ
+ */
+static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len) {
+    const unsigned char *p1 = s1;
+    const unsigned char *p2 = s2;
+
+    /* setup registers for upper to lower conversion */
+    const __m128i upper1 = _mm_set1_epi8(UPPER_LOW);
+    const __m128i upper2 = _mm_set1_epi8(UPPER_HIGH);
+    const __m128i nulls = _mm_setzero_si128();
+    const __m128i uplow = _mm_set1_epi8(0x20);
+
+    while (len >= SCMEMCMP_BYTES) {
+        /* unaligned loading of the bytes to compare */
+        __m128i b1 = _mm_loadu_si128((const __m128i *) p1);
+        __m128i b2 = _mm_loadu_si128((const __m128i *) p2);
+
+        /* mark all chars bigger than upper1 */
+        __m128i mask1 = _mm_cmpgt_epi8(b2, upper1);
+        /* mark all chars lower than upper2 */
+        __m128i mask2 = _mm_cmplt_epi8(b2, upper2);
+        /* merge the two, leaving only those that are true in both. No byte
+         * can fail both tests, so "equal" here means "both true". */
+        mask1 = _mm_cmpeq_epi8(mask1, mask2);
+        /* Next we use that mask to create a new one: this one has 0x20
+         * for the uppercase chars, 00 for all others. */
+        mask1 = _mm_blendv_epi8(nulls, uplow, mask1);
+
+        /* add to b2, converting uppercase to lowercase */
+        b2 = _mm_add_epi8(b2, mask1);
+
+        /* now all is lowercase, let's do the actual compare (reuse mask1 reg) */
+        mask1 = _mm_cmpeq_epi8(b1, b2);
+
+        if (_mm_movemask_epi8(mask1) != 0x0000FFFF) {
+            return 1;
+        }
+
+        p1 += SCMEMCMP_BYTES;
+        p2 += SCMEMCMP_BYTES;
+        len -= SCMEMCMP_BYTES;
+    }
+
+    /* less than a full block left: compare it without over-reading */
+    return SCMemcmpLowercaseTail(p1, p2, len);
+}
+
+#elif defined(__SSE3__)
+
+#include <pmmintrin.h> /* for SSE3 */
+
+#define SCMEMCMP_BYTES 16
 
 static inline int SCMemcmp(void *s1, void *s2, size_t len) {
     size_t offset = 0;
@@ -128,7 +363,7 @@ static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len) {
 
 #else
 
-/* No SIMD support */
+/* No SIMD support, fall back to plain memcmp and a home grown lowercase one */
 
 #define SCMemcmp memcmp
 
@@ -147,7 +382,7 @@ SCMemcmpLowercase(void *s1, void *s2, size_t n) {
     return 0;
 }
 
-#endif /* __SSE3__ */
+#endif /* SIMD */
 
 #endif /* __UTIL_MEMCMP_H__ */
 
diff --git a/src/util-mpm-b2gc.c b/src/util-mpm-b2gc.c
index 6739eaea44..3fef187259 100644
--- a/src/util-mpm-b2gc.c
+++ b/src/util-mpm-b2gc.c
@@ -41,6 +41,7 @@
 #include "util-unittest.h"
 #include "util-hashlist.h"
 #include "util-optimize.h"
+#include "util-memcmp.h"
 
 #include "conf.h"
 
@@ -1093,7 +1094,7 @@ uint32_t B2gcSearchBNDMq(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, PatternM
                             prt(pattern, hdr->len);printf("\n");
                             prt(buf+pos-1, hdr->len);printf("\n");
 #endif
-                            if (memcmp_lowercase(pattern, buf+pos-1, hdr->len) == 0) {
+                            if (SCMemcmpLowercase(pattern, buf+pos-1, hdr->len) == 0) {
                                 matches += MpmVerifyMatch(mpm_thread_ctx, pmq, hdr->id);
                             }
                         } else {
@@ -1103,7 +1104,7 @@ uint32_t B2gcSearchBNDMq(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, PatternM
                             prt(pattern, hdr->len);printf("\n");
                             prt(buf+pos-1, hdr->len);printf("\n");
 #endif
-                            if (memcmp(pattern, buf+pos-1, hdr->len) == 0) {
+                            if (SCMemcmp(pattern, buf+pos-1, hdr->len) == 0) {
                                 matches += MpmVerifyMatch(mpm_thread_ctx, pmq, hdr->id);
                             }
                         }