Add memcmp functions for SSE4.1 and SSE4.2.

remotes/origin/master-1.1.x
Victor Julien 15 years ago
parent 1859ed54c7
commit a75556dfe5

@ -20,22 +20,194 @@
*
* \author Victor Julien <victor@inliniac.net>
*
* Memcmp implementations.
* Memcmp implementations for SSE3, SSE4.1 and SSE4.2.
*/
#ifndef __UTIL_MEMCMP_H__
#define __UTIL_MEMCMP_H__
/** \brief compare two patterns, converting the 2nd to lowercase
 *
 *  The first argument is compared as-is, so it is expected to already be
 *  lowercase. The second argument may hold mixed case data; its bytes in the
 *  range 'A'-'Z' are lowercased before the comparison. The third argument is
 *  the number of bytes to compare.
 *
 *  The SIMD implementations in this header return 0 when the buffers match
 *  and 1 when they do not.
 *
 *  \warning *ONLY* the 2nd pattern is converted to lowercase
 */
static inline int SCMemcmpLowercase(void *, void *, size_t);
void MemcmpRegisterTests(void);
#if defined(__SSE3__)
#if defined(__SSE4_2__)
#include <pmmintrin.h> /* for SSE3 */
#include <nmmintrin.h>
static inline int SCMemcmp(void *s1, void *s2, size_t n)
{
__m128i b1, b2;
int r;
/* counter for how far we already matched in the buffer */
size_t m = 0;
do {
/* load the buffers into the 128bit vars */
b1 = _mm_loadu_si128((const __m128i *) s1);
b2 = _mm_loadu_si128((const __m128i *) s2);
/* do the actual compare */
m += (r = _mm_cmpestri(b1, n - m, b2, 16,
_SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY));
s1 += 16;
s2 += 16;
} while (r == 16);
return ((m == n) ? 0 : 1);
}
/* Range of values of uppercase characters. Only the first pair ('A', 'Z')
 * is used as a range; the remaining bytes are zero padding so that the
 * array is a full 16 bytes, because it is read with an aligned 128 bit
 * load. */
static char scmemcmp_uppercase[16] __attribute__((aligned(16))) = {
    'A', 'Z' };
/** \brief compare two buffers in a case insensitive way
 *
 *  Complete 16 byte blocks are handled with the SSE4.2 string instructions.
 *  A trailing partial block is handled byte by byte, so nothing beyond the
 *  first n bytes of either buffer is ever loaded.
 *
 *  \param s1 buffer already in lowercase
 *  \param s2 buffer with mixed upper and lowercase
 *  \param n number of bytes to compare
 *
 *  \retval 0 s1 equals the lowercased s2 over n bytes (also when n is 0)
 *  \retval 1 the buffers differ
 *
 *  \warning *ONLY* s2 is converted to lowercase
 */
static inline int SCMemcmpLowercase(void *s1, void *s2, size_t n)
{
    const unsigned char *p1 = s1;
    const unsigned char *p2 = s2;
    const __m128i ucase = _mm_load_si128((const __m128i *) scmemcmp_uppercase);
    const __m128i uplow = _mm_set1_epi8(0x20);

    while (n >= 16) {
        const __m128i b1 = _mm_loadu_si128((const __m128i *) p1);
        __m128i b2 = _mm_loadu_si128((const __m128i *) p2);

        /* The first step is creating a mask that is FF for all uppercase
         * characters, 00 for all others */
        const __m128i upper = _mm_cmpestrm(ucase, 2, b2, 16,
                _SIDD_CMP_RANGES | _SIDD_UNIT_MASK);

        /* AND-ing that mask with 0x20 leaves 0x20 for the uppercase chars
         * and 00 for all others; adding it converts uppercase to
         * lowercase */
        b2 = _mm_add_epi8(b2, _mm_and_si128(upper, uplow));

        /* search using our converted buffer: the result is the index of
         * the first differing byte, or 16 when the whole block matches */
        if (_mm_cmpestri(b1, 16, b2, 16,
                    _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY) != 16) {
            return 1;
        }

        p1 += 16;
        p2 += 16;
        n -= 16;
    }

    /* fewer than 16 bytes left: a 128 bit load would run past the end of
     * the buffers, so finish with a plain byte compare */
    for (; n > 0; n--) {
        unsigned char c = *p2++;
        if (c >= 'A' && c <= 'Z') {
            c = (unsigned char)(c + 0x20);
        }
        if (*p1++ != c) {
            return 1;
        }
    }

    return 0;
}
#elif defined(__SSE4_1__)
#include <smmintrin.h>
#define SCMEMCMP_BYTES 16

static inline int SCMemcmp(void *, void *, size_t);
static inline int SCMemcmpLowercase(void *, void *, size_t);

/** \brief compare two buffers for equality, SCMEMCMP_BYTES bytes at a time
 *
 *  Complete blocks are compared with 128 bit SIMD operations. A trailing
 *  partial block is compared byte by byte, so nothing beyond the first len
 *  bytes of either buffer is ever loaded.
 *
 *  \param s1 first buffer
 *  \param s2 second buffer
 *  \param len number of bytes to compare
 *
 *  \retval 0 the first len bytes are identical (also when len is 0)
 *  \retval 1 the buffers differ
 */
static inline int SCMemcmp(void *s1, void *s2, size_t len) {
    const unsigned char *p1 = s1;
    const unsigned char *p2 = s2;

    for (; len >= SCMEMCMP_BYTES; len -= SCMEMCMP_BYTES) {
        /* do unaligned loads using _mm_loadu_si128. On my Core2 E6600 using
         * _mm_lddqu_si128 was about 2% slower even though it's supposed to
         * be faster. */
        const __m128i b1 = _mm_loadu_si128((const __m128i *) p1);
        const __m128i b2 = _mm_loadu_si128((const __m128i *) p2);

        /* one bit per byte, set when the bytes are equal: all 16 bits
         * must be set for the block to match */
        if (_mm_movemask_epi8(_mm_cmpeq_epi8(b1, b2)) != 0x0000FFFF) {
            return 1;
        }

        p1 += SCMEMCMP_BYTES;
        p2 += SCMEMCMP_BYTES;
    }

    /* less than a full block left: a 128 bit load would run past the end
     * of the buffers, so finish with a plain byte compare */
    for (; len > 0; len--) {
        if (*p1++ != *p2++) {
            return 1;
        }
    }

    return 0;
}
#define UPPER_LOW 0x40 /* "A" - 1 */
#define UPPER_HIGH 0x5B /* "Z" + 1 */

/** \brief compare two buffers in a case insensitive way
 *
 *  Complete 16 byte blocks are handled with 128 bit SIMD operations. A
 *  trailing partial block is handled byte by byte, so nothing beyond the
 *  first len bytes of either buffer is ever loaded.
 *
 *  \param s1 buffer already in lowercase
 *  \param s2 buffer with mixed upper and lowercase
 *  \param len number of bytes to compare
 *
 *  \retval 0 s1 equals the lowercased s2 over len bytes (also when len is 0)
 *  \retval 1 the buffers differ
 *
 *  \warning *ONLY* s2 is converted to lowercase
 */
static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len) {
    const unsigned char *p1 = s1;
    const unsigned char *p2 = s2;
    /* bytes handled per iteration: one 128 bit register */
    const size_t stride = sizeof(__m128i);

    /* setup registers for upper to lower conversion */
    const __m128i below_a = _mm_set1_epi8(UPPER_LOW);
    const __m128i above_z = _mm_set1_epi8(UPPER_HIGH);
    const __m128i uplow = _mm_set1_epi8(0x20);

    for (; len >= stride; len -= stride) {
        /* unaligned loading of the bytes to compare */
        const __m128i b1 = _mm_loadu_si128((const __m128i *) p1);
        __m128i b2 = _mm_loadu_si128((const __m128i *) p2);

        /* FF for every byte that is bigger than "A" - 1 and lower than
         * "Z" + 1. The compares are signed, so bytes of 0x80 and up are
         * negative and never count as uppercase. */
        const __m128i is_upper = _mm_and_si128(_mm_cmpgt_epi8(b2, below_a),
                                               _mm_cmplt_epi8(b2, above_z));

        /* AND-ing the mask with 0x20 leaves 0x20 for the uppercase chars
         * and 00 for all others; adding it converts uppercase to
         * lowercase */
        b2 = _mm_add_epi8(b2, _mm_and_si128(is_upper, uplow));

        /* now all is lowercase, let's do the actual compare: all 16 bits
         * must be set for the block to match */
        if (_mm_movemask_epi8(_mm_cmpeq_epi8(b1, b2)) != 0x0000FFFF) {
            return 1;
        }

        p1 += stride;
        p2 += stride;
    }

    /* less than a full block left: a 128 bit load would run past the end
     * of the buffers, so finish with a plain byte compare */
    for (; len > 0; len--) {
        unsigned char c = *p2++;
        if (c > UPPER_LOW && c < UPPER_HIGH) {
            c = (unsigned char)(c + 0x20);
        }
        if (*p1++ != c) {
            return 1;
        }
    }

    return 0;
}
#elif defined(__SSE3__)
#include <pmmintrin.h> /* for SSE3 */
#define SCMEMCMP_BYTES 16
static inline int SCMemcmp(void *s1, void *s2, size_t len) {
size_t offset = 0;
@ -128,7 +300,7 @@ static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len) {
#else
/* No SIMD support */
/* No SIMD support, fall back to plain memcmp and a home grown lowercase one */
#define SCMemcmp memcmp
@ -147,7 +319,7 @@ SCMemcmpLowercase(void *s1, void *s2, size_t n) {
return 0;
}
#endif /* __SSE3__ */
#endif /* SIMD */
#endif /* __UTIL_MEMCMP_H__ */

@ -41,6 +41,7 @@
#include "util-unittest.h"
#include "util-hashlist.h"
#include "util-optimize.h"
#include "util-memcmp.h"
#include "conf.h"
@ -1093,7 +1094,7 @@ uint32_t B2gcSearchBNDMq(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, PatternM
prt(pattern, hdr->len);printf("\n");
prt(buf+pos-1, hdr->len);printf("\n");
#endif
if (memcmp_lowercase(pattern, buf+pos-1, hdr->len) == 0) {
if (SCMemcmpLowercase(pattern, buf+pos-1, hdr->len) == 0) {
matches += MpmVerifyMatch(mpm_thread_ctx, pmq, hdr->id);
}
} else {
@ -1103,7 +1104,7 @@ uint32_t B2gcSearchBNDMq(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, PatternM
prt(pattern, hdr->len);printf("\n");
prt(buf+pos-1, hdr->len);printf("\n");
#endif
if (memcmp(pattern, buf+pos-1, hdr->len) == 0) {
if (SCMemcmp(pattern, buf+pos-1, hdr->len) == 0) {
matches += MpmVerifyMatch(mpm_thread_ctx, pmq, hdr->id);
}
}

Loading…
Cancel
Save