Reenable SSE3 memcmp and switch AC memcmp to use the SCMemcmp wrapper.

remotes/origin/master-1.1.x
Victor Julien 14 years ago
parent 6f58ef13c4
commit b963890de1

@ -206,101 +206,101 @@ static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len) {
//#elif defined(__SSE3__)
//
//#include <pmmintrin.h> /* for SSE3 */
//
//#define SCMEMCMP_BYTES 16
//
//static inline int SCMemcmp(void *s1, void *s2, size_t len) {
// size_t offset = 0;
// __m128i b1, b2, c;
//
// do {
// /* do unaligned loads using _mm_loadu_si128. On my Core2 E6600 using
// * _mm_lddqu_si128 was about 2% slower even though it's supposed to
// * be faster. */
// b1 = _mm_loadu_si128((const __m128i *) s1);
// b2 = _mm_loadu_si128((const __m128i *) s2);
// c = _mm_cmpeq_epi8(b1, b2);
//
// int diff = len - offset;
// if (diff < 16) {
// int rmask = ~(0xFFFFFFFF << diff);
//
// if ((_mm_movemask_epi8(c) & rmask) != rmask) {
// return 1;
// }
// } else {
// if (_mm_movemask_epi8(c) != 0x0000FFFF) {
// return 1;
// }
// }
//
// offset += SCMEMCMP_BYTES;
// s1 += SCMEMCMP_BYTES;
// s2 += SCMEMCMP_BYTES;
// } while (len > offset);
//
// return 0;
//}
//
//#define UPPER_LOW 0x40 /* "A" - 1 */
//#define UPPER_HIGH 0x5B /* "Z" + 1 */
//#define UPPER_DELTA 0xDF /* 0xFF - 0x20 */
//
//static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len) {
// size_t offset = 0;
// __m128i b1, b2, mask1, mask2, upper1, upper2, delta;
//
// /* setup registers for upper to lower conversion */
// upper1 = _mm_set1_epi8(UPPER_LOW);
// upper2 = _mm_set1_epi8(UPPER_HIGH);
// delta = _mm_set1_epi8(UPPER_DELTA);
//
// do {
// /* unaligned loading of the bytes to compare */
// b1 = _mm_loadu_si128((const __m128i *) s1);
// b2 = _mm_loadu_si128((const __m128i *) s2);
//
// /* mark all chars bigger than upper1 */
// mask1 = _mm_cmpgt_epi8(b2, upper1);
// /* mark all chars lower than upper2 */
// mask2 = _mm_cmplt_epi8(b2, upper2);
// /* merge the two, leaving only those that are true in both */
// mask1 = _mm_cmpeq_epi8(mask1, mask2);
//
// /* sub delta leaves 0x20 only for uppercase positions, the
// rest is 0x00 due to the saturation (reuse mask1 reg)*/
// mask1 = _mm_subs_epu8(mask1, delta);
//
// /* add to b2, converting uppercase to lowercase */
// b2 = _mm_add_epi8(b2, mask1);
//
// /* now all is lowercase, let's do the actual compare (reuse mask1 reg) */
// mask1 = _mm_cmpeq_epi8(b1, b2);
//
// int diff = len - offset;
// if (diff < 16) {
// int rmask = ~(0xFFFFFFFF << diff);
//
// if ((_mm_movemask_epi8(mask1) & rmask) != rmask) {
// return 1;
// }
// } else {
// if (_mm_movemask_epi8(mask1) != 0x0000FFFF) {
// return 1;
// }
// }
//
// offset += SCMEMCMP_BYTES;
// s1 += SCMEMCMP_BYTES;
// s2 += SCMEMCMP_BYTES;
// } while (len > offset);
//
// return 0;
//}
//
#elif defined(__SSE3__)
#include <pmmintrin.h> /* for SSE3 */
#define SCMEMCMP_BYTES 16
/**
 * Compare two buffers for equality using 16-byte SSE compares.
 *
 * s1, s2: buffers to compare; only the first len bytes of each are read.
 * len:    number of bytes to compare.
 *
 * Returns 0 when the first len bytes are identical (including len == 0)
 * and 1 when any byte differs. Unlike memcmp() the result carries no
 * ordering information.
 */
static inline int SCMemcmp(void *s1, void *s2, size_t len) {
    const unsigned char *p1 = s1;
    const unsigned char *p2 = s2;
    size_t remaining = len;

    /* Vector path: only taken while a full chunk is available, so the
     * 16-byte loads never touch memory beyond the caller's buffers. */
    while (remaining >= SCMEMCMP_BYTES) {
        /* do unaligned loads using _mm_loadu_si128. The original author
         * measured _mm_lddqu_si128 as about 2% slower on a Core2 E6600
         * even though it's supposed to be faster. */
        __m128i b1 = _mm_loadu_si128((const __m128i *) p1);
        __m128i b2 = _mm_loadu_si128((const __m128i *) p2);

        /* one mask bit per byte lane; all 16 set means the chunk matches */
        if (_mm_movemask_epi8(_mm_cmpeq_epi8(b1, b2)) != 0x0000FFFF) {
            return 1;
        }

        p1 += SCMEMCMP_BYTES;
        p2 += SCMEMCMP_BYTES;
        remaining -= SCMEMCMP_BYTES;
    }

    /* Scalar tail (< 16 bytes): compare byte by byte instead of loading
     * a full chunk and masking, which would read past the buffer end. */
    for (; remaining > 0; remaining--, p1++, p2++) {
        if (*p1 != *p2) {
            return 1;
        }
    }

    return 0;
}
#define UPPER_LOW 0x40 /* "A" - 1 */
#define UPPER_HIGH 0x5B /* "Z" + 1 */
#define UPPER_DELTA 0xDF /* 0xFF - 0x20 */

/**
 * Compare s1 against a lowercased view of s2, 16 bytes at a time.
 *
 * Only s2 is case folded (ASCII 'A'..'Z' -> 'a'..'z'); s1 is compared
 * as is, so an uppercase byte in s1 never matches.
 *
 * s1:  reference buffer, compared unmodified.
 * s2:  buffer whose uppercase ASCII bytes are folded before comparing.
 * len: number of bytes to compare; no byte beyond len is read.
 *
 * Returns 0 when all len bytes match (including len == 0), 1 otherwise.
 */
static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len) {
    const unsigned char *p1 = s1;
    const unsigned char *p2 = s2;
    size_t remaining = len;

    /* setup registers for upper to lower conversion */
    const __m128i upper1 = _mm_set1_epi8(UPPER_LOW);
    const __m128i upper2 = _mm_set1_epi8(UPPER_HIGH);
    const __m128i delta = _mm_set1_epi8((char) UPPER_DELTA);

    /* Vector path: only full chunks, so the 16-byte loads stay inside
     * the caller's buffers. */
    while (remaining >= SCMEMCMP_BYTES) {
        /* unaligned loading of the bytes to compare */
        __m128i b1 = _mm_loadu_si128((const __m128i *) p1);
        __m128i b2 = _mm_loadu_si128((const __m128i *) p2);

        /* 0xFF in every lane holding 'A'..'Z'. The compares are signed,
         * so bytes >= 0x80 fail the "greater than" test and are left
         * untouched. */
        __m128i is_upper = _mm_and_si128(_mm_cmpgt_epi8(b2, upper1),
                                         _mm_cmplt_epi8(b2, upper2));

        /* saturating sub of delta leaves 0x20 only for uppercase
         * positions, the rest becomes 0x00 */
        __m128i to_lower = _mm_subs_epu8(is_upper, delta);

        /* add to b2, converting uppercase to lowercase */
        b2 = _mm_add_epi8(b2, to_lower);

        /* now s2's chunk is lowercase, do the actual compare */
        if (_mm_movemask_epi8(_mm_cmpeq_epi8(b1, b2)) != 0x0000FFFF) {
            return 1;
        }

        p1 += SCMEMCMP_BYTES;
        p2 += SCMEMCMP_BYTES;
        remaining -= SCMEMCMP_BYTES;
    }

    /* Scalar tail (< 16 bytes): same folding rule, byte by byte, rather
     * than a masked full-width load that would read past the buffers. */
    for (; remaining > 0; remaining--, p1++, p2++) {
        unsigned char c = *p2;

        if (c > UPPER_LOW && c < UPPER_HIGH) {
            c = (unsigned char) (c + (0xFF - UPPER_DELTA));
        }
        if (*p1 != c) {
            return 1;
        }
    }

    return 0;
}
#else
/* No SIMD support, fall back to plain memcmp and a home grown lowercase one */

@ -54,6 +54,7 @@
#include "conf.h"
#include "util-debug.h"
#include "util-unittest.h"
#include "util-memcmp.h"
void SCACInitCtx(MpmCtx *, int);
void SCACInitThreadCtx(MpmCtx *, MpmThreadCtx *, uint32_t);
@ -1206,7 +1207,7 @@ uint32_t SCACSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
uint32_t *pids = ctx->output_table[state].pids;
for (k = 0; k < no_of_entries; k++) {
if (pids[k] & 0xFFFF0000) {
if (memcmp(ctx->pid_pat_list[pids[k] & 0x0000FFFF].cs,
if (SCMemcmp(ctx->pid_pat_list[pids[k] & 0x0000FFFF].cs,
buf + i - ctx->pid_pat_list[pids[k] & 0x0000FFFF].patlen + 1,
ctx->pid_pat_list[pids[k] & 0x0000FFFF].patlen) != 0) {
/* inside loop */
@ -1237,7 +1238,7 @@ uint32_t SCACSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
uint32_t k;
for (k = 0; k < no_of_entries; k++) {
if (pids[k] & 0xFFFF0000) {
if (memcmp(pid_pat_list[pids[k] & 0x0000FFFF].cs,
if (SCMemcmp(pid_pat_list[pids[k] & 0x0000FFFF].cs,
buf + i - pid_pat_list[pids[k] & 0x0000FFFF].patlen + 1,
pid_pat_list[pids[k] & 0x0000FFFF].patlen) != 0) {
/* inside loop */

Loading…
Cancel
Save