suricata/src/util-spm-bm.c

/* Copyright (C) 2007-2010 Open Information Security Foundation
 *
 * You can copy, redistribute or modify this Program under the terms of
 * the GNU General Public License version 2 as published by the Free
 * Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

/**
 * \file
 *
 * \author Pablo Rincon Crespo <pablo.rincon.crespo@gmail.com>
 *
 * Boyer Moore simple pattern matcher implementation
 *
 * Boyer Moore algorithm has a really good performance. It need to arrays
 * of context for each pattern that hold applicable shifts on the text
 * to seach in, based on characters not available in the pattern
 * and combinations of characters that start a sufix on the pattern.
 * If possible, we should store the context of patterns that we are going
 * to search for multiple times, so we don't spend time on rebuilding them.
 */

#include "suricata-common.h"
#include "suricata.h"

#include "util-spm-bm.h"
#include "util-debug.h"
#include "util-error.h"

/**
 * \brief Given a BmCtx structure, recreate the pre/suffixes for
 *        nocase
 *
 * \retval BmCtx pointer to the already created BmCtx (with BoyerMooreCtxInit())
 * \param str pointer to the pattern string
 * \param size length of the string
 */
void BoyerMooreCtxToNocase(BmCtx *bm_ctx, uint8_t *needle, uint16_t needle_len) {

    /* Prepare bad chars with nocase chars */
    PreBmBcNocase(needle, needle_len, bm_ctx->bmBc);

    /* Prepare good Suffixes with nocase chars */
    PreBmGsNocase(needle, needle_len, bm_ctx->bmGs);
}

/**
 * \brief Setup a Booyer More context.
 *
 * \param str pointer to the pattern string
 * \param size length of the string
 * \retval BmCtx pointer to the newly created Context for the pattern
 * \initonly BoyerMoore contexts should be created at init
 */
BmCtx *BoyerMooreCtxInit(uint8_t *needle, uint16_t needle_len) {
    BmCtx *new = SCMalloc(sizeof(BmCtx));
    if (unlikely(new == NULL)) {
        SCLogError(SC_ERR_FATAL, "Fatal error encountered in BoyerMooreCtxInit. Exiting...");
        exit(EXIT_FAILURE);
    }

    /* Prepare bad chars */
    PreBmBc(needle, needle_len, new->bmBc);

    new->bmGs = SCMalloc(sizeof(uint16_t) * (needle_len + 1));
    if (new->bmGs == NULL) {
        exit(EXIT_FAILURE);
    }

    /* Prepare good Suffixes */
    if (PreBmGs(needle, needle_len, new->bmGs) == -1) {
        SCLogError(SC_ERR_FATAL, "Fatal error encountered in BooyerMooreCtxInit. Exiting...");
        exit(EXIT_FAILURE);
    }


    return new;
}

/**
 * \brief Free the memory allocated to Booyer More context.
 *
 * \param bmCtx pointer to the Context for the pattern
 */
void BoyerMooreCtxDeInit(BmCtx *bmctx)
{
    SCEnter();
    if (bmctx == NULL)
        SCReturn;

    if (bmctx->bmGs != NULL)
        SCFree(bmctx->bmGs);

    SCFree(bmctx);

    SCReturn;
}
/**
 * \brief Array setup function for bad characters that split the pattern
 *        Remember that the result array should be the length of ALPHABET_SIZE
 *
 * \param str pointer to the pattern string
 * \param size length of the string
 * \param result pointer to an empty array that will hold the badchars
 */
void PreBmBc(const uint8_t *x, uint16_t m, uint16_t *bmBc) {
    int32_t i;

    for (i = 0; i < 256; ++i) {
        bmBc[i] = m;
    }
    for (i = 0; i < m - 1; ++i) {
        bmBc[(unsigned char)x[i]] = m - i - 1;
    }
}

/**
 * \brief Array setup function for building prefixes (shift for valid prefixes) for boyermoore context
 *
 * \param x pointer to the pattern string
 * \param m length of the string
 * \param suff pointer to an empty array that will hold the prefixes (shifts)
 */
void BoyerMooreSuffixes(const uint8_t *x, uint16_t m, uint16_t *suff) {
    int32_t f = 0, g, i;
    suff[m - 1] = m;
    g = m - 1;
    for (i = m - 2; i >= 0; --i) {
        if (i > g && suff[i + m - 1 - f] < i - g)
            suff[i] = suff[i + m - 1 - f];
        else {
            if (i < g)
                g = i;
            f = i;
            while (g >= 0 && x[g] == x[g + m - 1 - f])
                --g;
            suff[i] = f - g;
        }
    }
}

/**
 * \brief Array setup function for building prefixes (shift for valid prefixes) for boyermoore context
 *
 * \param x pointer to the pattern string
 * \param m length of the string
 * \param bmGs pointer to an empty array that will hold the prefixes (shifts)
 * \retval 0 ok, -1 failed
 */
int PreBmGs(const uint8_t *x, uint16_t m, uint16_t *bmGs) {
    int32_t i, j;
    uint16_t *suff;

    suff = SCMalloc(sizeof(uint16_t) * (m + 1));
    if (unlikely(suff == NULL))
        return -1;

    BoyerMooreSuffixes(x, m, suff);

    for (i = 0; i < m; ++i)
        bmGs[i] = m;

    j = 0;

    for (i = m - 1; i >= -1; --i)
        if (i == -1 || suff[i] == i + 1)
            for (; j < m - 1 - i; ++j)
                if (bmGs[j] == m)
                    bmGs[j] = m - 1 - i;

    for (i = 0; i <= m - 2; ++i)
        bmGs[m - 1 - suff[i]] = m - 1 - i;
    SCFree(suff);
    return 0;
}

/**
 * \brief Array setup function for bad characters that split the pattern
 *        Remember that the result array should be the length of ALPHABET_SIZE
 *
 * \param str pointer to the pattern string
 * \param size length of the string
 * \param result pointer to an empty array that will hold the badchars
 */
void PreBmBcNocase(const uint8_t *x, uint16_t m, uint16_t *bmBc) {
    int32_t i;

    for (i = 0; i < 256; ++i) {
        bmBc[i] = m;
    }
    for (i = 0; i < m - 1; ++i) {
        bmBc[u8_tolower((unsigned char)x[i])] = m - 1 - i;
    }
}

void BoyerMooreSuffixesNocase(const uint8_t *x, uint16_t m, uint16_t *suff) {
    int32_t f = 0, g, i;

    suff[m - 1] = m;
    g = m - 1;
    for (i = m - 2; i >= 0; --i) {
        if (i > g && suff[i + m - 1 - f] < i - g) {
            suff[i] = suff[i + m - 1 - f];
        } else {
            if (i < g) {
                g = i;
            }
            f = i;
            while (g >= 0 && u8_tolower(x[g]) == u8_tolower(x[g + m - 1 - f])) {
                --g;
            }
            suff[i] = f - g;
        }
    }
}

/**
 * \brief Array setup function for building prefixes (shift for valid prefixes)
 *        for boyermoore context case less
 *
 * \param x pointer to the pattern string
 * \param m length of the string
 * \param bmGs pointer to an empty array that will hold the prefixes (shifts)
 */
void PreBmGsNocase(const uint8_t *x, uint16_t m, uint16_t *bmGs) {
    int32_t i, j;
    uint16_t* suff;

    suff = SCMalloc(sizeof(uint16_t) * (m + 1));
    if (unlikely(suff == NULL))
        return;

    BoyerMooreSuffixesNocase(x, m, suff);

    for (i = 0; i < m; ++i) {
        bmGs[i] = m;
    }
    j = 0;
    for (i = m - 1; i >= 0; --i) {
        if (i == -1 || suff[i] == i + 1) {
            for (; j < m - 1 - i; ++j) {
                if (bmGs[j] == m) {
                    bmGs[j] = m - 1 - i;
                }
            }
        }
    }
    for (i = 0; i <= m - 2; ++i) {
        bmGs[m - 1 - suff[i]] = m - 1 - i;
    }

    SCFree(suff);
}

/**
 * \brief Boyer Moore search algorithm
 *        Is better as the pattern length increases and for big buffers to search in.
 *        The algorithm needs a context of two arrays already prepared
 *        by prep_bad_chars() and prep_good_suffix()
 *
 * \param y pointer to the buffer to search in
 * \param n length limit of the buffer
 * \param x pointer to the pattern we ar searching for
 * \param m length limit of the needle
 * \param bmBc pointer to an array of BoyerMooreSuffixes prepared by prep_good_suffix()
 * \param bmGs pointer to an array of bachars prepared by prep_bad_chars()
 *
 * \retval ptr to start of the match; NULL if no match
 */
uint8_t *BoyerMoore(uint8_t *x, uint16_t m, uint8_t *y, int32_t n, uint16_t *bmGs, uint16_t *bmBc) {
   int i, j, m1, m2;
#if 0
    printf("\nBad:\n");
    for (i=0;i<ALPHABET_SIZE;i++)
        printf("%c,%d ", i, bmBc[i]);

    printf("\ngood:\n");
    for (i=0;i<m;i++)
        printf("%c, %d ", x[i],bmBc[i]);
    printf("\n");
#endif
   j = 0;

   while (j <= n - m ) {
      for (i = m - 1; i >= 0 && x[i] == y[i + j]; --i);

      if (i < 0) {
         return y + j;
         //j += bmGs[0];
      } else {
 //        printf("%c", y[i+j]);
         j += (m1 = bmGs[i]) > (m2 = bmBc[y[i + j]] - m + 1 + i)? m1: m2;
//            printf("%d, %d\n", m1, m2);
      }
   }
   return NULL;
}


/**
 * \brief Boyer Moore search algorithm
 *        Is better as the pattern length increases and for big buffers to search in.
 *        The algorithm needs a context of two arrays already prepared
 *        by prep_bad_chars() and prep_good_suffix()
 *
 * \param y pointer to the buffer to search in
 * \param n length limit of the buffer
 * \param x pointer to the pattern we ar searching for
 * \param m length limit of the needle
 * \param bmBc pointer to an array of BoyerMooreSuffixes prepared by prep_good_suffix()
 * \param bmGs pointer to an array of bachars prepared by prep_bad_chars()
 *
 * \retval ptr to start of the match; NULL if no match
 */
uint8_t *BoyerMooreNocase(uint8_t *x, uint16_t m, uint8_t *y, int32_t n, uint16_t *bmGs, uint16_t *bmBc) {
    int i, j, m1, m2;
#if 0
    printf("\nBad:\n");
    for (i=0;i<ALPHABET_SIZE;i++)
        printf("%c,%d ", i, bmBc[i]);

    printf("\ngood:\n");
    for (i=0;i<m;i++)
        printf("%c, %d ", x[i],bmBc[i]);
    printf("\n");
#endif
    j = 0;
    while (j <= n - m ) {
        for (i = m - 1; i >= 0 && u8_tolower(x[i]) == u8_tolower(y[i + j]); --i);

        if (i < 0) {
            return y + j;
        } else {
            j += (m1=bmGs[i]) > (m2=bmBc[u8_tolower(y[i + j])] - m + 1 + i)?m1:m2;
        }
   }
   return NULL;
}