dpdk: rework hugepage hints to use per-numa information

Previous integration of hugepage analysis only fetched data
from /proc/meminfo. However this proved to be often
deceiving mainly for providing only global information and
not taking into account different hugepage sizes (e.g. 1GB
hugepages) and different NUMA nodes.

Ticket: #6419
pull/10204/head
Lukas Sismis 2 years ago committed by Victor Julien
parent c28cc93e23
commit ca6f7c2d00

@ -15,6 +15,57 @@ learn more about the basic setup for DPDK.
The following sections contain examples of how to set up DPDK and Suricata for
more obscure use-cases.
Hugepage analysis
-----------------
Suricata can analyse utilized hugepages on the system. This can be particularly
beneficial when there's a potential overallocation of hugepages.
The hugepage analysis is designed to examine the hugepages in use and
provide recommendations on an adequate number of hugepages. This then ensures
Suricata operates optimally while leaving sufficient memory for other
applications on the system. The analysis works by comparing snapshots of the
hugepages before and after Suricata is initialized. After the initialization,
no more hugepages are allocated by Suricata.
The hugepage analysis can be seen in the Perf log level and is printed out
during the Suricata start. It is only printed when Suricata detects some
discrepancies in the system related to hugepage allocation.
It's recommended to perform this analysis from a "clean" state -
that is a state when all your hugepages are free. It is especially recommended
when no other hugepage-dependent applications are running on your system.
This can be checked in one of two ways:
.. code-block::
# global check
cat /proc/meminfo
HugePages_Total: 1024
HugePages_Free: 1024
# per-numa check depends on NUMA node ID, hugepage size,
# and nr_hugepages/free_hugepages - e.g.:
cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages
After the termination of Suricata and other hugepage-related applications,
if the count of free hugepages is not equal to the total number of hugepages,
it indicates some hugepages were not freed completely.
This can be fixed by removing DPDK-related files from the hugepage-mounted
directory (filesystem).
It's important to exercise caution while removing hugepages, especially when
other hugepage-dependent applications are in operation, as this action will
disrupt their memory functionality.
Removing the DPDK files from the hugepage directory can often be done as:
.. code-block:: bash
sudo rm -rf /dev/hugepages/rtemap_*
# To check where hugepages are mounted:
dpdk-hugepages.py -s
# or
mount | grep huge
Bond interface
--------------

@ -555,6 +555,7 @@ noinst_HEADERS = \
util-hash-string.h \
util-host-info.h \
util-host-os-info.h \
util-hugepages.h \
util-hyperscan.h \
util-ioctl.h \
util-ip.h \
@ -1157,6 +1158,7 @@ libsuricata_c_a_SOURCES = \
util-hash-string.c \
util-host-info.c \
util-host-os-info.c \
util-hugepages.c \
util-hyperscan.c \
util-ioctl.c \
util-ip.c \

@ -127,6 +127,7 @@
#include "util-ebpf.h"
#include "util-exception-policy.h"
#include "util-host-os-info.h"
#include "util-hugepages.h"
#include "util-ioctl.h"
#include "util-landlock.h"
#include "util-luajit.h"
@ -2973,6 +2974,7 @@ int SuricataMain(int argc, char **argv)
goto out;
}
SystemHugepageSnapshot *prerun_snap = SystemHugepageSnapshotCreate();
SCSetStartTime(&suricata);
RunModeDispatch(suricata.run_mode, suricata.runmode_custom_mode,
suricata.capture_plugin_name, suricata.capture_plugin_args);
@ -3031,7 +3033,11 @@ int SuricataMain(int argc, char **argv)
PostRunStartedDetectSetup(&suricata);
DPDKEvaluateHugepages();
SystemHugepageSnapshot *postrun_snap = SystemHugepageSnapshotCreate();
if (run_mode == RUNMODE_DPDK) // only DPDK uses hpages at the moment
SystemHugepageEvaluateHugepages(prerun_snap, postrun_snap);
SystemHugepageSnapshotDestroy(prerun_snap);
SystemHugepageSnapshotDestroy(postrun_snap);
SCPledge();
SuricataMainLoop(&suricata);

@ -66,106 +66,7 @@ void DPDKFreeDevice(LiveDevice *ldev)
#endif
}
/**
 * \brief Opens /proc/meminfo for reading
 * \return stream for /proc/meminfo, NULL when the file cannot be opened
 *         (the failure is reported at Info level)
 */
static FILE *HugepagesMeminfoOpen(void)
{
    FILE *meminfo = fopen("/proc/meminfo", "r");
    if (meminfo != NULL) {
        return meminfo;
    }

    SCLogInfo("Can't analyze hugepage usage: failed to open /proc/meminfo");
    return NULL;
}
/**
 * \brief Closes a stream obtained from HugepagesMeminfoOpen()
 * \param fp stream to close, a NULL pointer is silently ignored
 */
static void HugepagesMeminfoClose(FILE *fp)
{
    if (fp == NULL) {
        return;
    }
    fclose(fp);
}
/**
 * Looks up a single numeric entry in meminfo
 *
 * The stream is consumed token by token from its current position, so the
 * caller has to rewind it before searching for another keyword.
 *
 * \param fp Opened file pointer for reading of file /proc/meminfo at beginning
 * \param keyword Entry to look for e.g. "HugePages_Free:"
 * \return n Value of the entry
 * \return -1 On error (keyword missing, no value after it, or value not parsable)
 *
 */
static int32_t MemInfoParseValue(FILE *fp, const char *keyword)
{
    char token[256];
    char number[64];
    int32_t parsed = -1;

    while (fscanf(fp, "%255s", token) != EOF) {
        if (strcmp(token, keyword) != 0) {
            continue;
        }

        // the whitespace-separated token right after the keyword is its value
        if (fscanf(fp, "%63s", number) == EOF) {
            SCLogDebug("%s: not followed by any number", keyword);
            return -1;
        }
        if (StringParseInt32(&parsed, 10, 23, number) < 0) {
            SCLogDebug("Failed to convert %s from /proc/meminfo", keyword);
            return -1;
        }
        return parsed;
    }

    return -1;
}
/**
 * \brief Prints a hint when more than half of the system-wide hugepages are free
 * \param fp stream of /proc/meminfo; it is rewound between the two lookups
 */
static void MemInfoEvaluateHugepages(FILE *fp)
{
    const int32_t hp_free = MemInfoParseValue(fp, "HugePages_Free:");
    if (hp_free < 0) {
        SCLogInfo("HugePages_Free information not found in /proc/meminfo");
        return;
    }

    // the first lookup consumed the stream, restart the search from the top
    rewind(fp);
    const int32_t hp_total = MemInfoParseValue(fp, "HugePages_Total:");
    if (hp_total < 0) {
        SCLogInfo("HugePages_Total information not found in /proc/meminfo");
        return;
    }
    if (hp_total == 0) {
        SCLogInfo("HugePages_Total equals to zero");
        return;
    }

    const float free_ratio = (float)hp_free / (float)hp_total;
    if (free_ratio > 0.5) {
        // suggest the number of pages in use plus a 15 % margin
        SCLogInfo("%" PRIu32 " of %" PRIu32
                  " of hugepages are free - number of hugepages can be lowered to e.g. %.0lf",
                hp_free, hp_total, ceil((hp_total - hp_free) * 1.15));
    }
}
/**
 * \brief Runs a callback on an opened /proc/meminfo stream and closes it afterwards
 * \param callback function receiving the opened stream; not invoked when the
 *                 file cannot be opened
 */
static void MemInfoWith(void (*callback)(FILE *))
{
    FILE *meminfo = HugepagesMeminfoOpen();
    if (meminfo == NULL) {
        return;
    }

    callback(meminfo);
    HugepagesMeminfoClose(meminfo);
}
/**
 * \brief Prints hugepage usage hints based on /proc/meminfo
 *
 * Does nothing unless Suricata runs in the DPDK runmode. When built with DPDK
 * and the EAL reports that hugepages are disabled, only a Perf-level note is
 * printed and /proc/meminfo is not inspected.
 */
void DPDKEvaluateHugepages(void)
{
    if (run_mode != RUNMODE_DPDK) {
        return;
    }

#ifdef HAVE_DPDK
    if (rte_eal_has_hugepages() == 0) {
        // hugepages disabled
        SCLogPerf("Hugepages not enabled - enabling hugepages can improve performance");
        return;
    }
#endif

    MemInfoWith(MemInfoEvaluateHugepages);
}
#ifdef HAVE_DPDK
/**
* Retrieves name of the port from port id
* Not thread-safe

@ -121,7 +121,6 @@ void DPDKCleanupEAL(void);
void DPDKCloseDevice(LiveDevice *ldev);
void DPDKFreeDevice(LiveDevice *ldev);
void DPDKEvaluateHugepages(void);
#ifdef HAVE_DPDK
const char *DPDKGetPortNameByPortID(uint16_t pid);

@ -0,0 +1,411 @@
/* Copyright (C) 2023 Open Information Security Foundation
*
* You can copy, redistribute or modify this Program under the terms of
* the GNU General Public License version 2 as published by the Free
* Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
/**
* \file
*
* \author Lukas Sismis <lsismis@oisf.net>
*/
#include "suricata.h"
#include "util-debug.h"
#include "util-hugepages.h"
static uint16_t SystemHugepageSizesCntPerNodeGet(uint16_t node_index);
static uint16_t SystemNodeCountGet(void);
static void SystemHugepagePerNodeGetHugepageSizes(
uint16_t node_index, uint16_t hp_sizes_cnt, uint32_t *hp_sizes);
static HugepageInfo *SystemHugepageHugepageInfoCreate(uint16_t hp_size_cnt);
static int16_t SystemHugepagePerNodeGetHugepageInfo(uint16_t node_index, NodeInfo *node);
static void SystemHugepageHugepageInfoDestroy(HugepageInfo *h);
static void SystemHugepageNodeInfoDestroy(NodeInfo *n);
static void SystemHugepageNodeInfoDump(NodeInfo *n);
static void SystemHugepageSnapshotDump(SystemHugepageSnapshot *s);
/**
 * \brief Reports whether hugepage inspection is compiled in for this platform
 * \returns false when any of __CYGWIN__, OS_WIN32, __OpenBSD__ or sun is defined,
 *          true otherwise
 */
static bool SystemHugepageSupported(void)
{
#if defined __CYGWIN__ || defined OS_WIN32 || defined __OpenBSD__ || defined sun
    return false;
#else
    return true;
#endif /* defined __CYGWIN__ || defined OS_WIN32 || defined __OpenBSD__ || defined sun */
}
// block of all hugepage-specific internal functions
#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
/**
 * \brief Linux-specific function counting NUMA nodes listed in sysfs
 * \returns number of directories in /sys/devices/system/node/ whose name starts
 *          with "node", 0 when that directory cannot be opened
 */
static uint16_t SystemNodeCountGetLinux(void)
{
    static const char node_prefix[] = "node";
    const char sysfs_nodes[] = "/sys/devices/system/node/";

    DIR *d = opendir(sysfs_nodes);
    if (d == NULL) {
        SCLogError("unable to open %s", sysfs_nodes);
        return 0;
    }

    uint16_t nodes = 0;
    for (struct dirent *e = readdir(d); e != NULL; e = readdir(d)) {
        if (e->d_type != DT_DIR)
            continue;
        if (strncmp(e->d_name, node_prefix, strlen(node_prefix)) != 0)
            continue;
        nodes++;
    }

    closedir(d);
    return nodes;
}
/**
 * \brief Linux-specific function counting the hugepage sizes listed in sysfs for a node
 * \param[in] node_index index of the NUMA node
 * \returns number of "hugepages-*" directories found for the node,
 *          0 when the node's hugepages directory cannot be opened
 */
static uint16_t SystemHugepageSizesCntPerNodeGetLinux(uint16_t node_index)
{
    static const char size_dir_prefix[] = "hugepages-";
    char sysfs_dir[256];

    snprintf(sysfs_dir, sizeof(sysfs_dir), "/sys/devices/system/node/node%d/hugepages/",
            node_index);

    DIR *d = opendir(sysfs_dir);
    if (d == NULL) {
        SCLogError("unable to open %s", sysfs_dir);
        return 0;
    }

    uint16_t sizes = 0;
    for (struct dirent *e = readdir(d); e != NULL; e = readdir(d)) {
        if (e->d_type != DT_DIR)
            continue;
        if (strncmp(e->d_name, size_dir_prefix, strlen(size_dir_prefix)) != 0)
            continue;
        sizes++;
    }

    closedir(d);
    return sizes;
}
/**
 * \brief Linux-specific function to detect unique hugepage sizes
 * \note At most `hp_sizes_cnt` entries of `hp_sizes` are written. Entries that are
 *       not filled in (directory unreadable, fewer sizes found, unparsable name)
 *       keep the value the caller initialised them with.
 * \param[in] node_index index of the NUMA node
 * \param[in] hp_sizes_cnt capacity of the `hp_sizes` array
 * \param[out] hp_sizes a pointer to the array of hugepage sizes (in kB)
 */
static void SystemHugepagePerNodeGetHugepageSizesLinux(
        uint16_t node_index, uint16_t hp_sizes_cnt, uint32_t *hp_sizes)
{
    char dir_path[256];
    snprintf(dir_path, sizeof(dir_path), "/sys/devices/system/node/node%d/hugepages/", node_index);
    DIR *dir = opendir(dir_path);
    if (dir == NULL) {
        SCLogError("unable to open %s", dir_path);
        return;
    }

    uint16_t index = 0;
    struct dirent *entry;
    // The count was taken by an earlier directory scan; stop at the array's
    // capacity so that a directory that changed meanwhile cannot overflow it.
    while (index < hp_sizes_cnt && (entry = readdir(dir)) != NULL) {
        if (entry->d_type != DT_DIR || strncmp(entry->d_name, "hugepages-", 10) != 0)
            continue;

        if (sscanf(entry->d_name, "hugepages-%ukB", &(hp_sizes[index])) != 1) {
            SCLogError("unable to parse hugepage size from %s", entry->d_name);
            continue;
        }
        index++;
    }
    closedir(dir);
}
/**
 * \brief Linux-specific function reading the allocated and free hugepage counters
 *        of every hugepage size on one NUMA node
 * \note Arrays `hugepages` and `hp_sizes` are expected to have the same size
 * \param[out] hugepages a pointer to the array of hugepage info structures
 * \param[in] hp_sizes a pointer to the array of hugepage sizes
 * \param[in] hp_sizes_cnt number of hugepage sizes
 * \param[in] node_index index of the NUMA node
 * \returns 0 on success, negative number on error
 */
static int16_t SystemHugepagePerNodeGetHugepageInfoLinux(
        HugepageInfo *hugepages, uint32_t *hp_sizes, uint16_t hp_sizes_cnt, uint16_t node_index)
{
    for (int16_t idx = 0; idx < hp_sizes_cnt; idx++) {
        HugepageInfo *hp = &hugepages[idx];
        char sysfs_path[256];
        FILE *fp;

        hp->size_kb = hp_sizes[idx];

        // counter of hugepages of this size allocated on the node
        snprintf(sysfs_path, sizeof(sysfs_path),
                "/sys/devices/system/node/node%hu/hugepages/hugepages-%ukB/nr_hugepages",
                node_index, hp_sizes[idx]);
        fp = fopen(sysfs_path, "r");
        if (fp == NULL) {
            SCLogError("unable to open %s", sysfs_path);
            return -SC_EEXIST;
        }
        if (fscanf(fp, "%hu", &hp->allocated) != 1) {
            SCLogError("failed to read the total number of allocated hugepages (%ukB) on node %hu",
                    hp_sizes[idx], node_index);
            fclose(fp);
            return -SC_EINVAL;
        }
        fclose(fp);

        // counter of hugepages of this size that are currently free on the node
        snprintf(sysfs_path, sizeof(sysfs_path),
                "/sys/devices/system/node/node%hu/hugepages/hugepages-%ukB/free_hugepages",
                node_index, hp_sizes[idx]);
        fp = fopen(sysfs_path, "r");
        if (fp == NULL) {
            SCLogError("unable to open %s", sysfs_path);
            return -SC_EEXIST;
        }
        if (fscanf(fp, "%hu", &hp->free) != 1) {
            SCLogError("failed to read the total number of free hugepages (%ukB) on node %hu",
                    hp_sizes[idx], node_index);
            fclose(fp);
            return -SC_EINVAL;
        }
        fclose(fp);
    }

    return 0;
}
#endif /* !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun */
/**
 * \brief The function gathers information about hugepages on a given node
 *
 * On success as well as on a failed counter read, `node->hugepages` holds an
 * array allocated by this function; it is released through
 * SystemHugepageNodeInfoDestroy().
 *
 * \param[in] node_index index of the NUMA node
 * \param[out] node a pointer to the structure to hold hugepage info
 * \returns 0 on success, negative number on error
 */
static int16_t SystemHugepagePerNodeGetHugepageInfo(uint16_t node_index, NodeInfo *node)
{
    const uint16_t size_cnt = SystemHugepageSizesCntPerNodeGet(node_index);
    if (size_cnt == 0) {
        SCLogError("hugepages not found for node %d", node_index);
        return -SC_EEXIST;
    }

    // scratch list of page sizes, only needed while the counters are being read
    uint32_t *sizes_kb = SCCalloc(size_cnt, sizeof(uint32_t));
    if (sizes_kb == NULL) {
        FatalError("failed to allocate memory for hugepage info");
    }
    SystemHugepagePerNodeGetHugepageSizes(node_index, size_cnt, sizes_kb);

    node->hugepages = SystemHugepageHugepageInfoCreate(size_cnt);
    node->num_hugepage_sizes = size_cnt;

    int16_t rc = 0;
#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
    rc = SystemHugepagePerNodeGetHugepageInfoLinux(
            node->hugepages, sizes_kb, node->num_hugepage_sizes, node_index);
#endif /* !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun */

    SCFree(sizes_kb);
    return rc;
}
/**
 * \brief The function detects number of NUMA nodes on the system
 * \returns number of detected nodes; 0 if detection is unsuccessful or the
 *          platform has no implementation compiled in
 */
static uint16_t SystemNodeCountGet(void)
{
#if defined __CYGWIN__ || defined OS_WIN32 || defined __OpenBSD__ || defined sun
    return 0;
#else
    return SystemNodeCountGetLinux();
#endif /* defined __CYGWIN__ || defined OS_WIN32 || defined __OpenBSD__ || defined sun */
}
/**
 * \brief The function detects the number of unique hugepage sizes on a NUMA node
 * \param[in] node_index index of the NUMA node
 * \returns number of hugepage sizes; 0 if detection is unsuccessful or the
 *          platform has no implementation compiled in
 */
static uint16_t SystemHugepageSizesCntPerNodeGet(uint16_t node_index)
{
#if defined __CYGWIN__ || defined OS_WIN32 || defined __OpenBSD__ || defined sun
    (void)node_index;
    return 0;
#else
    return SystemHugepageSizesCntPerNodeGetLinux(node_index);
#endif /* defined __CYGWIN__ || defined OS_WIN32 || defined __OpenBSD__ || defined sun */
}
/**
 * \brief The function fills an array with unique hugepage sizes
 * \note `hp_sizes` must have room for `hp_sizes_cnt` entries. On platforms without
 *       an implementation compiled in, the array is left untouched.
 * \param[in] node_index index of the NUMA node
 * \param[in] hp_sizes_cnt number of hugepage sizes
 * \param[out] hp_sizes a pointer to the array of hugepage sizes
 */
static void SystemHugepagePerNodeGetHugepageSizes(
        uint16_t node_index, uint16_t hp_sizes_cnt, uint32_t *hp_sizes)
{
#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
    // plain call: a void function must not `return` an expression in ISO C
    SystemHugepagePerNodeGetHugepageSizesLinux(node_index, hp_sizes_cnt, hp_sizes);
#else
    (void)node_index;
    (void)hp_sizes_cnt;
    (void)hp_sizes;
#endif /* !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun */
}
static HugepageInfo *SystemHugepageHugepageInfoCreate(uint16_t hp_size_cnt)
{
HugepageInfo *h = SCCalloc(hp_size_cnt, sizeof(*h));
if (h == NULL) {
FatalError("failed to allocate hugepage info array");
}
return h;
}
/**
 * \brief Frees an array created by SystemHugepageHugepageInfoCreate()
 * \param[in] h array to free, NULL is ignored
 */
static void SystemHugepageHugepageInfoDestroy(HugepageInfo *h)
{
    if (h == NULL) {
        return;
    }
    SCFree(h);
}
/**
 * \brief Releases the hugepage array owned by a node record
 * \note The node record itself is not freed here
 * \param[in] n node record, NULL is ignored
 */
static void SystemHugepageNodeInfoDestroy(NodeInfo *n)
{
    if (n != NULL) {
        SystemHugepageHugepageInfoDestroy(n->hugepages);
    }
}
/**
 * \brief Prints the hugepage counters of one NUMA node at Debug level
 * \param[in] n node record, NULL is ignored
 */
static void SystemHugepageNodeInfoDump(NodeInfo *n)
{
    if (n == NULL)
        return;

    for (uint16_t i = 0; i < n->num_hugepage_sizes; i++) {
        // size_kb is an unsigned 32-bit value, hence %u (the 16-bit counters promote to int)
        SCLogDebug("Hugepage size - %ukB - allocated: %d free: %d", n->hugepages[i].size_kb,
                n->hugepages[i].allocated, n->hugepages[i].free);
    }
}
/**
 * \brief The function prints out the hugepage snapshot node by node at Debug level
 * \param[in] s a pointer to the snapshot, NULL is ignored
 */
static void SystemHugepageSnapshotDump(SystemHugepageSnapshot *s)
{
    if (s == NULL)
        return;

    uint16_t node_idx = 0;
    while (node_idx < s->num_nodes) {
        SCLogDebug("NUMA Node %d", node_idx);
        SystemHugepageNodeInfoDump(&s->nodes[node_idx]);
        node_idx++;
    }
}
/**
 * \brief Frees a snapshot together with its per-node hugepage arrays
 * \param[in] s a pointer to the snapshot, NULL is ignored
 */
void SystemHugepageSnapshotDestroy(SystemHugepageSnapshot *s)
{
    if (s == NULL)
        return;

    // release what every node owns before dropping the node array itself
    uint16_t node_idx = 0;
    while (node_idx < s->num_nodes) {
        SystemHugepageNodeInfoDestroy(&s->nodes[node_idx]);
        node_idx++;
    }

    SCFree(s->nodes);
    SCFree(s);
}
/**
* \brief The function creates a snapshot of the system's hugepage usage
* per NUMA node and per hugepage size.
* The snapshot is used to evaluate the system's hugepage usage after
* initialization of Suricata.
* \returns a pointer to the snapshot, NULL on error
*/
SystemHugepageSnapshot *SystemHugepageSnapshotCreate(void)
{
if (!SystemHugepageSupported())
return NULL;
uint16_t node_cnt = SystemNodeCountGet();
if (node_cnt == 0) {
SCLogError("failed to obtain number of NUMA nodes in the system");
return NULL;
}
NodeInfo *nodes = SCCalloc(node_cnt, sizeof(*nodes));
if (nodes == NULL) {
FatalError("failed to allocate memory for NUMA node info");
}
SystemHugepageSnapshot *s = SCCalloc(1, sizeof(*s));
if (s == NULL) {
SCFree(nodes);
FatalError("failed to allocate memory for NUMA node snapshot");
}
s->num_nodes = node_cnt;
s->nodes = nodes;
for (uint16_t i = 0; i < s->num_nodes; i++) {
int16_t ret = SystemHugepagePerNodeGetHugepageInfo(i, &s->nodes[i]);
if (ret != 0) {
SystemHugepageSnapshotDestroy(s);
return NULL;
}
}
return s;
}
/**
 * \brief The function compares two hugepage snapshots and prints out
 *        recommendations for hugepage configuration
 *
 * For every NUMA node and every hugepage size the number of free hugepages
 * before and after initialization is compared:
 *  - nothing free beforehand: the entry is skipped
 *  - more free pages afterwards: a warning is logged
 *  - same number of free pages: the pages are reported as unused
 *  - otherwise: hints are printed when all 2048kB pages were consumed or
 *    when more than half of the previously free pages are still free
 *
 * \note The function ends in FatalError when the two snapshots do not have the
 *       same number of nodes or the same number of hugepage sizes per node.
 * \param[in] pre_s a pointer to the snapshot taken before Suricata initialization
 * \param[in] post_s a pointer to the snapshot taken after Suricata initialization
 */
void SystemHugepageEvaluateHugepages(SystemHugepageSnapshot *pre_s, SystemHugepageSnapshot *post_s)
{
    if (!SystemHugepageSupported() || pre_s == NULL || post_s == NULL)
        return;

    SCLogDebug("Hugepages before initialization");
    SystemHugepageSnapshotDump(pre_s);

    SCLogDebug("Hugepages after initialization");
    SystemHugepageSnapshotDump(post_s);

    if (pre_s->num_nodes != post_s->num_nodes)
        FatalError("Number of NUMA nodes changed during hugepage evaluation");

    for (int32_t i = 0; i < post_s->num_nodes; i++) {
        if (pre_s->nodes[i].num_hugepage_sizes != post_s->nodes[i].num_hugepage_sizes)
            FatalError("Number of NUMA node hugepage sizes changed during hugepage evaluation");

        // Each node carries its own count of hugepage sizes; use the count of
        // node i (not of node 0) so the per-node arrays are never overrun.
        for (int32_t j = 0; j < post_s->nodes[i].num_hugepage_sizes; j++) {
            HugepageInfo *prerun_hp = &pre_s->nodes[i].hugepages[j];
            HugepageInfo *postrun_hp = &post_s->nodes[i].hugepages[j];

            if (prerun_hp->free == 0) {
                continue; // no hugepage of this size was free on this node before the start
            }
            if (prerun_hp->free < postrun_hp->free) {
                SCLogWarning(
                        "Hugepage usage decreased while it should only increase/stay the same");
                continue;
            }
            if (prerun_hp->free == postrun_hp->free) {
                SCLogPerf("Hugepages on NUMA node %d are unused and can be deallocated", i);
                continue;
            }

            // From here on at least one hugepage was taken, so this is treated
            // as an active NUMA node.

            // speculative hint only for 2048kB pages as e.g. 1 GB pages can leave a lot of room
            // for additional allocations
            if (postrun_hp->size_kb == 2048 && postrun_hp->free == 0) {
                SCLogPerf("all %ukB hugepages used on NUMA node %d - consider increasing to "
                          "prevent memory allocation from other NUMA nodes",
                        postrun_hp->size_kb, i);
            }

            float free_hugepages_ratio = (float)postrun_hp->free / (float)prerun_hp->free;
            if (free_hugepages_ratio > 0.5) {
                int32_t used_hps = prerun_hp->free - postrun_hp->free;
                // suggest the number of pages actually taken plus a 15 % margin
                SCLogPerf("Hugepages on NUMA node %d can be set to %.0lf (only using %d/%hu "
                          "%ukB hugepages)",
                        i, ceil(used_hps * 1.15), used_hps, prerun_hp->free,
                        postrun_hp->size_kb);
            }
        }
    }
}

@ -0,0 +1,53 @@
/* Copyright (C) 2023 Open Information Security Foundation
*
* You can copy, redistribute or modify this Program under the terms of
* the GNU General Public License version 2 as published by the Free
* Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
/**
* \file
*
* \author Lukas Sismis <lsismis@oisf.net>
*/
#ifndef UTIL_HUGEPAGES_H
#define UTIL_HUGEPAGES_H
// Counters of one hugepage size on one NUMA node
typedef struct {
// size of the hugepage in kB (e.g. 2048 for 2 MB hugepages)
uint32_t size_kb;
// number of hugepages of this size allocated on the node (nr_hugepages)
uint16_t allocated;
// number of hugepages of this size currently free on the node (free_hugepages)
uint16_t free;
} HugepageInfo;
// Structure to hold information about an individual NUMA node in the system
// and its respective allocated hugepages
// So for e.g. NUMA node 0 there can be 2 hugepage sizes - 2 MB and 1 GB
// Each hugepage size will then have a record of number of allocated/free hpages
typedef struct {
// number of records in the hugepages array
uint16_t num_hugepage_sizes;
// array with one record per hugepage size found on the node
HugepageInfo *hugepages;
} NodeInfo;
// Structure to hold information about all hugepage sizes residing on all NUMA
// nodes in the system
typedef struct {
// number of records in the nodes array
uint16_t num_nodes;
// array with one record per NUMA node, indexed by the node index
NodeInfo *nodes;
} SystemHugepageSnapshot;
// Creates a snapshot of hugepage counters per NUMA node and per hugepage size.
// Returns NULL on error or on unsupported platforms; a non-NULL snapshot is
// released with SystemHugepageSnapshotDestroy().
SystemHugepageSnapshot *SystemHugepageSnapshotCreate(void);
// Frees a snapshot including its per-node data; a NULL pointer is ignored.
void SystemHugepageSnapshotDestroy(SystemHugepageSnapshot *s);
// Compares the snapshot taken before initialization (pre_s) with the one taken
// after it (post_s) and logs hugepage configuration hints; does nothing when
// either pointer is NULL.
void SystemHugepageEvaluateHugepages(SystemHugepageSnapshot *pre_s, SystemHugepageSnapshot *post_s);
#endif /* UTIL_HUGEPAGES_H */
Loading…
Cancel
Save