1
0
mirror of https://github.com/systemd/systemd synced 2026-03-13 08:34:47 +01:00

Compare commits

..

No commits in common. "dd8c12e5ec2416838401350a75f1df24f3bcb6cf" and "5276ef1548d6cda3ba22270a7a153334c1815e33" have entirely different histories.

15 changed files with 119 additions and 426 deletions

View File

@ -25,7 +25,6 @@ Christophe Varoqui <christophe.varoqui@free.fr>
Colin Guthrie <ColinGuthrie@web>
Cristian Rodríguez <cristian@rodriguez.im> <crodriguez@owncloud.com>
Daan De Meyer <daan.j.demeyer@gmail.com>
Daan De Meyer <daan.j.demeyer@gmail.com> <daan@amutable.com>
Daniel Elstner <daniel.kitta@gmail.com> <danielk@openismus.com>
Daniel Gorbea <danielgorbea@hotmail.com>
Daniel J Walsh <dwalsh@redhat.com>
@ -111,7 +110,6 @@ Larry Bernstone <lbernstone@gmail.com>
Lennart Poettering <lennart@poettering.net> <LennartPoettering@web>
Lennart Poettering <lennart@poettering.net> <lennart@bf9bc1cc-28ce-0310-abfb-9041aa761afb>
Lennart Poettering <lennart@poettering.net> <mzninuv@0pointer.de>
Lennart Poettering <lennart@poettering.net> <lennart@amutable.com>
Leonard König <leonard.r.koenig@googlemail.com>
Luca BRUNO <luca.bruno@coreos.com>
Luis Felipe Strano Moraes <luis.strano@gmail.com> <lfelipe@profusion.mobi>

View File

@ -14,7 +14,7 @@ A few interfaces are optionally kept for backward compatibility.
When systemd is compiled with the `-Dcompat-sysv-interfaces=true` setting,
legacy interfaces are provided,
e.g. the `runlevelX.target` aliases,
and lock directories under `/var` and `/run`.
and lock directories in under `/var` and `/run`.
This option may be extended to cover other deprecated interfaces in the future.

View File

@ -48,26 +48,6 @@
<xi:include href="standard-conf.xml" xpointer="main-conf" />
<refsect1>
<title>Prekill event</title>
<para><command>systemd-oomd</command> supports notifying external components before killing a control
group.
This is done by sending a notification over varlink to all sockets found in the
<filename>/run/systemd/oomd.prekill.hook/</filename> directory. Each socket should implement the
<constant>io.systemd.oom.Prekill</constant> interface. The notification contains the control group path
to allow the hook to identify which control group is being killed. This allows external components to
perform any necessary cleanup or logging before the control group is terminated. The hook is not intended
as a way to avoid the kill, but rather as a notification mechanism.
Note that this is a privileged option: even though it is bounded by a timeout, it is synchronous and
delays the kill, so use it with care.
The generally preferable way to react to memory pressure is the mechanism described in
<ulink url="https://systemd.io/MEMORY_PRESSURE/">MEMORY_PRESSURE</ulink>, which is unprivileged,
asynchronous and does not delay the kill.
</para>
</refsect1>
<refsect1>
<title>[OOM] Section Options</title>
@ -118,22 +98,6 @@
<xi:include href="version-info.xml" xpointer="v248"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>PrekillHookTimeoutSec=</varname></term>
<listitem><para>Sets the amount of time <command>systemd-oomd</command> will wait for pre-kill hooks
to complete, before proceeding with the control group termination. Pre-kill hooks work by placing
a varlink socket in the <filename>/run/systemd/oomd.prekill.hook/</filename> directory. Each socket
should implement the <constant>io.systemd.oom.Prekill</constant> interface for the notification to work.
Before killing a control group, <command>systemd-oomd</command> sends a notification to each
discovered socket. The timeout is intended to be global and
not per hook. If all hooks return earlier, the kill is performed as soon as possible. The timeout
must be at least 1s.
Defaults to 0, which means <command>systemd-oomd</command> will not wait and no notifications
will be sent.</para>
<xi:include href="version-info.xml" xpointer="v260"/></listitem>
</varlistentry>
</variablelist>
</refsect1>

View File

@ -66,8 +66,6 @@
#define VARLINK_PATH_MACHINED_USERDB "/run/systemd/userdb/io.systemd.Machine"
/* Path where systemd-machined listens to resolve.hook varlink queries */
#define VARLINK_PATH_MACHINED_RESOLVE_HOOK "/run/systemd/resolve.hook/io.systemd.Machine"
/* Path where to connect to send varlink prekill events */
#define VARLINK_DIR_OOMD_PREKILL_HOOK "/run/systemd/oomd.prekill.hook/"
/* Recommended baseline - see README for details */
#define KERNEL_BASELINE_VERSION "5.14"

View File

@ -75,7 +75,6 @@ void manager_parse_config_file(Manager *m) {
{ "OOM", "SwapUsedLimit", config_parse_permyriad, 0, &m->swap_used_limit_permyriad },
{ "OOM", "DefaultMemoryPressureLimit", config_parse_loadavg, 0, &m->default_mem_pressure_limit },
{ "OOM", "DefaultMemoryPressureDurationSec", config_parse_duration, 0, &m->default_mem_pressure_duration_usec },
{ "OOM", "PrekillHookTimeoutSec", config_parse_sec, 0, &m->prekill_timeout },
{}
};

View File

@ -105,7 +105,7 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p
m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;
if (message.mode == MANAGED_OOM_AUTO) {
(void) oomd_cgroup_context_unref(hashmap_remove(monitor_hm, empty_to_root(message.path)));
(void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(message.path)));
continue;
}
@ -392,7 +392,7 @@ static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void
if (oomd_mem_available_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) &&
oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
_cleanup_hashmap_free_ Hashmap *candidates = NULL;
OomdCGroupContext *selected = NULL;
_cleanup_free_ char *selected = NULL;
uint64_t threshold;
log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and "
@ -408,28 +408,29 @@ static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void
log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
r = oomd_select_by_swap_usage(candidates, threshold, &selected);
if (r < 0)
return log_error_errno(r, "Failed to select any cgroups based on swap: %m");
if (r == 0) {
log_debug("No cgroup candidates found for swap-based OOM action");
return 0;
}
r = oomd_cgroup_kill_mark(m, selected);
r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
log_error_errno(r, "Failed to select any cgroups based on swap: %m");
log_notice_errno(r, "Failed to kill any cgroups based on swap: %m");
else {
if (selected && r > 0) {
log_notice("Marked %s for killing due to memory used (%"PRIu64") / total (%"PRIu64") and "
log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and "
"swap used (%"PRIu64") / total (%"PRIu64") being more than "
PERMYRIAD_AS_PERCENT_FORMAT_STR,
selected->path,
selected,
m->system_context.mem_used, m->system_context.mem_total,
m->system_context.swap_used, m->system_context.swap_total,
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
/* send dbus signal */
(void) sd_bus_emit_signal(m->bus,
"/org/freedesktop/oom1",
"org.freedesktop.oom1.Manager",
"Killed",
"ss",
selected,
"memory-used");
}
return 0;
}
@ -499,7 +500,7 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
else if (r == 1 && !in_post_action_delay) {
OomdCGroupContext *t;
SET_FOREACH(t, targets) {
OomdCGroupContext *selected = NULL;
_cleanup_free_ char *selected = NULL;
/* Check if there was reclaim activity in the given interval. The concern is the following case:
* Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
@ -524,21 +525,14 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
else
clear_candidates = NULL;
r = oomd_select_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates,
r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates,
/* prefix= */ t->path,
/* dry_run= */ m->dry_run,
&selected);
if (r < 0)
return log_error_errno(r, "Failed to select any cgroups based on swap, ignoring: %m");
if (r == 0) {
log_debug("No cgroup candidates found for memory pressure-based OOM action for %s", t->path);
return 0;
}
r = oomd_cgroup_kill_mark(m, selected);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
log_error_errno(r, "Failed to select any cgroups under %s based on pressure, ignoring: %m", t->path);
log_notice_errno(r, "Failed to kill any cgroups under %s based on pressure: %m", t->path);
else {
/* Don't act on all the high pressure cgroups at once; return as soon as we kill one.
* If r == 0 then it means there were not eligible candidates, the candidate cgroup
@ -547,12 +541,21 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
* pressure is still high. */
m->mem_pressure_post_action_delay_start = usec_now;
if (selected && r > 0) {
log_notice("Marked %s for killing due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
" for > %s with reclaim activity",
selected->path, t->path,
selected, t->path,
LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10),
LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit),
FORMAT_TIMESPAN(t->mem_pressure_duration_usec, USEC_PER_SEC));
/* send dbus signal */
(void) sd_bus_emit_signal(m->bus,
"/org/freedesktop/oom1",
"org.freedesktop.oom1.Manager",
"Killed",
"ss",
selected,
"memory-pressure");
}
return 0;
}
@ -650,8 +653,6 @@ Manager* manager_free(Manager *m) {
hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates);
set_free(m->kill_states);
return mfree(m);
}

View File

@ -3,7 +3,6 @@
#include "conf-parser-forward.h"
#include "shared-forward.h"
#include "oomd-conf.h"
#include "oomd-util.h"
/* Polling interval for monitoring stats */
@ -55,9 +54,6 @@ typedef struct Manager {
/* This varlink server object is used to manage systemd-oomd's varlink server which is used by user
* managers to report changes in ManagedOOM settings (oomd server - systemd client). */
sd_varlink_server *varlink_server;
usec_t prekill_timeout;
Set *kill_states; /* currently ongoing OomdKillState operations */
} Manager;
Manager* manager_free(Manager *m);

View File

@ -1,35 +1,23 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include "alloc-util.h"
#include "constants.h"
#include "dirent-util.h"
#include "errno-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "format-util.h"
#include "log.h"
#include "memstream-util.h"
#include "oomd-manager.h"
#include "oomd-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "pidref.h"
#include "procfs-util.h"
#include "sd-bus.h"
#include "set.h"
#include "signal-util.h"
#include "sort-util.h"
#include "stdio-util.h"
#include "string-util.h"
#include "time-util.h"
#include "varlink-util.h"
/* State of one kill operation that has been marked but possibly not carried out yet: the cgroup to
 * kill, the manager the operation is registered with, and the varlink links attached to it. */
typedef struct OomdKillState {
/* Manager whose kill_states set this state is removed from in oomd_kill_state_free() */
Manager *manager;
/* Context of the cgroup to kill; unreferenced via oomd_cgroup_context_unref() when the state is freed */
OomdCGroupContext *ctx;
/* This holds sd_varlink references */
Set *links;
} OomdKillState;
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
oomd_cgroup_ctx_hash_ops,
@ -37,7 +25,7 @@ DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
path_hash_func,
path_compare,
OomdCGroupContext,
oomd_cgroup_context_unref);
oomd_cgroup_context_free);
static int log_kill(const PidRef *pid, int sig, void *userdata) {
log_debug("oomd attempting to kill " PID_FMT " with %s", pid->pid, signal_to_string(sig));
@ -74,7 +62,7 @@ static int increment_oomd_xattr(const char *path, const char *xattr, uint64_t nu
return 0;
}
static OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) {
OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) {
if (!ctx)
return NULL;
@ -82,8 +70,6 @@ static OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) {
return mfree(ctx);
}
DEFINE_TRIVIAL_REF_UNREF_FUNC(OomdCGroupContext, oomd_cgroup_context, oomd_cgroup_context_free);
int oomd_pressure_above(Hashmap *h, Set **ret) {
_cleanup_set_free_ Set *targets = NULL;
OomdCGroupContext *ctx;
@ -245,263 +231,65 @@ int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const cha
return (int) k;
}
int oomd_cgroup_kill(Manager *m, OomdCGroupContext *ctx, bool recurse) {
int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) {
_cleanup_set_free_ Set *pids_killed = NULL;
int r;
assert(ctx);
assert(path);
/* First try to send SIG0 recursively to ensure all child cgroups can be killed. */
if (recurse)
r = cg_kill_recursive(path, /* sig= */ 0, CGROUP_IGNORE_SELF,
/* killed_pids= */ NULL, /* log_kill= */ NULL, /* userdata= */ NULL);
else
r = cg_kill(path, /* sig= */ 0, CGROUP_IGNORE_SELF,
/* killed_pids= */ NULL, /* log_kill= */ NULL, /* userdata= */ NULL);
if (r < 0)
return log_debug_errno(r, "Failed to send SIG0 to processes in cgroup '%s': %m", path);
if (dry_run) {
_cleanup_free_ char *cg_path = NULL;
r = cg_get_path(path, /* suffix= */ NULL, &cg_path);
if (r < 0)
return r;
log_info("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path, true_false(recurse));
return 0;
}
pids_killed = set_new(NULL);
if (!pids_killed)
return -ENOMEM;
r = increment_oomd_xattr(ctx->path, "user.oomd_ooms", 1);
r = increment_oomd_xattr(path, "user.oomd_ooms", 1);
if (r < 0)
log_debug_errno(r, "Failed to set user.oomd_ooms before kill: %m");
if (recurse)
r = cg_kill_recursive(ctx->path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
r = cg_kill_recursive(path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
else
r = cg_kill(ctx->path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
r = cg_kill(path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
/* The cgroup could have been cleaned up after we have sent SIGKILL to all of the processes, but before
* we could do one last iteration of cgroup.procs to check. Or the service unit could have exited and
* was removed between picking candidates and coming into this function. In either case, let's log
* about it let the caller decide what to do once they know how many PIDs were killed. */
if (IN_SET(r, -ENOENT, -ENODEV))
log_debug_errno(r, "Error when sending SIGKILL to processes in cgroup path %s, ignoring: %m", ctx->path);
log_debug_errno(r, "Error when sending SIGKILL to processes in cgroup path %s, ignoring: %m", path);
else if (r < 0)
return r;
if (set_isempty(pids_killed))
log_debug("Nothing killed when attempting to kill %s", ctx->path);
log_debug("Nothing killed when attempting to kill %s", path);
r = increment_oomd_xattr(ctx->path, "user.oomd_kill", set_size(pids_killed));
r = increment_oomd_xattr(path, "user.oomd_kill", set_size(pids_killed));
if (r < 0)
log_debug_errno(r, "Failed to set user.oomd_kill on kill: %m");
/* send dbus signal */
if (m)
(void) sd_bus_emit_signal(m->bus,
"/org/freedesktop/oom1",
"org.freedesktop.oom1.Manager",
"Killed",
"ss",
ctx,
"oom");
return !set_isempty(pids_killed);
}
/* Releases a kill state: frees the set of links, removes the state from the manager's kill_states set,
 * drops the reference held on the cgroup context and frees the structure itself.
 * A NULL pointer is accepted and ignored. */
static void oomd_kill_state_free(OomdKillState *ks) {
if (!ks)
return;
assert(ks->manager);
set_free(ks->links);
/* Keep this before the unref below: the hash ops used for kill_states read ks->ctx->path, and the
 * unref may release the context. */
set_remove(ks->manager->kill_states, ks);
oomd_cgroup_context_unref(ks->ctx);
free(ks);
}
/* Orders two kill states by the path of the cgroup each of them refers to. */
static int oomd_kill_state_compare(const OomdKillState *a, const OomdKillState *b) {
        const char *lhs = a->ctx->path, *rhs = b->ctx->path;

        return path_compare(lhs, rhs);
}
/* Feeds the cgroup path of the kill state into the hash state, so that the hash agrees with
 * oomd_kill_state_compare(), which compares by the same path. */
static void oomd_kill_state_hash_func(const OomdKillState *ks, struct siphash *state) {
        const char *key = ks->ctx->path;

        path_hash_func(key, state);
}
/* Hash operations for sets of OomdKillState objects (used for Manager.kill_states): entries are hashed
 * and compared by cgroup path, and oomd_kill_state_free() is registered as the key destructor. */
DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR(
oomd_kill_state_hash_ops,
OomdKillState,
oomd_kill_state_hash_func,
oomd_kill_state_compare,
oomd_kill_state_free);
/* oomd_kill_state_remove() runs once per prekill hook reply (from prekill_callback(), after that hook's
 * link has been taken out of ks->links) and once more through the cleanup handler in
 * oomd_cgroup_kill_mark(), so that there is at least one call when no hooks exist. While links are still
 * outstanding the call does nothing; once the set is empty the cgroup is killed recursively and the
 * kill state is released. */
static void oomd_kill_state_remove(OomdKillState *ks) {
        assert(ks);
        assert(ks->ctx);

        if (set_isempty(ks->links)) {
                int k;

                k = oomd_cgroup_kill(ks->manager, ks->ctx, /* recurse= */ true);
                if (k < 0)
                        log_debug_errno(k, "Failed to kill cgroup '%s', ignoring: %m", ks->ctx->path);

                oomd_kill_state_free(ks);
        }
}
/* Reply handler bound to every prekill hook link. Logs the outcome reported by the hook, takes the link
 * out of the kill state's set of links, gives oomd_kill_state_remove() the chance to perform the kill
 * and finally drops the reference on the link. Always returns 0. */
static int prekill_callback(
                sd_varlink *link,
                sd_json_variant *parameters,
                const char *error_id,
                sd_varlink_reply_flags_t flags,
                void *userdata) {

        OomdKillState *state = ASSERT_PTR(userdata);

        assert(state);
        assert(state->ctx);

        if (!error_id)
                log_info("oomd prekill hook finished for cgroup %s", state->ctx->path);
        else
                log_warning("oomd prekill hook for %s returned error: %s", state->ctx->path, error_id);

        /* The link has to leave the set first: oomd_kill_state_remove() only acts on an empty set. */
        assert_se(set_remove(state->links, link) == link);
        oomd_kill_state_remove(state);

        /* 'state' must not be touched from here on, it may have been freed by the call above. */
        sd_varlink_unref(link);
        return 0;
}
/* Defines oomd_kill_state_removep(), the cleanup handler used with _cleanup_() in
 * oomd_cgroup_kill_mark(); presumably it calls oomd_kill_state_remove() unless the pointer is NULL. */
DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(OomdKillState *, oomd_kill_state_remove, NULL);
/* Notifies a single prekill hook about the upcoming kill.
 *
 * basename: name of the socket inside VARLINK_DIR_OOMD_PREKILL_HOOK
 * cparams:  parameters passed to the io.systemd.oom.Prekill.Notify call
 * ks:       kill state the new link is attached to (as userdata and as member of ks->links)
 * e:        event loop the link is attached to
 *
 * Returns 0 on success and also when the socket cannot be connected to (that case is only logged);
 * returns a negative value if any later step fails. */
static int send_prekill_message(
const char *basename,
sd_json_variant *cparams,
OomdKillState *ks,
sd_event *e) {
/* Closed and unreferenced automatically on every early return until ownership moves to ks->links. */
_cleanup_(sd_varlink_close_unrefp) sd_varlink *link = NULL;
_cleanup_free_ char *hook_path = NULL;
int r;
assert(basename);
assert(cparams);
assert(e);
assert(ks);
assert(ks->ctx);
assert(ks->manager);
log_info("Invoking oomd prekill hook %s for cgroup %s", basename, ks->ctx->path);
hook_path = path_join(VARLINK_DIR_OOMD_PREKILL_HOOK, basename);
if (!hook_path)
return log_oom_debug();
r = sd_varlink_connect_address(&link, hook_path);
if (r < 0) {
/* Deliberately not an error: an unreachable socket must not prevent the other hooks from running. */
log_debug_errno(r, "Socket '%s' is not connectible, probably stale, ignoring: %m", hook_path);
return 0;
}
/* prekill_callback() receives the kill state through the link's userdata. */
(void) sd_varlink_set_userdata(link, ks);
r = sd_varlink_set_description(link, "oomd prekill hook");
if (r < 0)
return log_debug_errno(r, "Failed to set varlink description: %m");
/* Every link gets the configured PrekillHookTimeoutSec= value as its timeout. */
(void) sd_varlink_set_relative_timeout(link, ks->manager->prekill_timeout);
r = sd_varlink_attach_event(link, e, SD_EVENT_PRIORITY_NORMAL);
if (r < 0)
return log_debug_errno(r, "Failed to attach varlink to event loop: %m");
r = sd_varlink_bind_reply(link, prekill_callback);
if (r < 0)
return log_debug_errno(r, "Failed to bind reply callback: %m");
r = sd_varlink_invoke(link, "io.systemd.oom.Prekill.Notify", cparams);
if (r < 0)
return log_debug_errno(r, "Failed to call varlink method io.systemd.oom.Prekill.Notify: %m");
/* Ownership of the link moves into ks->links here; as long as the set is non-empty
 * oomd_kill_state_remove() will not kill the cgroup. */
/* NOTE(review): any failure here is reported through log_oom_debug(), whatever r actually is. */
r = set_ensure_consume(&ks->links, &varlink_hash_ops, TAKE_PTR(link));
if (r < 0)
return log_oom_debug();
return 0;
}
/* oomd_prekill_hook() sends the prekill notification to every socket found in the
 * VARLINK_DIR_OOMD_PREKILL_HOOK directory. If PrekillHookTimeoutSec= is zero or the directory does not
 * exist, it returns 0 without contacting anything; the kill itself is then carried out by the cleanup
 * handler installed in oomd_cgroup_kill_mark(). Failures of individual hooks are logged and ignored.
 * Returns a negative value if the directory cannot be opened or read, or if the call parameters cannot
 * be built. */
static int oomd_prekill_hook(Manager *m, OomdKillState *ks) {
        _cleanup_closedir_ DIR *dir = NULL;
        _cleanup_(sd_json_variant_unrefp) sd_json_variant *params = NULL;
        int r;

        assert(m);
        assert(ks);
        assert(ks->ctx);

        if (m->prekill_timeout == 0) {
                log_debug("Zero oomd prekill timeout configured, skipping prekill hooks.");
                return 0;
        }

        dir = opendir(VARLINK_DIR_OOMD_PREKILL_HOOK);
        if (!dir) {
                if (errno != ENOENT)
                        return log_debug_errno(errno, "Failed to open prekill varlink socket directory %s: %m",
                                               VARLINK_DIR_OOMD_PREKILL_HOOK);

                log_debug("No prekill varlink socket directory %s, ignoring.", VARLINK_DIR_OOMD_PREKILL_HOOK);
                return 0;
        }

        /* The same parameter object is shared by all hooks. */
        r = sd_json_buildo(&params, SD_JSON_BUILD_PAIR_STRING("cgroup", ks->ctx->path));
        if (r < 0)
                return log_oom_debug();

        FOREACH_DIRENT(entry, dir, return -errno) {
                /* Only sockets are hooks; entries of unknown type are tried as well. */
                if (IN_SET(entry->d_type, DT_SOCK, DT_UNKNOWN)) {
                        r = send_prekill_message(entry->d_name, params, ks, m->event);
                        if (r < 0)
                                log_warning_errno(r, "Failed to send oomd prekill message to %s for cgroup %s, ignoring: %m",
                                                  entry->d_name, ks->ctx->path);
                }
        }

        return 0;
}
/* Marks the cgroup described by ctx for killing.
 *
 * In dry-run mode only a log message is emitted. Otherwise a kill state is allocated, registered in
 * m->kill_states and the prekill hooks are notified. The kill itself is not issued directly from this
 * function body: it happens in oomd_kill_state_remove(), which runs via the cleanup handler when ks
 * goes out of scope and again from prekill_callback() for each hook reply.
 *
 * Returns 0 on success (also when the hooks fail, which is only logged), negative on failure. */
int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx) {
int r;
assert(ctx);
assert(m);
if (m->dry_run) {
_cleanup_free_ char *cg_path = NULL;
r = cg_get_path(ctx->path, /* suffix= */ NULL, &cg_path);
if (r < 0)
return r;
log_info("oomd dry-run: Would have tried to kill %s and all its descendants", cg_path);
return 0;
}
/* From here on every return path runs oomd_kill_state_remove(ks) through the cleanup handler
 * (presumably skipped while ks is NULL). That call kills the cgroup right away if ks->links is empty
 * at that point, i.e. if no hook was successfully contacted. */
_cleanup_(oomd_kill_state_removep) OomdKillState *ks = new(OomdKillState, 1);
if (!ks)
return log_oom_debug();
/* The kill state holds its own reference on ctx; it is dropped in oomd_kill_state_free(). */
*ks = (OomdKillState) {
.manager = m,
.ctx = oomd_cgroup_context_ref(ctx),
};
/* NOTE(review): a return value of 0 is not handled separately. Presumably it means an entry with an
 * equal key (same cgroup path) is already in the set; in that case oomd_kill_state_free() of this
 * state would remove the other entry from the set. Confirm whether that can happen. */
/* NOTE(review): on failure the cleanup handler still performs the kill although an error is returned. */
r = set_ensure_put(&m->kill_states, &oomd_kill_state_hash_ops, ks);
if (r < 0)
return log_oom_debug();
/* Hook problems never prevent the kill; they are only logged. */
r = oomd_prekill_hook(m, ks);
if (r < 0)
log_warning_errno(r, "oomd prekill hook failed for %s, ignoring: %m", ctx->path);
return 0;
}
typedef void (*dump_candidate_func)(const OomdCGroupContext *ctx, FILE *f, const char *prefix);
static int dump_kill_candidates(
@ -539,9 +327,10 @@ static int dump_kill_candidates(
return memstream_dump(LOG_INFO, &m);
}
int oomd_select_by_pgscan_rate(Hashmap *h, const char *prefix, OomdCGroupContext **ret_selected) {
int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected) {
_cleanup_free_ OomdCGroupContext **sorted = NULL;
int r, n, ret = 0;
const OomdCGroupContext *killed = NULL;
int n, r, ret = 0;
assert(h);
assert(ret_selected);
@ -551,31 +340,38 @@ int oomd_select_by_pgscan_rate(Hashmap *h, const char *prefix, OomdCGroupContext
return n;
FOREACH_ARRAY(i, sorted, n) {
OomdCGroupContext *c = *i;
const OomdCGroupContext *c = *i;
/* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure.
* Continue since there might be "avoid" cgroups at the end. */
if (c->pgscan == 0 && c->current_memory_usage == 0)
continue;
/* First try killing recursively to ensure all child cgroups can be killed. */
r = cg_kill_recursive(c->path, /* sig= */ 0, CGROUP_IGNORE_SELF, /* killed_pids= */ NULL,
/* log_kill= */ NULL, /* userdata= */ NULL);
if (r < 0)
continue;
r = oomd_cgroup_kill(c->path, /* recurse= */ true, /* dry_run= */ dry_run);
if (r == -ENOMEM)
return r; /* Treat oom as a hard error */
if (r < 0) {
RET_GATHER(ret, r);
continue; /* Try to find something else to kill */
}
ret = 1;
*ret_selected = c;
ret = r;
r = strdup_to(ret_selected, c->path);
if (r < 0)
return r;
killed = c;
break;
}
(void) dump_kill_candidates(sorted, n, *ret_selected, oomd_dump_memory_pressure_cgroup_context);
(void) dump_kill_candidates(sorted, n, killed, oomd_dump_memory_pressure_cgroup_context);
return ret;
}
int oomd_select_by_swap_usage(Hashmap *h, uint64_t threshold_usage, OomdCGroupContext **ret_selected) {
int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected) {
_cleanup_free_ OomdCGroupContext **sorted = NULL;
int r, n, ret = 0;
const OomdCGroupContext *killed = NULL;
int n, r, ret = 0;
assert(h);
assert(ret_selected);
@ -588,30 +384,36 @@ int oomd_select_by_swap_usage(Hashmap *h, uint64_t threshold_usage, OomdCGroupCo
* no swap usage. Threshold killing only cgroups with more than threshold swap usage. */
FOREACH_ARRAY(i, sorted, n) {
OomdCGroupContext *c = *i;
const OomdCGroupContext *c = *i;
/* Skip over cgroups with not enough swap usage. Don't break since there might be "avoid"
* cgroups at the end. */
if (c->swap_usage <= threshold_usage)
continue;
/* First try killing recursively to ensure all child cgroups can be killed. */
r = cg_kill_recursive(c->path, /* sig= */ 0, CGROUP_IGNORE_SELF, /* killed_pids= */ NULL,
/* log_kill= */ NULL, /* userdata= */ NULL);
if (r < 0)
continue;
r = oomd_cgroup_kill(c->path, /* recurse= */ true, /* dry_run= */ dry_run);
if (r == -ENOMEM)
return r; /* Treat oom as a hard error */
if (r < 0) {
RET_GATHER(ret, r);
continue; /* Try to find something else to kill */
}
ret = 1;
*ret_selected = c;
ret = r;
r = strdup_to(ret_selected, c->path);
if (r < 0)
return r;
killed = c;
break;
}
(void) dump_kill_candidates(sorted, n, *ret_selected, oomd_dump_swap_cgroup_context);
(void) dump_kill_candidates(sorted, n, killed, oomd_dump_swap_cgroup_context);
return ret;
}
int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
_cleanup_(oomd_cgroup_context_unrefp) OomdCGroupContext *ctx = NULL;
_cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
_cleanup_free_ char *p = NULL, *val = NULL;
bool is_root;
int r;
@ -623,15 +425,8 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
if (!ctx)
return -ENOMEM;
*ctx = (OomdCGroupContext) {
.n_ref = 1,
.preference = MANAGED_OOM_PREFERENCE_NONE,
.path = strdup(empty_to_root(path)),
};
if (!ctx->path)
return -ENOMEM;
is_root = empty_or_root(path);
ctx->preference = MANAGED_OOM_PREFERENCE_NONE;
r = cg_get_path(path, "memory.pressure", &p);
if (r < 0)
@ -675,6 +470,10 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
return log_debug_errno(r, "Error converting pgscan value to uint64_t: %m");
}
r = strdup_to(&ctx->path, empty_to_root(path));
if (r < 0)
return r;
*ret = TAKE_PTR(ctx);
return 0;
}
@ -756,7 +555,7 @@ int oomd_system_context_acquire(const char *proc_meminfo_path, OomdSystemContext
}
int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) {
_cleanup_(oomd_cgroup_context_unrefp) OomdCGroupContext *curr_ctx = NULL;
_cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *curr_ctx = NULL;
OomdCGroupContext *old_ctx;
int r;

View File

@ -10,16 +10,12 @@
extern const struct hash_ops oomd_cgroup_ctx_hash_ops;
struct Manager;
typedef struct OomdCGroupContext OomdCGroupContext;
typedef struct OomdSystemContext OomdSystemContext;
typedef struct Manager Manager;
typedef int (oomd_compare_t)(OomdCGroupContext * const *, OomdCGroupContext * const *);
struct OomdCGroupContext {
unsigned n_ref;
char *path;
ResourcePressure memory_pressure;
@ -49,9 +45,8 @@ struct OomdSystemContext {
uint64_t swap_used;
};
OomdCGroupContext *oomd_cgroup_context_ref(OomdCGroupContext *p);
OomdCGroupContext *oomd_cgroup_context_unref(OomdCGroupContext *p);
DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_unref);
OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx);
DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free);
/* All hashmaps used with these functions are expected to be of the form
* key: cgroup paths -> value: OomdCGroupContext. */
@ -124,15 +119,14 @@ int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const cha
int oomd_fetch_cgroup_oom_preference(OomdCGroupContext *ctx, const char *prefix);
/* Returns a negative value on error, 0 if no processes were killed, or 1 if processes were killed. */
int oomd_cgroup_kill(Manager *m, OomdCGroupContext *ctx, bool recurse);
int oomd_cgroup_kill_mark(Manager *m, OomdCGroupContext *ctx);
int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run);
/* The following oomd_kill_by_* functions return 1 if processes were killed, or negative otherwise. */
/* If `prefix` is supplied, only cgroups whose paths start with `prefix` are eligible candidates. Otherwise,
* everything in `h` is a candidate.
* Returns the killed cgroup in ret_selected. */
int oomd_select_by_pgscan_rate(Hashmap *h, const char *prefix, OomdCGroupContext **ret_selected);
int oomd_select_by_swap_usage(Hashmap *h, uint64_t threshold_usage, OomdCGroupContext **ret_selected);
int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected);
int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected);
int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret);
int oomd_system_context_acquire(const char *proc_meminfo_path, OomdSystemContext *ret);

View File

@ -88,7 +88,7 @@ TEST(oomd_cgroup_kill) {
ASSERT_OK(fork_and_sleep(5, &two));
ASSERT_OK(cg_attach(subcgroup, two.pid));
ASSERT_OK_POSITIVE(oomd_cgroup_kill(NULL /* manager */, &(OomdCGroupContext){ .path = subcgroup }, false /* recurse */));
ASSERT_OK_POSITIVE(oomd_cgroup_kill(subcgroup, false /* recurse */, false /* dry run */));
ASSERT_OK(cg_get_xattr(subcgroup, "user.oomd_ooms", &v, /* ret_size= */ NULL));
ASSERT_STREQ(v, i == 0 ? "1" : "2");
@ -115,7 +115,7 @@ TEST(oomd_cgroup_kill) {
TEST(oomd_cgroup_context_acquire_and_insert) {
_cleanup_hashmap_free_ Hashmap *h1 = NULL, *h2 = NULL;
_cleanup_(oomd_cgroup_context_unrefp) OomdCGroupContext *ctx = NULL;
_cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
OomdCGroupContext *c1, *c2;
CGroupMask mask;
@ -138,7 +138,7 @@ TEST(oomd_cgroup_context_acquire_and_insert) {
ASSERT_EQ(ctx->swap_usage, 0u);
ASSERT_EQ(ctx->last_pgscan, 0u);
ASSERT_EQ(ctx->pgscan, 0u);
ASSERT_NULL(ctx = oomd_cgroup_context_unref(ctx));
ASSERT_NULL(ctx = oomd_cgroup_context_free(ctx));
ASSERT_OK(oomd_cgroup_context_acquire("", &ctx));
ASSERT_STREQ(ctx->path, "/");
@ -429,7 +429,7 @@ TEST(oomd_sort_cgroups) {
}
TEST(oomd_fetch_cgroup_oom_preference) {
_cleanup_(oomd_cgroup_context_unrefp) OomdCGroupContext *ctx = NULL;
_cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
ManagedOOMPreference root_pref;
CGroupMask mask;
bool test_xattrs;
@ -464,7 +464,7 @@ TEST(oomd_fetch_cgroup_oom_preference) {
ASSERT_FAIL(oomd_fetch_cgroup_oom_preference(ctx, NULL));
ASSERT_EQ(ctx->preference, MANAGED_OOM_PREFERENCE_NONE);
}
ctx = oomd_cgroup_context_unref(ctx);
ctx = oomd_cgroup_context_free(ctx);
/* also check when only avoid is set to true */
if (test_xattrs) {
@ -473,7 +473,7 @@ TEST(oomd_fetch_cgroup_oom_preference) {
ASSERT_OK(oomd_cgroup_context_acquire(cgroup, &ctx));
ASSERT_OK(oomd_fetch_cgroup_oom_preference(ctx, NULL));
ASSERT_EQ(ctx->preference, geteuid() == 0 ? MANAGED_OOM_PREFERENCE_AVOID : MANAGED_OOM_PREFERENCE_NONE);
ctx = oomd_cgroup_context_unref(ctx);
ctx = oomd_cgroup_context_free(ctx);
}
/* Test the root cgroup */
@ -493,7 +493,7 @@ TEST(oomd_fetch_cgroup_oom_preference) {
/* Assert that avoid/omit are not set if the cgroup and prefix are not
* owned by the same user. */
if (test_xattrs && !empty_or_root(cgroup) && geteuid() == 0) {
ctx = oomd_cgroup_context_unref(ctx);
ctx = oomd_cgroup_context_free(ctx);
ASSERT_OK(cg_set_access(cgroup, 61183, 0));
ASSERT_OK(oomd_cgroup_context_acquire(cgroup, &ctx));

View File

@ -224,7 +224,6 @@ shared_sources = files(
'varlink-io.systemd.Unit.c',
'varlink-io.systemd.UserDatabase.c',
'varlink-io.systemd.oom.c',
'varlink-io.systemd.oom.Prekill.c',
'varlink-io.systemd.service.c',
'varlink-io.systemd.sysext.c',
'varlink-serialize.c',

View File

@ -1,18 +0,0 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include "varlink-io.systemd.oom.Prekill.h"
/* This is a new Varlink interface for pre-kill notifications from oomd.
* It will be available through /run/systemd/oomd.prekill.hook/ */
/* Method "Notify": takes a single string input field "cgroup". */
static SD_VARLINK_DEFINE_METHOD(
Notify,
SD_VARLINK_FIELD_COMMENT("The cgroup which is going to be killed"),
SD_VARLINK_DEFINE_INPUT(cgroup, SD_VARLINK_STRING, 0));
/* Interface "io.systemd.oom.Prekill", consisting of the Notify method only. */
SD_VARLINK_DEFINE_INTERFACE(
io_systemd_oom_Prekill,
"io.systemd.oom.Prekill",
SD_VARLINK_INTERFACE_COMMENT("Prekill notifications from oomd"),
SD_VARLINK_SYMBOL_COMMENT("Notify about an imminent OOM kill"),
&vl_method_Notify);

View File

@ -1,6 +0,0 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
#include "sd-varlink-idl.h"
extern const sd_varlink_interface vl_interface_io_systemd_oom_Prekill;

View File

@ -43,7 +43,6 @@
#include "varlink-io.systemd.Unit.h"
#include "varlink-io.systemd.UserDatabase.h"
#include "varlink-io.systemd.oom.h"
#include "varlink-io.systemd.oom.Prekill.h"
#include "varlink-io.systemd.service.h"
#include "varlink-io.systemd.sysext.h"
#include "varlink-org.varlink.service.h"
@ -207,7 +206,6 @@ TEST(parse_format) {
&vl_interface_io_systemd_Unit,
&vl_interface_io_systemd_UserDatabase,
&vl_interface_io_systemd_oom,
&vl_interface_io_systemd_oom_Prekill,
&vl_interface_io_systemd_service,
&vl_interface_io_systemd_sysext,
&vl_interface_org_varlink_service,

View File

@ -353,35 +353,6 @@ EOF
systemctl reset-failed
}
# Exercises the oomd prekill hooks with zero, one and several hook sockets.
# In every scenario the test service is expected NOT to start successfully (presumably because oomd
# kills it): "! systemctl start --wait ... || exit 1" exits if the start succeeds.
testcase_prekill_hook() {
# Enable the hooks by configuring a non-zero timeout.
cat >/run/systemd/oomd.conf.d/99-oomd-prekill-test.conf <<'EOF'
[OOM]
PrekillHookTimeoutSec=3s
EOF
# no hooks
systemctl reload systemd-oomd.service
! systemctl start --wait TEST-55-OOMD-testbloat.service || exit 1
# one hook
mkdir -p /run/systemd/oomd.prekill.hook/
# ncat listens on a unix socket in the hook directory and records whatever it receives.
# NOTE(review): the background listeners are not terminated anywhere in this function.
ncat --recv-only -kUl /run/systemd/oomd.prekill.hook/althook >/tmp/oomd_event.json &
! systemctl start --wait TEST-55-OOMD-testbloat.service || exit 1
# The recorded message must be a call of the Notify method.
[[ $(jq -r .method </tmp/oomd_event.json) = 'io.systemd.oom.Prekill.Notify' ]]
rm -f /run/systemd/oomd.prekill.hook/* /tmp/oomd_event.json
# many hooks
for i in {1..4}; do
ncat --recv-only -kUl "/run/systemd/oomd.prekill.hook/althook$i" >"/tmp/oomd_event$i.json" &
done
! systemctl start --wait TEST-55-OOMD-testbloat.service || exit 1
# Every one of the hooks must have received the notification.
for j in /tmp/oomd_event*.json; do
[[ $(jq -r .method <"$j") = 'io.systemd.oom.Prekill.Notify' ]]
done
}
run_testcases
touch /testok