Compare commits

...

3 Commits

Author SHA1 Message Date
Daan De Meyer 1b678d1cbd
Merge 8f92f05192 into 8e7ef6abb8 2025-04-18 03:46:25 +01:00
Daan De Meyer 8f92f05192 core: Disable pid namespacing for control processes
PID namespaces frankly don't make any sense for control processes, so
let's gracefully degrade to no pid namespaces for control processes.
2025-04-17 21:11:06 +02:00
Daan De Meyer abc6842c2a core: Make sure we handle DelegateSubgroup= in combo with cgroupns
Currently, if we use a cgroup namespace together with DelegateSubgroup=,
the subgroup becomes the root of the cgroup namespace because we move the
service process to the subgroup before we unshare the cgroup namespace, and
the current cgroup becomes the root of the cgroup namespace when we unshare
the cgroup namespace.

Let's fix the problem by not moving the service process to the subgroup until
we've unshared the cgroup namespace. To make this work we have to fix both the
CLONE_INTO_CGROUP case and the non-CLONE_INTO_CGROUP case. This doesn't break the
primary use case of CLONE_INTO_CGROUP: we still use it to clone immediately
into the service cgroup, just no longer into the subgroup, which shouldn't
matter in practice.

Additionally, we need special handling for control processes, as those *do*
need to be spawned into the subcgroup immediately if delegation is configured, to
avoid violating the cgroup v2 "no inner processes" rule. At the same time, we
opt to disable cgroup namespacing for control processes, as the root of the cgroup
namespace for those would become the .control subcgroup, which doesn't really make
any sense.
2025-04-17 21:11:04 +02:00
7 changed files with 183 additions and 52 deletions

3
TODO
View File

@ -128,6 +128,9 @@ Deprecations and removals:
Features:
* pid1: Maybe we should run control processes in the same pidns/cgroupns as the
main pid if it's still alive?
* loginctl: show argv[] of "leader" process in tabular list-sessions output
* loginctl: show "service identifier" in tabular list-sessions output, to make

View File

@ -1188,7 +1188,8 @@ static int setup_pam(
gid_t gid,
char ***env, /* updated on success */
const int fds[], size_t n_fds,
int exec_fd) {
int exec_fd,
PidRef *ret_pidref) {
#if HAVE_PAM
AskPasswordConvData conv_data = {
@ -1209,6 +1210,7 @@ static int setup_pam(
int pam_code = PAM_SUCCESS, r;
bool close_session = false;
pid_t parent_pid;
PidRef child_pidref;
int flags = 0;
assert(context);
@ -1287,7 +1289,7 @@ static int setup_pam(
parent_pid = getpid_cached();
r = safe_fork("(sd-pam)", 0, NULL);
r = pidref_safe_fork("(sd-pam)", 0, &child_pidref);
if (r < 0)
goto fail;
if (r == 0) {
@ -1373,6 +1375,9 @@ static int setup_pam(
if (!barrier_place_and_sync(&barrier))
log_error("PAM initialization failed");
if (ret_pidref)
*ret_pidref = TAKE_PIDREF(child_pidref);
return strv_free_and_replace(*env, e);
fail:
@ -3456,7 +3461,7 @@ static int apply_mount_namespace(
/* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
* service will need to write to it in order to start the notifications. */
if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
read_write_paths_cleanup = strv_copy(context->read_write_paths);
if (!read_write_paths_cleanup)
return -ENOMEM;
@ -3601,7 +3606,7 @@ static int apply_mount_namespace(
* sandbox inside the mount namespace. */
.ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
.protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
.protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO,
.protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
.protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
.protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
@ -3609,7 +3614,7 @@ static int apply_mount_namespace(
.private_dev = needs_sandboxing && context->private_devices,
.private_network = needs_sandboxing && exec_needs_network_namespace(context),
.private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
.private_pids = needs_sandboxing && exec_needs_pid_namespace(context) ? context->private_pids : PRIVATE_PIDS_NO,
.private_pids = needs_sandboxing && exec_needs_pid_namespace(context, params) ? context->private_pids : PRIVATE_PIDS_NO,
.private_tmp = needs_sandboxing ? context->private_tmp : PRIVATE_TMP_NO,
.mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
@ -4220,7 +4225,7 @@ static void log_command_line(
LOG_EXEC_INVOCATION_ID(params));
}
static bool exec_context_needs_cap_sys_admin(const ExecContext *context) {
static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
assert(context);
return context->private_users != PRIVATE_USERS_NO ||
@ -4239,11 +4244,11 @@ static bool exec_context_needs_cap_sys_admin(const ExecContext *context) {
!strv_isempty(context->extension_directories) ||
context->protect_system != PROTECT_SYSTEM_NO ||
context->protect_home != PROTECT_HOME_NO ||
exec_needs_pid_namespace(context) ||
exec_needs_pid_namespace(context, params) ||
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
exec_needs_cgroup_mount(context) ||
exec_needs_cgroup_mount(context, params) ||
context->protect_clock ||
context->protect_hostname != PROTECT_HOSTNAME_NO ||
!strv_isempty(context->read_write_paths) ||
@ -4284,7 +4289,7 @@ static bool exec_namespace_is_delegated(
/* If we need unprivileged private users, we've already unshared a user namespace by the time we call
* setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
* unsharing in the first call to setup_delegated_namespaces() by returning false here. */
if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context))
if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params))
return false;
if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
@ -4379,7 +4384,7 @@ static int setup_delegated_namespaces(
log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
}
if (needs_sandboxing && exec_needs_cgroup_namespace(context) &&
if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) &&
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
if (unshare(CLONE_NEWCGROUP) < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
@ -4391,7 +4396,7 @@ static int setup_delegated_namespaces(
/* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
* Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
if (needs_sandboxing && exec_needs_pid_namespace(context) &&
if (needs_sandboxing && exec_needs_pid_namespace(context, params) &&
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) {
if (params->pidref_transport_fd < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
@ -4641,6 +4646,49 @@ static void prepare_terminal(
(void) osc_context_open_service(p->unit_id, p->invocation_id, /* ret_seq= */ NULL);
}
static int cg_subgroup_attach_pidref(
const ExecContext *context,
const CGroupContext *cgroup_context,
const ExecParameters *params,
const char *prefix,
const PidRef *pidref,
int *reterr_exit_status) {
_cleanup_free_ char *subgroup = NULL;
int r;
assert(context);
assert(cgroup_context);
assert(params);
assert(reterr_exit_status);
r = exec_params_get_cgroup_path(params, cgroup_context, prefix, &subgroup);
if (r < 0) {
*reterr_exit_status = EXIT_CGROUP;
return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
}
if (r == 0)
return 0;
r = cg_attach(subgroup, pidref ? pidref->pid : 0);
if (r == -EUCLEAN) {
*reterr_exit_status = EXIT_CGROUP;
return log_exec_error_errno(context, params, r,
"Failed to attach process " PID_FMT " to cgroup '%s', "
"because the cgroup or one of its parents or "
"siblings is in the threaded mode.",
pidref ? pidref->pid : getpid_cached(), subgroup);
}
if (r < 0) {
*reterr_exit_status = EXIT_CGROUP;
return log_exec_error_errno(context, params, r,
"Failed to attach process " PID_FMT " to cgroup %s: %m",
pidref ? pidref->pid : getpid_cached(), subgroup);
}
return 0;
}
int exec_invoke(
const ExecCommand *command,
const ExecContext *context,
@ -4956,28 +5004,37 @@ int exec_invoke(
if (socket_fd >= 0)
(void) fd_nonblock(socket_fd, false);
/* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
* from it. */
needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
/* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
* Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
if (params->cgroup_path) {
_cleanup_free_ char *p = NULL;
r = exec_params_get_cgroup_path(params, cgroup_context, &p);
r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
}
r = cg_attach(p, 0);
/* We cannot spawn the main service process into the subcgroup as it needs to unshare the
* cgroup namespace first if one is configured to make sure the root of the cgroup namespace
* is the service cgroup and not the subcgroup. */
const char *cgtarget = needs_sandboxing && exec_needs_cgroup_namespace(context, params) ? params->cgroup_path : p;
r = cg_attach(cgtarget, 0);
if (r == -EUCLEAN) {
*exit_status = EXIT_CGROUP;
return log_exec_error_errno(context, params, r,
"Failed to attach process to cgroup '%s', "
"because the cgroup or one of its parents or "
"siblings is in the threaded mode.", p);
"siblings is in the threaded mode.", cgtarget);
}
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", cgtarget);
}
}
@ -5177,10 +5234,6 @@ int exec_invoke(
}
}
/* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
* from it. */
needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
if (params->cgroup_path) {
/* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
* this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
@ -5196,7 +5249,7 @@ int exec_invoke(
return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
}
r = exec_params_get_cgroup_path(params, cgroup_context, &p);
r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
@ -5228,7 +5281,7 @@ int exec_invoke(
* to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
* pressure path environment variable or read-write mount to the unit. This is why we check if
* memory_pressure_path != NULL in the conditional below. */
if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) {
if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
memory_pressure_path = mfree(memory_pressure_path);
r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
if (r < 0) {
@ -5371,12 +5424,20 @@ int exec_invoke(
* wins here. (See above.) */
/* All fds passed in the fds array will be closed in the pam child process. */
r = setup_pam(context, params, username, uid, gid, &accum_env, params->fds, n_fds, params->exec_fd);
PidRef pam_pidref;
r = setup_pam(context, params, username, uid, gid, &accum_env, params->fds, n_fds, params->exec_fd, &pam_pidref);
if (r < 0) {
*exit_status = EXIT_PAM;
return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
}
if (params->cgroup_path) {
/* Move PAM into subgroup immediately if one is configured. */
r = cg_subgroup_attach_pidref(context, cgroup_context, params, params->cgroup_path, &pam_pidref, exit_status);
if (r < 0)
return r;
}
/* PAM modules might have set some ambient caps. Query them here and merge them into
* the caps we want to set in the end, so that we don't end up unsetting them. */
uint64_t ambient_after_pam;
@ -5395,7 +5456,7 @@ int exec_invoke(
}
}
if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context)) {
if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) {
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
* set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
@ -5498,6 +5559,23 @@ int exec_invoke(
if (r < 0)
return r;
/* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
* ensures the root of the cgroup namespace is the top level service cgroup and not the
* subcgroup. Don't do this for control processes that are spawned immediately into a
* subcgroup, as those are already in the right place. */
if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) && params->cgroup_path) {
r = cg_subgroup_attach_pidref(
context,
cgroup_context,
params,
/* Adjust the prefix accordingly since we're in a cgroup namespace now. */
/* prefix= */ NULL,
/* pidref= */ NULL,
exit_status);
if (r < 0)
return r;
}
/* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
* shall execute. */

View File

@ -236,9 +236,15 @@ static bool needs_cgroup_namespace(ProtectControlGroups i) {
return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT);
}
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context) {
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) {
assert(context);
/* CGroup namespaces don't really make sense for control processes and can't really be supported as
* (some) control processes need to be spawned directly into a subcgroup to avoid violating the
* "no inner processes" rule of cgroupv2, so don't do any cgroup namespacing for control processes. */
if (params && needs_cgroup_namespace(context->protect_control_groups) && FLAGS_SET(params->flags, EXEC_IS_CONTROL))
return PROTECT_CONTROL_GROUPS_YES;
/* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually
* use cgroup namespace, we ignore the setting and do not unshare the namespace.
* ProtectControlGroups=private and strict get downgraded to no and yes respectively. This ensures
@ -252,27 +258,31 @@ ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context)
return context->protect_control_groups;
}
bool exec_needs_cgroup_namespace(const ExecContext *context) {
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
assert(context);
return needs_cgroup_namespace(exec_get_protect_control_groups(context));
return needs_cgroup_namespace(exec_get_protect_control_groups(context, params));
}
bool exec_needs_cgroup_mount(const ExecContext *context) {
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) {
assert(context);
return exec_get_protect_control_groups(context) != PROTECT_CONTROL_GROUPS_NO;
return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO;
}
bool exec_is_cgroup_mount_read_only(const ExecContext *context) {
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) {
assert(context);
return IN_SET(exec_get_protect_control_groups(context), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
}
bool exec_needs_pid_namespace(const ExecContext *context) {
bool exec_needs_pid_namespace(const ExecContext *context, const ExecParameters *params) {
assert(context);
/* PID namespaces don't really make sense for control processes so let's not use them for those. */
if (params && FLAGS_SET(params->flags, EXEC_IS_CONTROL))
return false;
return context->private_pids != PRIVATE_PIDS_NO && ns_type_supported(NAMESPACE_PID);
}
@ -325,11 +335,11 @@ bool exec_needs_mount_namespace(
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
exec_needs_cgroup_mount(context) ||
exec_needs_cgroup_mount(context, params) ||
context->protect_proc != PROTECT_PROC_DEFAULT ||
context->proc_subset != PROC_SUBSET_ALL ||
exec_needs_ipc_namespace(context) ||
exec_needs_pid_namespace(context))
exec_needs_pid_namespace(context, params))
return true;
if (context->root_directory) {
@ -399,20 +409,25 @@ bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType typ
return true;
}
/* Returns true only if EXEC_CGROUP_DELEGATE, EXEC_CONTROL_CGROUP and EXEC_IS_CONTROL are all set in
 * params->flags. exec_spawn() uses the result to pick the subcgroup path rather than
 * params->cgroup_path as the cgroup the new process is spawned into.
 *
 * The result is a pure predicate, so it is typed 'bool' like the other exec_needs_*() helpers
 * rather than 'int'. */
static bool exec_params_needs_control_subcgroup(const ExecParameters *params) {
        assert(params);

        return FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) &&
                FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) &&
                FLAGS_SET(params->flags, EXEC_IS_CONTROL);
}
int exec_params_get_cgroup_path(
const ExecParameters *params,
const CGroupContext *c,
const char *prefix,
char **ret) {
const char *subgroup = NULL;
char *p;
assert(params);
assert(c);
assert(ret);
if (!params->cgroup_path)
return -EINVAL;
/* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
* subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
* processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
@ -430,9 +445,9 @@ int exec_params_get_cgroup_path(
}
if (subgroup)
p = path_join(params->cgroup_path, subgroup);
p = path_join(prefix, subgroup);
else
p = strdup(params->cgroup_path);
p = strdup(strempty(prefix));
if (!p)
return -ENOMEM;
@ -501,7 +516,7 @@ int exec_spawn(
log_command_line(unit, "About to execute", command->path, command->argv);
if (params->cgroup_path) {
r = exec_params_get_cgroup_path(params, cgroup_context, &subcgroup_path);
r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup_path);
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
if (r > 0) {
@ -514,6 +529,13 @@ int exec_spawn(
}
}
/* We cannot spawn the main service process into the subcgroup as it might need to unshare the cgroup
* namespace first if one is configured to make sure the root of the cgroup namespace is the service
* cgroup and not the subcgroup. However, when running control commands on a live service, the
* commands have to be spawned inside a subcgroup, otherwise we violate the no inner processes rule
* of cgroupv2. */
const char *cgtarget = exec_params_needs_control_subcgroup(params) ? subcgroup_path : params->cgroup_path;
/* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the
* child's memory.max, serialize all the state needed to start the unit, and pass it to the
* systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec
@ -576,24 +598,24 @@ int exec_spawn(
"--log-level", max_log_levels,
"--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
environ,
subcgroup_path,
cgtarget,
&pidref);
/* Drop the ambient set again, so no processes other than sd-executor spawned from the manager inherit it. */
(void) capability_ambient_set_apply(0, /* also_inherit= */ false);
if (r == -EUCLEAN && subcgroup_path)
if (r == -EUCLEAN && cgtarget)
return log_unit_error_errno(unit, r,
"Failed to spawn process into cgroup '%s', because the cgroup "
"or one of its parents or siblings is in the threaded mode.",
subcgroup_path);
cgtarget);
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
/* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
* executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
* process will be killed too). */
if (r == 0 && subcgroup_path)
(void) cg_attach(subcgroup_path, pidref.pid);
if (r == 0 && cgtarget)
(void) cg_attach(cgtarget, pidref.pid);
/* r > 0: Already in the right cgroup thanks to CLONE_INTO_CGROUP */
log_unit_debug(unit, "Forked %s as " PID_FMT " (%s CLONE_INTO_CGROUP)",

View File

@ -585,7 +585,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free);
ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);
void exec_runtime_clear(ExecRuntime *rt);
int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, char **ret);
int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, const char *prefix, char **ret);
void exec_params_shallow_clear(ExecParameters *p);
void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix);
void exec_params_deep_clear(ExecParameters *p);
@ -629,12 +629,12 @@ ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_;
bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime);
bool exec_needs_network_namespace(const ExecContext *context);
bool exec_needs_ipc_namespace(const ExecContext *context);
bool exec_needs_pid_namespace(const ExecContext *context);
bool exec_needs_pid_namespace(const ExecContext *context, const ExecParameters *params);
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context);
bool exec_needs_cgroup_namespace(const ExecContext *context);
bool exec_needs_cgroup_mount(const ExecContext *context);
bool exec_is_cgroup_mount_read_only(const ExecContext *context);
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params);
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params);
const char* exec_get_private_notify_socket_path(const ExecContext *context, const ExecParameters *params, bool needs_sandboxing);

View File

@ -711,7 +711,7 @@ static int service_verify(Service *s) {
if (s->type == SERVICE_DBUS && !s->bus_name)
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing.");
if (s->type == SERVICE_FORKING && exec_needs_pid_namespace(&s->exec_context))
if (s->type == SERVICE_FORKING && exec_needs_pid_namespace(&s->exec_context, /* params= */ NULL))
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service of Type=forking does not support PrivatePIDs=yes. Refusing.");
if (s->usb_function_descriptors && !s->usb_function_strings)

View File

@ -4181,7 +4181,7 @@ static int unit_verify_contexts(const Unit *u) {
exec_needs_mount_namespace(ec, /* params = */ NULL, /* runtime = */ NULL))
return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "WorkingDirectory= may not be below /proc/, /sys/ or /dev/ when using mount namespacing. Refusing.");
if (exec_needs_pid_namespace(ec) && !UNIT_VTABLE(u)->notify_pidref)
if (exec_needs_pid_namespace(ec, /* params= */ NULL) && !UNIT_VTABLE(u)->notify_pidref)
return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "PrivatePIDs= setting is only supported for service units. Refusing.");
const KillContext *kc = unit_get_kill_context(u);

View File

@ -104,4 +104,32 @@ testcase_basic_strict() {
test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG"
}
testcase_delegate_subgroup() {
# Make sure the service cgroup is the root of the cgroup namespace when we use DelegateSubgroup.
#
# The transient unit combines ProtectControlGroupsEx=private with Delegate=yes and
# DelegateSubgroup=supervisor, and its payload lists /sys/fs/cgroup/supervisor. That path is only
# expected to exist if "supervisor" shows up as a child of the namespace root, i.e. if the
# namespace root is the service cgroup rather than the subgroup itself.
# NOTE(review): presumably relies on "systemd-run --wait" propagating the exit status of ls and on
# the test harness treating a non-zero status as failure — confirm.
systemd-run \
-p ProtectControlGroupsEx=private \
-p PrivateMounts=yes \
-p Delegate=yes \
-p DelegateSubgroup=supervisor \
--wait \
--pipe \
ls /sys/fs/cgroup/supervisor
}
testcase_delegate_subgroup_control() {
# Make sure control processes are not namespaced and are still put in the .control cgroup.
#
# The ExecStartPre= command prints its own /proc/self/cgroup and the main command ("true")
# prints nothing, so the captured output is the control process' cgroup membership. The expected
# value is the full path below system.slice ending in "/.control": a full path rather than one
# relative to a namespace root indicates that no cgroup namespace was applied, and the trailing
# component shows the process was placed in the .control subcgroup.
# The unit name is fixed with --unit so that the expected path is predictable.
assert_eq "$(
systemd-run \
-p ProtectControlGroupsEx=private \
-p PrivateMounts=yes \
-p Delegate=yes \
-p DelegateSubgroup=supervisor \
-p ExecStartPre="cat /proc/self/cgroup" \
--unit delegate-subgroup-control \
--wait \
--pipe \
true
)" "0::/system.slice/delegate-subgroup-control.service/.control"
}
run_testcases