Compare commits
3 Commits
20f9009249
...
1b678d1cbd
Author | SHA1 | Date |
---|---|---|
![]() |
1b678d1cbd | |
![]() |
8f92f05192 | |
![]() |
abc6842c2a |
3
TODO
3
TODO
|
@ -128,6 +128,9 @@ Deprecations and removals:
|
|||
|
||||
Features:
|
||||
|
||||
* pid1: Maybe we should run control processes in the same pidns/cgroupns as the
|
||||
main pid if it's still alive?
|
||||
|
||||
* loginctl: show argv[] of "leader" process in tabular list-sessions output
|
||||
|
||||
* loginctl: show "service identifier" in tabular list-sessions output, to make
|
||||
|
|
|
@ -1188,7 +1188,8 @@ static int setup_pam(
|
|||
gid_t gid,
|
||||
char ***env, /* updated on success */
|
||||
const int fds[], size_t n_fds,
|
||||
int exec_fd) {
|
||||
int exec_fd,
|
||||
PidRef *ret_pidref) {
|
||||
|
||||
#if HAVE_PAM
|
||||
AskPasswordConvData conv_data = {
|
||||
|
@ -1209,6 +1210,7 @@ static int setup_pam(
|
|||
int pam_code = PAM_SUCCESS, r;
|
||||
bool close_session = false;
|
||||
pid_t parent_pid;
|
||||
PidRef child_pidref;
|
||||
int flags = 0;
|
||||
|
||||
assert(context);
|
||||
|
@ -1287,7 +1289,7 @@ static int setup_pam(
|
|||
|
||||
parent_pid = getpid_cached();
|
||||
|
||||
r = safe_fork("(sd-pam)", 0, NULL);
|
||||
r = pidref_safe_fork("(sd-pam)", 0, &child_pidref);
|
||||
if (r < 0)
|
||||
goto fail;
|
||||
if (r == 0) {
|
||||
|
@ -1373,6 +1375,9 @@ static int setup_pam(
|
|||
if (!barrier_place_and_sync(&barrier))
|
||||
log_error("PAM initialization failed");
|
||||
|
||||
if (ret_pidref)
|
||||
*ret_pidref = TAKE_PIDREF(child_pidref);
|
||||
|
||||
return strv_free_and_replace(*env, e);
|
||||
|
||||
fail:
|
||||
|
@ -3456,7 +3461,7 @@ static int apply_mount_namespace(
|
|||
|
||||
/* We need to make the pressure path writable even if /sys/fs/cgroup is made read-only, as the
|
||||
* service will need to write to it in order to start the notifications. */
|
||||
if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
|
||||
if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
|
||||
read_write_paths_cleanup = strv_copy(context->read_write_paths);
|
||||
if (!read_write_paths_cleanup)
|
||||
return -ENOMEM;
|
||||
|
@ -3601,7 +3606,7 @@ static int apply_mount_namespace(
|
|||
* sandbox inside the mount namespace. */
|
||||
.ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
|
||||
|
||||
.protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
|
||||
.protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO,
|
||||
.protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
|
||||
.protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
|
||||
.protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
|
||||
|
@ -3609,7 +3614,7 @@ static int apply_mount_namespace(
|
|||
.private_dev = needs_sandboxing && context->private_devices,
|
||||
.private_network = needs_sandboxing && exec_needs_network_namespace(context),
|
||||
.private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
|
||||
.private_pids = needs_sandboxing && exec_needs_pid_namespace(context) ? context->private_pids : PRIVATE_PIDS_NO,
|
||||
.private_pids = needs_sandboxing && exec_needs_pid_namespace(context, params) ? context->private_pids : PRIVATE_PIDS_NO,
|
||||
.private_tmp = needs_sandboxing ? context->private_tmp : PRIVATE_TMP_NO,
|
||||
|
||||
.mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
|
||||
|
@ -4220,7 +4225,7 @@ static void log_command_line(
|
|||
LOG_EXEC_INVOCATION_ID(params));
|
||||
}
|
||||
|
||||
static bool exec_context_needs_cap_sys_admin(const ExecContext *context) {
|
||||
static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
|
||||
assert(context);
|
||||
|
||||
return context->private_users != PRIVATE_USERS_NO ||
|
||||
|
@ -4239,11 +4244,11 @@ static bool exec_context_needs_cap_sys_admin(const ExecContext *context) {
|
|||
!strv_isempty(context->extension_directories) ||
|
||||
context->protect_system != PROTECT_SYSTEM_NO ||
|
||||
context->protect_home != PROTECT_HOME_NO ||
|
||||
exec_needs_pid_namespace(context) ||
|
||||
exec_needs_pid_namespace(context, params) ||
|
||||
context->protect_kernel_tunables ||
|
||||
context->protect_kernel_modules ||
|
||||
context->protect_kernel_logs ||
|
||||
exec_needs_cgroup_mount(context) ||
|
||||
exec_needs_cgroup_mount(context, params) ||
|
||||
context->protect_clock ||
|
||||
context->protect_hostname != PROTECT_HOSTNAME_NO ||
|
||||
!strv_isempty(context->read_write_paths) ||
|
||||
|
@ -4284,7 +4289,7 @@ static bool exec_namespace_is_delegated(
|
|||
/* If we need unprivileged private users, we've already unshared a user namespace by the time we call
|
||||
* setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
|
||||
* unsharing in the first call to setup_delegated_namespaces() by returning false here. */
|
||||
if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context))
|
||||
if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params))
|
||||
return false;
|
||||
|
||||
if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
|
||||
|
@ -4379,7 +4384,7 @@ static int setup_delegated_namespaces(
|
|||
log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
|
||||
}
|
||||
|
||||
if (needs_sandboxing && exec_needs_cgroup_namespace(context) &&
|
||||
if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) &&
|
||||
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
|
||||
if (unshare(CLONE_NEWCGROUP) < 0) {
|
||||
*reterr_exit_status = EXIT_NAMESPACE;
|
||||
|
@ -4391,7 +4396,7 @@ static int setup_delegated_namespaces(
|
|||
|
||||
/* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
|
||||
* Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
|
||||
if (needs_sandboxing && exec_needs_pid_namespace(context) &&
|
||||
if (needs_sandboxing && exec_needs_pid_namespace(context, params) &&
|
||||
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) {
|
||||
if (params->pidref_transport_fd < 0) {
|
||||
*reterr_exit_status = EXIT_NAMESPACE;
|
||||
|
@ -4641,6 +4646,49 @@ static void prepare_terminal(
|
|||
(void) osc_context_open_service(p->unit_id, p->invocation_id, /* ret_seq= */ NULL);
|
||||
}
|
||||
|
||||
static int cg_subgroup_attach_pidref(
|
||||
const ExecContext *context,
|
||||
const CGroupContext *cgroup_context,
|
||||
const ExecParameters *params,
|
||||
const char *prefix,
|
||||
const PidRef *pidref,
|
||||
int *reterr_exit_status) {
|
||||
|
||||
_cleanup_free_ char *subgroup = NULL;
|
||||
int r;
|
||||
|
||||
assert(context);
|
||||
assert(cgroup_context);
|
||||
assert(params);
|
||||
assert(reterr_exit_status);
|
||||
|
||||
r = exec_params_get_cgroup_path(params, cgroup_context, prefix, &subgroup);
|
||||
if (r < 0) {
|
||||
*reterr_exit_status = EXIT_CGROUP;
|
||||
return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
|
||||
}
|
||||
if (r == 0)
|
||||
return 0;
|
||||
|
||||
r = cg_attach(subgroup, pidref ? pidref->pid : 0);
|
||||
if (r == -EUCLEAN) {
|
||||
*reterr_exit_status = EXIT_CGROUP;
|
||||
return log_exec_error_errno(context, params, r,
|
||||
"Failed to attach process " PID_FMT " to cgroup '%s', "
|
||||
"because the cgroup or one of its parents or "
|
||||
"siblings is in the threaded mode.",
|
||||
pidref ? pidref->pid : getpid_cached(), subgroup);
|
||||
}
|
||||
if (r < 0) {
|
||||
*reterr_exit_status = EXIT_CGROUP;
|
||||
return log_exec_error_errno(context, params, r,
|
||||
"Failed to attach process " PID_FMT " to cgroup %s: %m",
|
||||
pidref ? pidref->pid : getpid_cached(), subgroup);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int exec_invoke(
|
||||
const ExecCommand *command,
|
||||
const ExecContext *context,
|
||||
|
@ -4956,28 +5004,37 @@ int exec_invoke(
|
|||
if (socket_fd >= 0)
|
||||
(void) fd_nonblock(socket_fd, false);
|
||||
|
||||
/* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
|
||||
* from it. */
|
||||
needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
|
||||
|
||||
/* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
|
||||
* Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
|
||||
if (params->cgroup_path) {
|
||||
_cleanup_free_ char *p = NULL;
|
||||
|
||||
r = exec_params_get_cgroup_path(params, cgroup_context, &p);
|
||||
r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_CGROUP;
|
||||
return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
|
||||
}
|
||||
|
||||
r = cg_attach(p, 0);
|
||||
/* We cannot spawn the main service process into the subcgroup as it needs to unshare the
|
||||
* cgroup namespace first if one is configured to make sure the root of the cgroup namespace
|
||||
* is the service cgroup and not the subcgroup. */
|
||||
const char *cgtarget = needs_sandboxing && exec_needs_cgroup_namespace(context, params) ? params->cgroup_path : p;
|
||||
|
||||
r = cg_attach(cgtarget, 0);
|
||||
if (r == -EUCLEAN) {
|
||||
*exit_status = EXIT_CGROUP;
|
||||
return log_exec_error_errno(context, params, r,
|
||||
"Failed to attach process to cgroup '%s', "
|
||||
"because the cgroup or one of its parents or "
|
||||
"siblings is in the threaded mode.", p);
|
||||
"siblings is in the threaded mode.", cgtarget);
|
||||
}
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_CGROUP;
|
||||
return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
|
||||
return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", cgtarget);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -5177,10 +5234,6 @@ int exec_invoke(
|
|||
}
|
||||
}
|
||||
|
||||
/* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
|
||||
* from it. */
|
||||
needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
|
||||
|
||||
if (params->cgroup_path) {
|
||||
/* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
|
||||
* this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
|
||||
|
@ -5196,7 +5249,7 @@ int exec_invoke(
|
|||
return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
|
||||
}
|
||||
|
||||
r = exec_params_get_cgroup_path(params, cgroup_context, &p);
|
||||
r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_CGROUP;
|
||||
return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
|
||||
|
@ -5228,7 +5281,7 @@ int exec_invoke(
|
|||
* to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
|
||||
* pressure path environment variable or read-write mount to the unit. This is why we check if
|
||||
* memory_pressure_path != NULL in the conditional below. */
|
||||
if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) {
|
||||
if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
|
||||
memory_pressure_path = mfree(memory_pressure_path);
|
||||
r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
|
||||
if (r < 0) {
|
||||
|
@ -5371,12 +5424,20 @@ int exec_invoke(
|
|||
* wins here. (See above.) */
|
||||
|
||||
/* All fds passed in the fds array will be closed in the pam child process. */
|
||||
r = setup_pam(context, params, username, uid, gid, &accum_env, params->fds, n_fds, params->exec_fd);
|
||||
PidRef pam_pidref;
|
||||
r = setup_pam(context, params, username, uid, gid, &accum_env, params->fds, n_fds, params->exec_fd, &pam_pidref);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_PAM;
|
||||
return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
|
||||
}
|
||||
|
||||
if (params->cgroup_path) {
|
||||
/* Move PAM into subgroup immediately if one is configured. */
|
||||
r = cg_subgroup_attach_pidref(context, cgroup_context, params, params->cgroup_path, &pam_pidref, exit_status);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
/* PAM modules might have set some ambient caps. Query them here and merge them into
|
||||
* the caps we want to set in the end, so that we don't end up unsetting them. */
|
||||
uint64_t ambient_after_pam;
|
||||
|
@ -5395,7 +5456,7 @@ int exec_invoke(
|
|||
}
|
||||
}
|
||||
|
||||
if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context)) {
|
||||
if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) {
|
||||
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
|
||||
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
|
||||
* set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
|
||||
|
@ -5498,6 +5559,23 @@ int exec_invoke(
|
|||
if (r < 0)
|
||||
return r;
|
||||
|
||||
/* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
|
||||
* ensures the root of the cgroup namespace is the top level service cgroup and not the
|
||||
* subcgroup. Don't do this for control processes that are spawned immediately into a
|
||||
* subcgroup, as those are already in the right place. */
|
||||
if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) && params->cgroup_path) {
|
||||
r = cg_subgroup_attach_pidref(
|
||||
context,
|
||||
cgroup_context,
|
||||
params,
|
||||
/* Adjust the prefix accordingly since we're in a cgroup namespace now. */
|
||||
/* prefix= */ NULL,
|
||||
/* pidref= */ NULL,
|
||||
exit_status);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
/* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
|
||||
* shall execute. */
|
||||
|
||||
|
|
|
@ -236,9 +236,15 @@ static bool needs_cgroup_namespace(ProtectControlGroups i) {
|
|||
return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT);
|
||||
}
|
||||
|
||||
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context) {
|
||||
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) {
|
||||
assert(context);
|
||||
|
||||
/* CGroup namespaces don't really make sense for control processes and can't really be supported as
|
||||
* (some) control processes need to be spawned directly into a subcgroup to avoid violating the
|
||||
* "no inner processes" rule of cgroupv2, so don't do any cgroup namespacing for control processes. */
|
||||
if (params && needs_cgroup_namespace(context->protect_control_groups) && FLAGS_SET(params->flags, EXEC_IS_CONTROL))
|
||||
return PROTECT_CONTROL_GROUPS_YES;
|
||||
|
||||
/* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually
|
||||
* use cgroup namespace, we ignore the setting and do not unshare the namespace.
|
||||
* ProtectControlGroups=private and strict get downgraded to no and yes respectively. This ensures
|
||||
|
@ -252,27 +258,31 @@ ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context)
|
|||
return context->protect_control_groups;
|
||||
}
|
||||
|
||||
bool exec_needs_cgroup_namespace(const ExecContext *context) {
|
||||
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
|
||||
assert(context);
|
||||
|
||||
return needs_cgroup_namespace(exec_get_protect_control_groups(context));
|
||||
return needs_cgroup_namespace(exec_get_protect_control_groups(context, params));
|
||||
}
|
||||
|
||||
bool exec_needs_cgroup_mount(const ExecContext *context) {
|
||||
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) {
|
||||
assert(context);
|
||||
|
||||
return exec_get_protect_control_groups(context) != PROTECT_CONTROL_GROUPS_NO;
|
||||
return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO;
|
||||
}
|
||||
|
||||
bool exec_is_cgroup_mount_read_only(const ExecContext *context) {
|
||||
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) {
|
||||
assert(context);
|
||||
|
||||
return IN_SET(exec_get_protect_control_groups(context), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
|
||||
return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
|
||||
}
|
||||
|
||||
bool exec_needs_pid_namespace(const ExecContext *context) {
|
||||
bool exec_needs_pid_namespace(const ExecContext *context, const ExecParameters *params) {
|
||||
assert(context);
|
||||
|
||||
/* PID namespaces don't really make sense for control processes so let's not use them for those. */
|
||||
if (params && FLAGS_SET(params->flags, EXEC_IS_CONTROL))
|
||||
return false;
|
||||
|
||||
return context->private_pids != PRIVATE_PIDS_NO && ns_type_supported(NAMESPACE_PID);
|
||||
}
|
||||
|
||||
|
@ -325,11 +335,11 @@ bool exec_needs_mount_namespace(
|
|||
context->protect_kernel_tunables ||
|
||||
context->protect_kernel_modules ||
|
||||
context->protect_kernel_logs ||
|
||||
exec_needs_cgroup_mount(context) ||
|
||||
exec_needs_cgroup_mount(context, params) ||
|
||||
context->protect_proc != PROTECT_PROC_DEFAULT ||
|
||||
context->proc_subset != PROC_SUBSET_ALL ||
|
||||
exec_needs_ipc_namespace(context) ||
|
||||
exec_needs_pid_namespace(context))
|
||||
exec_needs_pid_namespace(context, params))
|
||||
return true;
|
||||
|
||||
if (context->root_directory) {
|
||||
|
@ -399,20 +409,25 @@ bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType typ
|
|||
return true;
|
||||
}
|
||||
|
||||
static int exec_params_needs_control_subcgroup(const ExecParameters *params) {
|
||||
return FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) &&
|
||||
FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) &&
|
||||
FLAGS_SET(params->flags, EXEC_IS_CONTROL);
|
||||
}
|
||||
|
||||
int exec_params_get_cgroup_path(
|
||||
const ExecParameters *params,
|
||||
const CGroupContext *c,
|
||||
const char *prefix,
|
||||
char **ret) {
|
||||
|
||||
const char *subgroup = NULL;
|
||||
char *p;
|
||||
|
||||
assert(params);
|
||||
assert(c);
|
||||
assert(ret);
|
||||
|
||||
if (!params->cgroup_path)
|
||||
return -EINVAL;
|
||||
|
||||
/* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
|
||||
* subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
|
||||
* processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
|
||||
|
@ -430,9 +445,9 @@ int exec_params_get_cgroup_path(
|
|||
}
|
||||
|
||||
if (subgroup)
|
||||
p = path_join(params->cgroup_path, subgroup);
|
||||
p = path_join(prefix, subgroup);
|
||||
else
|
||||
p = strdup(params->cgroup_path);
|
||||
p = strdup(strempty(prefix));
|
||||
if (!p)
|
||||
return -ENOMEM;
|
||||
|
||||
|
@ -501,7 +516,7 @@ int exec_spawn(
|
|||
log_command_line(unit, "About to execute", command->path, command->argv);
|
||||
|
||||
if (params->cgroup_path) {
|
||||
r = exec_params_get_cgroup_path(params, cgroup_context, &subcgroup_path);
|
||||
r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup_path);
|
||||
if (r < 0)
|
||||
return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
|
||||
if (r > 0) {
|
||||
|
@ -514,6 +529,13 @@ int exec_spawn(
|
|||
}
|
||||
}
|
||||
|
||||
/* We cannot spawn the main service process into the subcgroup as it might need to unshare the cgroup
|
||||
* namespace first if one is configured to make sure the root of the cgroup namespace is the service
|
||||
* cgroup and not the subcgroup. However, when running control commands on a live service, the
|
||||
* commands have to be spawned inside a subcgroup, otherwise we violate the no inner processes rule
|
||||
* of cgroupv2. */
|
||||
const char *cgtarget = exec_params_needs_control_subcgroup(params) ? subcgroup_path : params->cgroup_path;
|
||||
|
||||
/* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the
|
||||
* child's memory.max, serialize all the state needed to start the unit, and pass it to the
|
||||
* systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec
|
||||
|
@ -576,24 +598,24 @@ int exec_spawn(
|
|||
"--log-level", max_log_levels,
|
||||
"--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
|
||||
environ,
|
||||
subcgroup_path,
|
||||
cgtarget,
|
||||
&pidref);
|
||||
|
||||
/* Drop the ambient set again, so no processes other than sd-executor spawned from the manager inherit it. */
|
||||
(void) capability_ambient_set_apply(0, /* also_inherit= */ false);
|
||||
|
||||
if (r == -EUCLEAN && subcgroup_path)
|
||||
if (r == -EUCLEAN && cgtarget)
|
||||
return log_unit_error_errno(unit, r,
|
||||
"Failed to spawn process into cgroup '%s', because the cgroup "
|
||||
"or one of its parents or siblings is in the threaded mode.",
|
||||
subcgroup_path);
|
||||
cgtarget);
|
||||
if (r < 0)
|
||||
return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
|
||||
/* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
|
||||
* executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
|
||||
* process will be killed too). */
|
||||
if (r == 0 && subcgroup_path)
|
||||
(void) cg_attach(subcgroup_path, pidref.pid);
|
||||
if (r == 0 && cgtarget)
|
||||
(void) cg_attach(cgtarget, pidref.pid);
|
||||
/* r > 0: Already in the right cgroup thanks to CLONE_INTO_CGROUP */
|
||||
|
||||
log_unit_debug(unit, "Forked %s as " PID_FMT " (%s CLONE_INTO_CGROUP)",
|
||||
|
|
|
@ -585,7 +585,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free);
|
|||
ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);
|
||||
void exec_runtime_clear(ExecRuntime *rt);
|
||||
|
||||
int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, char **ret);
|
||||
int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, const char *prefix, char **ret);
|
||||
void exec_params_shallow_clear(ExecParameters *p);
|
||||
void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix);
|
||||
void exec_params_deep_clear(ExecParameters *p);
|
||||
|
@ -629,12 +629,12 @@ ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_;
|
|||
bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime);
|
||||
bool exec_needs_network_namespace(const ExecContext *context);
|
||||
bool exec_needs_ipc_namespace(const ExecContext *context);
|
||||
bool exec_needs_pid_namespace(const ExecContext *context);
|
||||
bool exec_needs_pid_namespace(const ExecContext *context, const ExecParameters *params);
|
||||
|
||||
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context);
|
||||
bool exec_needs_cgroup_namespace(const ExecContext *context);
|
||||
bool exec_needs_cgroup_mount(const ExecContext *context);
|
||||
bool exec_is_cgroup_mount_read_only(const ExecContext *context);
|
||||
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
|
||||
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
|
||||
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params);
|
||||
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params);
|
||||
|
||||
const char* exec_get_private_notify_socket_path(const ExecContext *context, const ExecParameters *params, bool needs_sandboxing);
|
||||
|
||||
|
|
|
@ -711,7 +711,7 @@ static int service_verify(Service *s) {
|
|||
if (s->type == SERVICE_DBUS && !s->bus_name)
|
||||
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing.");
|
||||
|
||||
if (s->type == SERVICE_FORKING && exec_needs_pid_namespace(&s->exec_context))
|
||||
if (s->type == SERVICE_FORKING && exec_needs_pid_namespace(&s->exec_context, /* params= */ NULL))
|
||||
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service of Type=forking does not support PrivatePIDs=yes. Refusing.");
|
||||
|
||||
if (s->usb_function_descriptors && !s->usb_function_strings)
|
||||
|
|
|
@ -4181,7 +4181,7 @@ static int unit_verify_contexts(const Unit *u) {
|
|||
exec_needs_mount_namespace(ec, /* params = */ NULL, /* runtime = */ NULL))
|
||||
return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "WorkingDirectory= may not be below /proc/, /sys/ or /dev/ when using mount namespacing. Refusing.");
|
||||
|
||||
if (exec_needs_pid_namespace(ec) && !UNIT_VTABLE(u)->notify_pidref)
|
||||
if (exec_needs_pid_namespace(ec, /* params= */ NULL) && !UNIT_VTABLE(u)->notify_pidref)
|
||||
return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "PrivatePIDs= setting is only supported for service units. Refusing.");
|
||||
|
||||
const KillContext *kc = unit_get_kill_context(u);
|
||||
|
|
|
@ -104,4 +104,32 @@ testcase_basic_strict() {
|
|||
test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG"
|
||||
}
|
||||
|
||||
testcase_delegate_subgroup() {
    # With DelegateSubgroup=supervisor and a private cgroup namespace, the "supervisor" subgroup has to
    # show up directly below /sys/fs/cgroup, i.e. the service cgroup (not the subgroup) must be the
    # root of the cgroup namespace. The test fails if 'ls' cannot find that directory.
    local -a run_args=(
        -p ProtectControlGroupsEx=private
        -p PrivateMounts=yes
        -p Delegate=yes
        -p DelegateSubgroup=supervisor
        --wait
        --pipe
    )

    systemd-run "${run_args[@]}" ls /sys/fs/cgroup/supervisor
}
|
||||
|
||||
testcase_delegate_subgroup_control() {
    # The ExecStartPre= control process prints its own cgroup. It must see the full host path ending in
    # ".control", which shows that control processes are not cgroup-namespaced and are still placed in
    # the .control cgroup.
    local -a run_args=(
        -p ProtectControlGroupsEx=private
        -p PrivateMounts=yes
        -p Delegate=yes
        -p DelegateSubgroup=supervisor
        -p ExecStartPre="cat /proc/self/cgroup"
        --unit delegate-subgroup-control
        --wait
        --pipe
    )

    assert_eq "$(systemd-run "${run_args[@]}" true)" "0::/system.slice/delegate-subgroup-control.service/.control"
}
|
||||
|
||||
run_testcases
|
||||
|
|
Loading…
Reference in New Issue