Compare commits
3 Commits
20f9009249
...
1b678d1cbd
Author | SHA1 | Date |
---|---|---|
![]() |
1b678d1cbd | |
![]() |
8f92f05192 | |
![]() |
abc6842c2a |
3
TODO
3
TODO
|
@ -128,6 +128,9 @@ Deprecations and removals:
|
||||||
|
|
||||||
Features:
|
Features:
|
||||||
|
|
||||||
|
* pid1: Maybe we should run control processes in the same pidns/cgroupns as the
|
||||||
|
main pid if it's still alive?
|
||||||
|
|
||||||
* loginctl: show argv[] of "leader" process in tabular list-sessions output
|
* loginctl: show argv[] of "leader" process in tabular list-sessions output
|
||||||
|
|
||||||
* loginctl: show "service identifier" in tabular list-sessions output, to make
|
* loginctl: show "service identifier" in tabular list-sessions output, to make
|
||||||
|
|
|
@ -1188,7 +1188,8 @@ static int setup_pam(
|
||||||
gid_t gid,
|
gid_t gid,
|
||||||
char ***env, /* updated on success */
|
char ***env, /* updated on success */
|
||||||
const int fds[], size_t n_fds,
|
const int fds[], size_t n_fds,
|
||||||
int exec_fd) {
|
int exec_fd,
|
||||||
|
PidRef *ret_pidref) {
|
||||||
|
|
||||||
#if HAVE_PAM
|
#if HAVE_PAM
|
||||||
AskPasswordConvData conv_data = {
|
AskPasswordConvData conv_data = {
|
||||||
|
@ -1209,6 +1210,7 @@ static int setup_pam(
|
||||||
int pam_code = PAM_SUCCESS, r;
|
int pam_code = PAM_SUCCESS, r;
|
||||||
bool close_session = false;
|
bool close_session = false;
|
||||||
pid_t parent_pid;
|
pid_t parent_pid;
|
||||||
|
PidRef child_pidref;
|
||||||
int flags = 0;
|
int flags = 0;
|
||||||
|
|
||||||
assert(context);
|
assert(context);
|
||||||
|
@ -1287,7 +1289,7 @@ static int setup_pam(
|
||||||
|
|
||||||
parent_pid = getpid_cached();
|
parent_pid = getpid_cached();
|
||||||
|
|
||||||
r = safe_fork("(sd-pam)", 0, NULL);
|
r = pidref_safe_fork("(sd-pam)", 0, &child_pidref);
|
||||||
if (r < 0)
|
if (r < 0)
|
||||||
goto fail;
|
goto fail;
|
||||||
if (r == 0) {
|
if (r == 0) {
|
||||||
|
@ -1373,6 +1375,9 @@ static int setup_pam(
|
||||||
if (!barrier_place_and_sync(&barrier))
|
if (!barrier_place_and_sync(&barrier))
|
||||||
log_error("PAM initialization failed");
|
log_error("PAM initialization failed");
|
||||||
|
|
||||||
|
if (ret_pidref)
|
||||||
|
*ret_pidref = TAKE_PIDREF(child_pidref);
|
||||||
|
|
||||||
return strv_free_and_replace(*env, e);
|
return strv_free_and_replace(*env, e);
|
||||||
|
|
||||||
fail:
|
fail:
|
||||||
|
@ -3456,7 +3461,7 @@ static int apply_mount_namespace(
|
||||||
|
|
||||||
/* We need to make the pressure path writable even if /sys/fs/cgroup is made read-only, as the
|
/* We need to make the pressure path writable even if /sys/fs/cgroup is made read-only, as the
|
||||||
* service will need to write to it in order to start the notifications. */
|
* service will need to write to it in order to start the notifications. */
|
||||||
if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
|
if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
|
||||||
read_write_paths_cleanup = strv_copy(context->read_write_paths);
|
read_write_paths_cleanup = strv_copy(context->read_write_paths);
|
||||||
if (!read_write_paths_cleanup)
|
if (!read_write_paths_cleanup)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
@ -3601,7 +3606,7 @@ static int apply_mount_namespace(
|
||||||
* sandbox inside the mount namespace. */
|
* sandbox inside the mount namespace. */
|
||||||
.ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
|
.ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
|
||||||
|
|
||||||
.protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
|
.protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO,
|
||||||
.protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
|
.protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
|
||||||
.protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
|
.protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
|
||||||
.protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
|
.protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
|
||||||
|
@ -3609,7 +3614,7 @@ static int apply_mount_namespace(
|
||||||
.private_dev = needs_sandboxing && context->private_devices,
|
.private_dev = needs_sandboxing && context->private_devices,
|
||||||
.private_network = needs_sandboxing && exec_needs_network_namespace(context),
|
.private_network = needs_sandboxing && exec_needs_network_namespace(context),
|
||||||
.private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
|
.private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
|
||||||
.private_pids = needs_sandboxing && exec_needs_pid_namespace(context) ? context->private_pids : PRIVATE_PIDS_NO,
|
.private_pids = needs_sandboxing && exec_needs_pid_namespace(context, params) ? context->private_pids : PRIVATE_PIDS_NO,
|
||||||
.private_tmp = needs_sandboxing ? context->private_tmp : PRIVATE_TMP_NO,
|
.private_tmp = needs_sandboxing ? context->private_tmp : PRIVATE_TMP_NO,
|
||||||
|
|
||||||
.mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
|
.mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
|
||||||
|
@ -4220,7 +4225,7 @@ static void log_command_line(
|
||||||
LOG_EXEC_INVOCATION_ID(params));
|
LOG_EXEC_INVOCATION_ID(params));
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool exec_context_needs_cap_sys_admin(const ExecContext *context) {
|
static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
|
||||||
assert(context);
|
assert(context);
|
||||||
|
|
||||||
return context->private_users != PRIVATE_USERS_NO ||
|
return context->private_users != PRIVATE_USERS_NO ||
|
||||||
|
@ -4239,11 +4244,11 @@ static bool exec_context_needs_cap_sys_admin(const ExecContext *context) {
|
||||||
!strv_isempty(context->extension_directories) ||
|
!strv_isempty(context->extension_directories) ||
|
||||||
context->protect_system != PROTECT_SYSTEM_NO ||
|
context->protect_system != PROTECT_SYSTEM_NO ||
|
||||||
context->protect_home != PROTECT_HOME_NO ||
|
context->protect_home != PROTECT_HOME_NO ||
|
||||||
exec_needs_pid_namespace(context) ||
|
exec_needs_pid_namespace(context, params) ||
|
||||||
context->protect_kernel_tunables ||
|
context->protect_kernel_tunables ||
|
||||||
context->protect_kernel_modules ||
|
context->protect_kernel_modules ||
|
||||||
context->protect_kernel_logs ||
|
context->protect_kernel_logs ||
|
||||||
exec_needs_cgroup_mount(context) ||
|
exec_needs_cgroup_mount(context, params) ||
|
||||||
context->protect_clock ||
|
context->protect_clock ||
|
||||||
context->protect_hostname != PROTECT_HOSTNAME_NO ||
|
context->protect_hostname != PROTECT_HOSTNAME_NO ||
|
||||||
!strv_isempty(context->read_write_paths) ||
|
!strv_isempty(context->read_write_paths) ||
|
||||||
|
@ -4284,7 +4289,7 @@ static bool exec_namespace_is_delegated(
|
||||||
/* If we need unprivileged private users, we've already unshared a user namespace by the time we call
|
/* If we need unprivileged private users, we've already unshared a user namespace by the time we call
|
||||||
* setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
|
* setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
|
||||||
* unsharing in the first call to setup_delegated_namespaces() by returning false here. */
|
* unsharing in the first call to setup_delegated_namespaces() by returning false here. */
|
||||||
if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context))
|
if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
|
if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
|
||||||
|
@ -4379,7 +4384,7 @@ static int setup_delegated_namespaces(
|
||||||
log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
|
log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (needs_sandboxing && exec_needs_cgroup_namespace(context) &&
|
if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) &&
|
||||||
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
|
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
|
||||||
if (unshare(CLONE_NEWCGROUP) < 0) {
|
if (unshare(CLONE_NEWCGROUP) < 0) {
|
||||||
*reterr_exit_status = EXIT_NAMESPACE;
|
*reterr_exit_status = EXIT_NAMESPACE;
|
||||||
|
@ -4391,7 +4396,7 @@ static int setup_delegated_namespaces(
|
||||||
|
|
||||||
/* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
|
/* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
|
||||||
* Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
|
* Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
|
||||||
if (needs_sandboxing && exec_needs_pid_namespace(context) &&
|
if (needs_sandboxing && exec_needs_pid_namespace(context, params) &&
|
||||||
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) {
|
exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) {
|
||||||
if (params->pidref_transport_fd < 0) {
|
if (params->pidref_transport_fd < 0) {
|
||||||
*reterr_exit_status = EXIT_NAMESPACE;
|
*reterr_exit_status = EXIT_NAMESPACE;
|
||||||
|
@ -4641,6 +4646,49 @@ static void prepare_terminal(
|
||||||
(void) osc_context_open_service(p->unit_id, p->invocation_id, /* ret_seq= */ NULL);
|
(void) osc_context_open_service(p->unit_id, p->invocation_id, /* ret_seq= */ NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int cg_subgroup_attach_pidref(
|
||||||
|
const ExecContext *context,
|
||||||
|
const CGroupContext *cgroup_context,
|
||||||
|
const ExecParameters *params,
|
||||||
|
const char *prefix,
|
||||||
|
const PidRef *pidref,
|
||||||
|
int *reterr_exit_status) {
|
||||||
|
|
||||||
|
_cleanup_free_ char *subgroup = NULL;
|
||||||
|
int r;
|
||||||
|
|
||||||
|
assert(context);
|
||||||
|
assert(cgroup_context);
|
||||||
|
assert(params);
|
||||||
|
assert(reterr_exit_status);
|
||||||
|
|
||||||
|
r = exec_params_get_cgroup_path(params, cgroup_context, prefix, &subgroup);
|
||||||
|
if (r < 0) {
|
||||||
|
*reterr_exit_status = EXIT_CGROUP;
|
||||||
|
return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
|
||||||
|
}
|
||||||
|
if (r == 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
r = cg_attach(subgroup, pidref ? pidref->pid : 0);
|
||||||
|
if (r == -EUCLEAN) {
|
||||||
|
*reterr_exit_status = EXIT_CGROUP;
|
||||||
|
return log_exec_error_errno(context, params, r,
|
||||||
|
"Failed to attach process " PID_FMT " to cgroup '%s', "
|
||||||
|
"because the cgroup or one of its parents or "
|
||||||
|
"siblings is in the threaded mode.",
|
||||||
|
pidref ? pidref->pid : getpid_cached(), subgroup);
|
||||||
|
}
|
||||||
|
if (r < 0) {
|
||||||
|
*reterr_exit_status = EXIT_CGROUP;
|
||||||
|
return log_exec_error_errno(context, params, r,
|
||||||
|
"Failed to attach process " PID_FMT " to cgroup %s: %m",
|
||||||
|
pidref ? pidref->pid : getpid_cached(), subgroup);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
int exec_invoke(
|
int exec_invoke(
|
||||||
const ExecCommand *command,
|
const ExecCommand *command,
|
||||||
const ExecContext *context,
|
const ExecContext *context,
|
||||||
|
@ -4956,28 +5004,37 @@ int exec_invoke(
|
||||||
if (socket_fd >= 0)
|
if (socket_fd >= 0)
|
||||||
(void) fd_nonblock(socket_fd, false);
|
(void) fd_nonblock(socket_fd, false);
|
||||||
|
|
||||||
|
/* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
|
||||||
|
* from it. */
|
||||||
|
needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
|
||||||
|
|
||||||
/* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
|
/* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
|
||||||
* Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
|
* Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
|
||||||
if (params->cgroup_path) {
|
if (params->cgroup_path) {
|
||||||
_cleanup_free_ char *p = NULL;
|
_cleanup_free_ char *p = NULL;
|
||||||
|
|
||||||
r = exec_params_get_cgroup_path(params, cgroup_context, &p);
|
r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
|
||||||
if (r < 0) {
|
if (r < 0) {
|
||||||
*exit_status = EXIT_CGROUP;
|
*exit_status = EXIT_CGROUP;
|
||||||
return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
|
return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
|
||||||
}
|
}
|
||||||
|
|
||||||
r = cg_attach(p, 0);
|
/* We cannot spawn the main service process into the subcgroup as it needs to unshare the
|
||||||
|
* cgroup namespace first if one is configured to make sure the root of the cgroup namespace
|
||||||
|
* is the service cgroup and not the subcgroup. */
|
||||||
|
const char *cgtarget = needs_sandboxing && exec_needs_cgroup_namespace(context, params) ? params->cgroup_path : p;
|
||||||
|
|
||||||
|
r = cg_attach(cgtarget, 0);
|
||||||
if (r == -EUCLEAN) {
|
if (r == -EUCLEAN) {
|
||||||
*exit_status = EXIT_CGROUP;
|
*exit_status = EXIT_CGROUP;
|
||||||
return log_exec_error_errno(context, params, r,
|
return log_exec_error_errno(context, params, r,
|
||||||
"Failed to attach process to cgroup '%s', "
|
"Failed to attach process to cgroup '%s', "
|
||||||
"because the cgroup or one of its parents or "
|
"because the cgroup or one of its parents or "
|
||||||
"siblings is in the threaded mode.", p);
|
"siblings is in the threaded mode.", cgtarget);
|
||||||
}
|
}
|
||||||
if (r < 0) {
|
if (r < 0) {
|
||||||
*exit_status = EXIT_CGROUP;
|
*exit_status = EXIT_CGROUP;
|
||||||
return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
|
return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", cgtarget);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5177,10 +5234,6 @@ int exec_invoke(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
|
|
||||||
* from it. */
|
|
||||||
needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
|
|
||||||
|
|
||||||
if (params->cgroup_path) {
|
if (params->cgroup_path) {
|
||||||
/* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
|
/* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
|
||||||
* this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
|
* this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
|
||||||
|
@ -5196,7 +5249,7 @@ int exec_invoke(
|
||||||
return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
|
return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
|
||||||
}
|
}
|
||||||
|
|
||||||
r = exec_params_get_cgroup_path(params, cgroup_context, &p);
|
r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
|
||||||
if (r < 0) {
|
if (r < 0) {
|
||||||
*exit_status = EXIT_CGROUP;
|
*exit_status = EXIT_CGROUP;
|
||||||
return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
|
return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
|
||||||
|
@ -5228,7 +5281,7 @@ int exec_invoke(
|
||||||
* to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
|
* to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
|
||||||
* pressure path environment variable or read-write mount to the unit. This is why we check if
|
* pressure path environment variable or read-write mount to the unit. This is why we check if
|
||||||
* memory_pressure_path != NULL in the conditional below. */
|
* memory_pressure_path != NULL in the conditional below. */
|
||||||
if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) {
|
if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
|
||||||
memory_pressure_path = mfree(memory_pressure_path);
|
memory_pressure_path = mfree(memory_pressure_path);
|
||||||
r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
|
r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
|
||||||
if (r < 0) {
|
if (r < 0) {
|
||||||
|
@ -5371,12 +5424,20 @@ int exec_invoke(
|
||||||
* wins here. (See above.) */
|
* wins here. (See above.) */
|
||||||
|
|
||||||
/* All fds passed in the fds array will be closed in the pam child process. */
|
/* All fds passed in the fds array will be closed in the pam child process. */
|
||||||
r = setup_pam(context, params, username, uid, gid, &accum_env, params->fds, n_fds, params->exec_fd);
|
PidRef pam_pidref;
|
||||||
|
r = setup_pam(context, params, username, uid, gid, &accum_env, params->fds, n_fds, params->exec_fd, &pam_pidref);
|
||||||
if (r < 0) {
|
if (r < 0) {
|
||||||
*exit_status = EXIT_PAM;
|
*exit_status = EXIT_PAM;
|
||||||
return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
|
return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (params->cgroup_path) {
|
||||||
|
/* Move PAM into subgroup immediately if one is configured. */
|
||||||
|
r = cg_subgroup_attach_pidref(context, cgroup_context, params, params->cgroup_path, &pam_pidref, exit_status);
|
||||||
|
if (r < 0)
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
/* PAM modules might have set some ambient caps. Query them here and merge them into
|
/* PAM modules might have set some ambient caps. Query them here and merge them into
|
||||||
* the caps we want to set in the end, so that we don't end up unsetting them. */
|
* the caps we want to set in the end, so that we don't end up unsetting them. */
|
||||||
uint64_t ambient_after_pam;
|
uint64_t ambient_after_pam;
|
||||||
|
@ -5395,7 +5456,7 @@ int exec_invoke(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context)) {
|
if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) {
|
||||||
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
|
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
|
||||||
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
|
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
|
||||||
* set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
|
* set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
|
||||||
|
@ -5498,6 +5559,23 @@ int exec_invoke(
|
||||||
if (r < 0)
|
if (r < 0)
|
||||||
return r;
|
return r;
|
||||||
|
|
||||||
|
/* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
|
||||||
|
* ensures the root of the cgroup namespace is the top level service cgroup and not the
|
||||||
|
* subcgroup. Don't do this for control processes that are spawned immediately into a
|
||||||
|
* subcgroup, as those are already in the right place. */
|
||||||
|
if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) && params->cgroup_path) {
|
||||||
|
r = cg_subgroup_attach_pidref(
|
||||||
|
context,
|
||||||
|
cgroup_context,
|
||||||
|
params,
|
||||||
|
/* Adjust the prefix accordingly since we're in a cgroup namespace now. */
|
||||||
|
/* prefix= */ NULL,
|
||||||
|
/* pidref= */ NULL,
|
||||||
|
exit_status);
|
||||||
|
if (r < 0)
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
/* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
|
/* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
|
||||||
* shall execute. */
|
* shall execute. */
|
||||||
|
|
||||||
|
|
|
@ -236,9 +236,15 @@ static bool needs_cgroup_namespace(ProtectControlGroups i) {
|
||||||
return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT);
|
return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT);
|
||||||
}
|
}
|
||||||
|
|
||||||
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context) {
|
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) {
|
||||||
assert(context);
|
assert(context);
|
||||||
|
|
||||||
|
/* CGroup namespaces don't really make sense for control processes and can't really be supported as
|
||||||
|
* (some) control processes need to be spawned directly into a subcgroup to avoid violating the
|
||||||
|
* "no inner processes" rule of cgroupv2, so don't do any cgroup namespacing for control processes. */
|
||||||
|
if (params && needs_cgroup_namespace(context->protect_control_groups) && FLAGS_SET(params->flags, EXEC_IS_CONTROL))
|
||||||
|
return PROTECT_CONTROL_GROUPS_YES;
|
||||||
|
|
||||||
/* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually
|
/* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually
|
||||||
* use cgroup namespace, we ignore the setting and do not unshare the namespace.
|
* use cgroup namespace, we ignore the setting and do not unshare the namespace.
|
||||||
* ProtectControlGroups=private and strict get downgraded to no and yes respectively. This ensures
|
* ProtectControlGroups=private and strict get downgraded to no and yes respectively. This ensures
|
||||||
|
@ -252,27 +258,31 @@ ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context)
|
||||||
return context->protect_control_groups;
|
return context->protect_control_groups;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool exec_needs_cgroup_namespace(const ExecContext *context) {
|
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
|
||||||
assert(context);
|
assert(context);
|
||||||
|
|
||||||
return needs_cgroup_namespace(exec_get_protect_control_groups(context));
|
return needs_cgroup_namespace(exec_get_protect_control_groups(context, params));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool exec_needs_cgroup_mount(const ExecContext *context) {
|
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) {
|
||||||
assert(context);
|
assert(context);
|
||||||
|
|
||||||
return exec_get_protect_control_groups(context) != PROTECT_CONTROL_GROUPS_NO;
|
return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool exec_is_cgroup_mount_read_only(const ExecContext *context) {
|
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) {
|
||||||
assert(context);
|
assert(context);
|
||||||
|
|
||||||
return IN_SET(exec_get_protect_control_groups(context), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
|
return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool exec_needs_pid_namespace(const ExecContext *context) {
|
bool exec_needs_pid_namespace(const ExecContext *context, const ExecParameters *params) {
|
||||||
assert(context);
|
assert(context);
|
||||||
|
|
||||||
|
/* PID namespaces don't really make sense for control processes so let's not use them for those. */
|
||||||
|
if (params && FLAGS_SET(params->flags, EXEC_IS_CONTROL))
|
||||||
|
return false;
|
||||||
|
|
||||||
return context->private_pids != PRIVATE_PIDS_NO && ns_type_supported(NAMESPACE_PID);
|
return context->private_pids != PRIVATE_PIDS_NO && ns_type_supported(NAMESPACE_PID);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -325,11 +335,11 @@ bool exec_needs_mount_namespace(
|
||||||
context->protect_kernel_tunables ||
|
context->protect_kernel_tunables ||
|
||||||
context->protect_kernel_modules ||
|
context->protect_kernel_modules ||
|
||||||
context->protect_kernel_logs ||
|
context->protect_kernel_logs ||
|
||||||
exec_needs_cgroup_mount(context) ||
|
exec_needs_cgroup_mount(context, params) ||
|
||||||
context->protect_proc != PROTECT_PROC_DEFAULT ||
|
context->protect_proc != PROTECT_PROC_DEFAULT ||
|
||||||
context->proc_subset != PROC_SUBSET_ALL ||
|
context->proc_subset != PROC_SUBSET_ALL ||
|
||||||
exec_needs_ipc_namespace(context) ||
|
exec_needs_ipc_namespace(context) ||
|
||||||
exec_needs_pid_namespace(context))
|
exec_needs_pid_namespace(context, params))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
if (context->root_directory) {
|
if (context->root_directory) {
|
||||||
|
@ -399,20 +409,25 @@ bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType typ
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int exec_params_needs_control_subcgroup(const ExecParameters *params) {
|
||||||
|
return FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) &&
|
||||||
|
FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) &&
|
||||||
|
FLAGS_SET(params->flags, EXEC_IS_CONTROL);
|
||||||
|
}
|
||||||
|
|
||||||
int exec_params_get_cgroup_path(
|
int exec_params_get_cgroup_path(
|
||||||
const ExecParameters *params,
|
const ExecParameters *params,
|
||||||
const CGroupContext *c,
|
const CGroupContext *c,
|
||||||
|
const char *prefix,
|
||||||
char **ret) {
|
char **ret) {
|
||||||
|
|
||||||
const char *subgroup = NULL;
|
const char *subgroup = NULL;
|
||||||
char *p;
|
char *p;
|
||||||
|
|
||||||
assert(params);
|
assert(params);
|
||||||
|
assert(c);
|
||||||
assert(ret);
|
assert(ret);
|
||||||
|
|
||||||
if (!params->cgroup_path)
|
|
||||||
return -EINVAL;
|
|
||||||
|
|
||||||
/* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
|
/* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
|
||||||
* subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
|
* subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
|
||||||
* processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
|
* processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
|
||||||
|
@ -430,9 +445,9 @@ int exec_params_get_cgroup_path(
|
||||||
}
|
}
|
||||||
|
|
||||||
if (subgroup)
|
if (subgroup)
|
||||||
p = path_join(params->cgroup_path, subgroup);
|
p = path_join(prefix, subgroup);
|
||||||
else
|
else
|
||||||
p = strdup(params->cgroup_path);
|
p = strdup(strempty(prefix));
|
||||||
if (!p)
|
if (!p)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
|
||||||
|
@ -501,7 +516,7 @@ int exec_spawn(
|
||||||
log_command_line(unit, "About to execute", command->path, command->argv);
|
log_command_line(unit, "About to execute", command->path, command->argv);
|
||||||
|
|
||||||
if (params->cgroup_path) {
|
if (params->cgroup_path) {
|
||||||
r = exec_params_get_cgroup_path(params, cgroup_context, &subcgroup_path);
|
r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup_path);
|
||||||
if (r < 0)
|
if (r < 0)
|
||||||
return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
|
return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
|
||||||
if (r > 0) {
|
if (r > 0) {
|
||||||
|
@ -514,6 +529,13 @@ int exec_spawn(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* We cannot spawn the main service process into the subcgroup as it might need to unshare the cgroup
|
||||||
|
* namespace first if one is configured to make sure the root of the cgroup namespace is the service
|
||||||
|
* cgroup and not the subcgroup. However, when running control commands on a live service, the
|
||||||
|
* commands have to be spawned inside a subcgroup, otherwise we violate the no inner processes rule
|
||||||
|
* of cgroupv2. */
|
||||||
|
const char *cgtarget = exec_params_needs_control_subcgroup(params) ? subcgroup_path : params->cgroup_path;
|
||||||
|
|
||||||
/* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the
|
/* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the
|
||||||
* child's memory.max, serialize all the state needed to start the unit, and pass it to the
|
* child's memory.max, serialize all the state needed to start the unit, and pass it to the
|
||||||
* systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec
|
* systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec
|
||||||
|
@ -576,24 +598,24 @@ int exec_spawn(
|
||||||
"--log-level", max_log_levels,
|
"--log-level", max_log_levels,
|
||||||
"--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
|
"--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
|
||||||
environ,
|
environ,
|
||||||
subcgroup_path,
|
cgtarget,
|
||||||
&pidref);
|
&pidref);
|
||||||
|
|
||||||
/* Drop the ambient set again, so no processes other than sd-executor spawned from the manager inherit it. */
|
/* Drop the ambient set again, so no processes other than sd-executor spawned from the manager inherit it. */
|
||||||
(void) capability_ambient_set_apply(0, /* also_inherit= */ false);
|
(void) capability_ambient_set_apply(0, /* also_inherit= */ false);
|
||||||
|
|
||||||
if (r == -EUCLEAN && subcgroup_path)
|
if (r == -EUCLEAN && cgtarget)
|
||||||
return log_unit_error_errno(unit, r,
|
return log_unit_error_errno(unit, r,
|
||||||
"Failed to spawn process into cgroup '%s', because the cgroup "
|
"Failed to spawn process into cgroup '%s', because the cgroup "
|
||||||
"or one of its parents or siblings is in the threaded mode.",
|
"or one of its parents or siblings is in the threaded mode.",
|
||||||
subcgroup_path);
|
cgtarget);
|
||||||
if (r < 0)
|
if (r < 0)
|
||||||
return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
|
return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
|
||||||
/* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
|
/* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
|
||||||
* executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
|
* executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
|
||||||
* process will be killed too). */
|
* process will be killed too). */
|
||||||
if (r == 0 && subcgroup_path)
|
if (r == 0 && cgtarget)
|
||||||
(void) cg_attach(subcgroup_path, pidref.pid);
|
(void) cg_attach(cgtarget, pidref.pid);
|
||||||
/* r > 0: Already in the right cgroup thanks to CLONE_INTO_CGROUP */
|
/* r > 0: Already in the right cgroup thanks to CLONE_INTO_CGROUP */
|
||||||
|
|
||||||
log_unit_debug(unit, "Forked %s as " PID_FMT " (%s CLONE_INTO_CGROUP)",
|
log_unit_debug(unit, "Forked %s as " PID_FMT " (%s CLONE_INTO_CGROUP)",
|
||||||
|
|
|
@ -585,7 +585,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free);
|
||||||
ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);
|
ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);
|
||||||
void exec_runtime_clear(ExecRuntime *rt);
|
void exec_runtime_clear(ExecRuntime *rt);
|
||||||
|
|
||||||
int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, char **ret);
|
int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, const char *prefix, char **ret);
|
||||||
void exec_params_shallow_clear(ExecParameters *p);
|
void exec_params_shallow_clear(ExecParameters *p);
|
||||||
void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix);
|
void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix);
|
||||||
void exec_params_deep_clear(ExecParameters *p);
|
void exec_params_deep_clear(ExecParameters *p);
|
||||||
|
@ -629,12 +629,12 @@ ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_;
|
||||||
bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime);
|
bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime);
|
||||||
bool exec_needs_network_namespace(const ExecContext *context);
|
bool exec_needs_network_namespace(const ExecContext *context);
|
||||||
bool exec_needs_ipc_namespace(const ExecContext *context);
|
bool exec_needs_ipc_namespace(const ExecContext *context);
|
||||||
bool exec_needs_pid_namespace(const ExecContext *context);
|
bool exec_needs_pid_namespace(const ExecContext *context, const ExecParameters *params);
|
||||||
|
|
||||||
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context);
|
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
|
||||||
bool exec_needs_cgroup_namespace(const ExecContext *context);
|
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
|
||||||
bool exec_needs_cgroup_mount(const ExecContext *context);
|
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params);
|
||||||
bool exec_is_cgroup_mount_read_only(const ExecContext *context);
|
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params);
|
||||||
|
|
||||||
const char* exec_get_private_notify_socket_path(const ExecContext *context, const ExecParameters *params, bool needs_sandboxing);
|
const char* exec_get_private_notify_socket_path(const ExecContext *context, const ExecParameters *params, bool needs_sandboxing);
|
||||||
|
|
||||||
|
|
|
@ -711,7 +711,7 @@ static int service_verify(Service *s) {
|
||||||
if (s->type == SERVICE_DBUS && !s->bus_name)
|
if (s->type == SERVICE_DBUS && !s->bus_name)
|
||||||
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing.");
|
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing.");
|
||||||
|
|
||||||
if (s->type == SERVICE_FORKING && exec_needs_pid_namespace(&s->exec_context))
|
if (s->type == SERVICE_FORKING && exec_needs_pid_namespace(&s->exec_context, /* params= */ NULL))
|
||||||
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service of Type=forking does not support PrivatePIDs=yes. Refusing.");
|
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service of Type=forking does not support PrivatePIDs=yes. Refusing.");
|
||||||
|
|
||||||
if (s->usb_function_descriptors && !s->usb_function_strings)
|
if (s->usb_function_descriptors && !s->usb_function_strings)
|
||||||
|
|
|
@ -4181,7 +4181,7 @@ static int unit_verify_contexts(const Unit *u) {
|
||||||
exec_needs_mount_namespace(ec, /* params = */ NULL, /* runtime = */ NULL))
|
exec_needs_mount_namespace(ec, /* params = */ NULL, /* runtime = */ NULL))
|
||||||
return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "WorkingDirectory= may not be below /proc/, /sys/ or /dev/ when using mount namespacing. Refusing.");
|
return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "WorkingDirectory= may not be below /proc/, /sys/ or /dev/ when using mount namespacing. Refusing.");
|
||||||
|
|
||||||
if (exec_needs_pid_namespace(ec) && !UNIT_VTABLE(u)->notify_pidref)
|
if (exec_needs_pid_namespace(ec, /* params= */ NULL) && !UNIT_VTABLE(u)->notify_pidref)
|
||||||
return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "PrivatePIDs= setting is only supported for service units. Refusing.");
|
return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "PrivatePIDs= setting is only supported for service units. Refusing.");
|
||||||
|
|
||||||
const KillContext *kc = unit_get_kill_context(u);
|
const KillContext *kc = unit_get_kill_context(u);
|
||||||
|
|
|
@ -104,4 +104,32 @@ testcase_basic_strict() {
|
||||||
test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG"
|
test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
testcase_delegate_subgroup() {
|
||||||
|
# Make sure the service cgroup is the root of the cgroup namespace when we use DelegateSubgroup.
|
||||||
|
systemd-run \
|
||||||
|
-p ProtectControlGroupsEx=private \
|
||||||
|
-p PrivateMounts=yes \
|
||||||
|
-p Delegate=yes \
|
||||||
|
-p DelegateSubgroup=supervisor \
|
||||||
|
--wait \
|
||||||
|
--pipe \
|
||||||
|
ls /sys/fs/cgroup/supervisor
|
||||||
|
}
|
||||||
|
|
||||||
|
testcase_delegate_subgroup_control() {
|
||||||
|
# Make sure control processes are not namespaced and are still put in the .control cgroup.
|
||||||
|
assert_eq "$(
|
||||||
|
systemd-run \
|
||||||
|
-p ProtectControlGroupsEx=private \
|
||||||
|
-p PrivateMounts=yes \
|
||||||
|
-p Delegate=yes \
|
||||||
|
-p DelegateSubgroup=supervisor \
|
||||||
|
-p ExecStartPre="cat /proc/self/cgroup" \
|
||||||
|
--unit delegate-subgroup-control \
|
||||||
|
--wait \
|
||||||
|
--pipe \
|
||||||
|
true
|
||||||
|
)" "0::/system.slice/delegate-subgroup-control.service/.control"
|
||||||
|
}
|
||||||
|
|
||||||
run_testcases
|
run_testcases
|
||||||
|
|
Loading…
Reference in New Issue