Merge 8f92f05192 into 8e7ef6abb8

core: Disable pid namespacing for control processes
PID namespaces frankly don't make any sense for control processes, so let's gracefully degrade to not using a PID namespace for them.
2025-04-18 03:46:25 +01:00 · 2025-04-17 21:11:06 +02:00 · 2025-04-17 21:11:04 +02:00
7 changed files with 183 additions and 52 deletions
--- a/3
+++ b/3
@ -128,6 +128,9 @@ Deprecations and removals:

 Features:

+* pid1: Maybe we should run control processes in the same pidns/cgroupns as the
+  main pid if it's still alive?
+
 * loginctl: show argv[] of "leader" process in tabular list-sessions output

 * loginctl: show "service identifier" in tabular list-sessions output, to make
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@ -1188,7 +1188,8 @@ static int setup_pam(
                gid_t gid,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds,
-                int exec_fd) {
+                int exec_fd,
+                PidRef *ret_pidref) {

 #if HAVE_PAM
        AskPasswordConvData conv_data = {
@ -1209,6 +1210,7 @@ static int setup_pam(
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t parent_pid;
+        PidRef child_pidref;
        int flags = 0;

        assert(context);
@ -1287,7 +1289,7 @@ static int setup_pam(

        parent_pid = getpid_cached();

-        r = safe_fork("(sd-pam)", 0, NULL);
+        r = pidref_safe_fork("(sd-pam)", 0, &child_pidref);
        if (r < 0)
                goto fail;
        if (r == 0) {
@ -1373,6 +1375,9 @@ static int setup_pam(
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

+        if (ret_pidref)
+                *ret_pidref = TAKE_PIDREF(child_pidref);
+
        return strv_free_and_replace(*env, e);

 fail:
@ -3456,7 +3461,7 @@ static int apply_mount_namespace(

        /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
         * service will need to write to it in order to start the notifications. */
-        if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
+        if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
                read_write_paths_cleanup = strv_copy(context->read_write_paths);
                if (!read_write_paths_cleanup)
                        return -ENOMEM;
@ -3601,7 +3606,7 @@ static int apply_mount_namespace(
                 * sandbox inside the mount namespace. */
                .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,

-                .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
+                .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO,
                .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
                .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
                .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
@ -3609,7 +3614,7 @@ static int apply_mount_namespace(
                .private_dev = needs_sandboxing && context->private_devices,
                .private_network = needs_sandboxing && exec_needs_network_namespace(context),
                .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
-                .private_pids = needs_sandboxing && exec_needs_pid_namespace(context) ? context->private_pids : PRIVATE_PIDS_NO,
+                .private_pids = needs_sandboxing && exec_needs_pid_namespace(context, params) ? context->private_pids : PRIVATE_PIDS_NO,
                .private_tmp = needs_sandboxing ? context->private_tmp : PRIVATE_TMP_NO,

                .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
@ -4220,7 +4225,7 @@ static void log_command_line(
                        LOG_EXEC_INVOCATION_ID(params));
 }

-static bool exec_context_needs_cap_sys_admin(const ExecContext *context) {
+static bool exec_context_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
        assert(context);

        return context->private_users != PRIVATE_USERS_NO ||
@ -4239,11 +4244,11 @@ static bool exec_context_needs_cap_sys_admin(const ExecContext *context) {
               !strv_isempty(context->extension_directories) ||
               context->protect_system != PROTECT_SYSTEM_NO ||
               context->protect_home != PROTECT_HOME_NO ||
-               exec_needs_pid_namespace(context) ||
+               exec_needs_pid_namespace(context, params) ||
               context->protect_kernel_tunables ||
               context->protect_kernel_modules ||
               context->protect_kernel_logs ||
-               exec_needs_cgroup_mount(context) ||
+               exec_needs_cgroup_mount(context, params) ||
               context->protect_clock ||
               context->protect_hostname != PROTECT_HOSTNAME_NO ||
               !strv_isempty(context->read_write_paths) ||
@ -4284,7 +4289,7 @@ static bool exec_namespace_is_delegated(
        /* If we need unprivileged private users, we've already unshared a user namespace by the time we call
         * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
         * unsharing in the first call to setup_delegated_namespaces() by returning false here. */
-        if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context))
+        if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params))
                return false;

        if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
@ -4379,7 +4384,7 @@ static int setup_delegated_namespaces(
                        log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
        }

-        if (needs_sandboxing && exec_needs_cgroup_namespace(context) &&
+        if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) &&
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
                if (unshare(CLONE_NEWCGROUP) < 0) {
                        *reterr_exit_status = EXIT_NAMESPACE;
@ -4391,7 +4396,7 @@ static int setup_delegated_namespaces(

        /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
         * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
-        if (needs_sandboxing && exec_needs_pid_namespace(context) &&
+        if (needs_sandboxing && exec_needs_pid_namespace(context, params) &&
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) {
                if (params->pidref_transport_fd < 0) {
                        *reterr_exit_status = EXIT_NAMESPACE;
@ -4641,6 +4646,49 @@ static void prepare_terminal(
                (void) osc_context_open_service(p->unit_id, p->invocation_id, /* ret_seq= */ NULL);
 }

+/* Attaches 'pidref' (or the calling process if 'pidref' is NULL) to the cgroup path that
+ * exec_params_get_cgroup_path() derives from 'prefix'. Returns 0 without attaching if that helper
+ * returns 0. On failure, stores EXIT_CGROUP in *reterr_exit_status and logs the error. */
+static int cg_subgroup_attach_pidref(
+                const ExecContext *context,
+                const CGroupContext *cgroup_context,
+                const ExecParameters *params,
+                const char *prefix,
+                const PidRef *pidref,
+                int *reterr_exit_status) {
+
+        _cleanup_free_ char *path = NULL;
+        int r;
+
+        assert(context);
+        assert(cgroup_context);
+        assert(params);
+        assert(reterr_exit_status);
+
+        r = exec_params_get_cgroup_path(params, cgroup_context, prefix, &path);
+        if (r < 0) {
+                *reterr_exit_status = EXIT_CGROUP;
+                return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
+        }
+        if (r == 0)
+                return 0;
+
+        r = cg_attach(path, pidref ? pidref->pid : 0);
+        if (r >= 0)
+                return 0;
+
+        pid_t pid = pidref ? pidref->pid : getpid_cached();
+        *reterr_exit_status = EXIT_CGROUP;
+        if (r == -EUCLEAN)
+                return log_exec_error_errno(context, params, r,
+                                            "Failed to attach process " PID_FMT " to cgroup '%s', "
+                                            "because the cgroup or one of its parents or "
+                                            "siblings is in the threaded mode.", pid, path);
+
+        return log_exec_error_errno(context, params, r,
+                                    "Failed to attach process " PID_FMT " to cgroup %s: %m", pid, path);
+}
+
 int exec_invoke(
                const ExecCommand *command,
                const ExecContext *context,
@ -4956,28 +5004,37 @@ int exec_invoke(
        if (socket_fd >= 0)
                (void) fd_nonblock(socket_fd, false);

+        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
+         * from it. */
+        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+
        /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
         * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
        if (params->cgroup_path) {
                _cleanup_free_ char *p = NULL;

-                r = exec_params_get_cgroup_path(params, cgroup_context, &p);
+                r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
                if (r < 0) {
                        *exit_status = EXIT_CGROUP;
                        return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
                }

-                r = cg_attach(p, 0);
+                /* We cannot spawn the main service process into the subcgroup as it needs to unshare the
+                 * cgroup namespace first if one is configured to make sure the root of the cgroup namespace
+                 * is the service cgroup and not the subcgroup. */
+                const char *cgtarget = needs_sandboxing && exec_needs_cgroup_namespace(context, params) ? params->cgroup_path : p;
+
+                r = cg_attach(cgtarget, 0);
                if (r == -EUCLEAN) {
                        *exit_status = EXIT_CGROUP;
                        return log_exec_error_errno(context, params, r,
                                                    "Failed to attach process to cgroup '%s', "
                                                    "because the cgroup or one of its parents or "
-                                                    "siblings is in the threaded mode.", p);
+                                                    "siblings is in the threaded mode.", cgtarget);
                }
                if (r < 0) {
                        *exit_status = EXIT_CGROUP;
-                        return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
+                        return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", cgtarget);
                }
        }

@ -5177,10 +5234,6 @@ int exec_invoke(
                }
        }

-        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
-         * from it. */
-        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
-
        if (params->cgroup_path) {
                /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
                 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
@ -5196,7 +5249,7 @@ int exec_invoke(
                                return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
                        }

-                        r = exec_params_get_cgroup_path(params, cgroup_context, &p);
+                        r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
                        if (r < 0) {
                                *exit_status = EXIT_CGROUP;
                                return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
@ -5228,7 +5281,7 @@ int exec_invoke(
                                 * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
                                 * pressure path environment variable or read-write mount to the unit. This is why we check if
                                 * memory_pressure_path != NULL in the conditional below. */
-                                if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) {
+                                if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
                                        memory_pressure_path = mfree(memory_pressure_path);
                                        r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
                                        if (r < 0) {
@ -5371,12 +5424,20 @@ int exec_invoke(
                 * wins here. (See above.) */

                /* All fds passed in the fds array will be closed in the pam child process. */
-                r = setup_pam(context, params, username, uid, gid, &accum_env, params->fds, n_fds, params->exec_fd);
+                PidRef pam_pidref;
+                r = setup_pam(context, params, username, uid, gid, &accum_env, params->fds, n_fds, params->exec_fd, &pam_pidref);
                if (r < 0) {
                        *exit_status = EXIT_PAM;
                        return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
                }

+                if (params->cgroup_path) {
+                        /* Move PAM into subgroup immediately if one is configured. */
+                        r = cg_subgroup_attach_pidref(context, cgroup_context, params, params->cgroup_path, &pam_pidref, exit_status);
+                        if (r < 0)
+                                return r;
+                }
+
                /* PAM modules might have set some ambient caps. Query them here and merge them into
                 * the caps we want to set in the end, so that we don't end up unsetting them. */
                uint64_t ambient_after_pam;
@ -5395,7 +5456,7 @@ int exec_invoke(
                }
        }

-        if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context)) {
+        if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context, params)) {
                /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
                 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
                 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
@ -5498,6 +5559,23 @@ int exec_invoke(
        if (r < 0)
                return r;

+        /* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
+         * ensures the root of the cgroup namespace is the top level service cgroup and not the
+         * subcgroup. Don't do this for control processes that are spawned immediately into a
+         * subcgroup, as those are already in the right place. */
+        if (needs_sandboxing && exec_needs_cgroup_namespace(context, params) && params->cgroup_path) {
+                r = cg_subgroup_attach_pidref(
+                                context,
+                                cgroup_context,
+                                params,
+                                /* Adjust the prefix accordingly since we're in a cgroup namespace now. */
+                                /* prefix= */ NULL,
+                                /* pidref= */ NULL,
+                                exit_status);
+                if (r < 0)
+                        return r;
+        }
+
        /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
         * shall execute. */

--- a/src/core/execute.c
+++ b/src/core/execute.c
@ -236,9 +236,15 @@ static bool needs_cgroup_namespace(ProtectControlGroups i) {
        return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT);
 }

-ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context) {
+ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params) {
        assert(context);

+        /* CGroup namespaces don't really make sense for control processes and can't really be supported as
+         * (some) control processes need to be spawned directly into a subcgroup to avoid violating the
+         * "no inner processes" rule of cgroupv2, so don't do any cgroup namespacing for control processes. */
+        if (params && needs_cgroup_namespace(context->protect_control_groups) && FLAGS_SET(params->flags, EXEC_IS_CONTROL))
+                return PROTECT_CONTROL_GROUPS_YES;
+
        /* If cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually
         * use cgroup namespace, we ignore the setting and do not unshare the namespace.
         * ProtectControlGroups=private and strict get downgraded to no and yes respectively. This ensures
@ -252,27 +258,31 @@ ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context)
        return context->protect_control_groups;
 }

-bool exec_needs_cgroup_namespace(const ExecContext *context) {
+bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
        assert(context);

-        return needs_cgroup_namespace(exec_get_protect_control_groups(context));
+        return needs_cgroup_namespace(exec_get_protect_control_groups(context, params));
 }

-bool exec_needs_cgroup_mount(const ExecContext *context) {
+bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) {
        assert(context);

-        return exec_get_protect_control_groups(context) != PROTECT_CONTROL_GROUPS_NO;
+        return exec_get_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO;
 }

-bool exec_is_cgroup_mount_read_only(const ExecContext *context) {
+bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) {
        assert(context);

-        return IN_SET(exec_get_protect_control_groups(context), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
+        return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
 }

-bool exec_needs_pid_namespace(const ExecContext *context) {
+bool exec_needs_pid_namespace(const ExecContext *context, const ExecParameters *params) {
        assert(context);

+        /* PID namespaces don't really make sense for control processes so let's not use them for those. */
+        if (params && FLAGS_SET(params->flags, EXEC_IS_CONTROL))
+                return false;
+
        return context->private_pids != PRIVATE_PIDS_NO && ns_type_supported(NAMESPACE_PID);
 }

@ -325,11 +335,11 @@ bool exec_needs_mount_namespace(
            context->protect_kernel_tunables ||
            context->protect_kernel_modules ||
            context->protect_kernel_logs ||
-            exec_needs_cgroup_mount(context) ||
+            exec_needs_cgroup_mount(context, params) ||
            context->protect_proc != PROTECT_PROC_DEFAULT ||
            context->proc_subset != PROC_SUBSET_ALL ||
            exec_needs_ipc_namespace(context) ||
-            exec_needs_pid_namespace(context))
+            exec_needs_pid_namespace(context, params))
                return true;

        if (context->root_directory) {
@ -399,20 +409,25 @@ bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType typ
        return true;
 }

+/* True if all of EXEC_CGROUP_DELEGATE, EXEC_CONTROL_CGROUP and EXEC_IS_CONTROL are set in params->flags. */
+static bool exec_params_needs_control_subcgroup(const ExecParameters *params) {
+        assert(params);
+        return FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE|EXEC_CONTROL_CGROUP|EXEC_IS_CONTROL);
+}
+
 int exec_params_get_cgroup_path(
                const ExecParameters *params,
                const CGroupContext *c,
+                const char *prefix,
                char **ret) {

        const char *subgroup = NULL;
        char *p;

        assert(params);
+        assert(c);
        assert(ret);

-        if (!params->cgroup_path)
-                return -EINVAL;
-
        /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
         * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
         * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
@ -430,9 +445,9 @@ int exec_params_get_cgroup_path(
        }

        if (subgroup)
-                p = path_join(params->cgroup_path, subgroup);
+                p = path_join(prefix, subgroup);
        else
-                p = strdup(params->cgroup_path);
+                p = strdup(strempty(prefix));
        if (!p)
                return -ENOMEM;

@ -501,7 +516,7 @@ int exec_spawn(
        log_command_line(unit, "About to execute", command->path, command->argv);

        if (params->cgroup_path) {
-                r = exec_params_get_cgroup_path(params, cgroup_context, &subcgroup_path);
+                r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup_path);
                if (r < 0)
                        return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
                if (r > 0) {
@ -514,6 +529,13 @@ int exec_spawn(
                }
        }

+        /* We cannot spawn the main service process into the subcgroup as it might need to unshare the cgroup
+         * namespace first if one is configured to make sure the root of the cgroup namespace is the service
+         * cgroup and not the subcgroup. However, when running control commands on a live service, the
+         * commands have to be spawned inside a subcgroup, otherwise we violate the no inner processes rule
+         * of cgroupv2. */
+        const char *cgtarget = exec_params_needs_control_subcgroup(params) ? subcgroup_path : params->cgroup_path;
+
        /* In order to avoid copy-on-write traps and OOM-kills when pid1's memory.current is above the
         * child's memory.max, serialize all the state needed to start the unit, and pass it to the
         * systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec
@ -576,24 +598,24 @@ int exec_spawn(
                                  "--log-level", max_log_levels,
                                  "--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
                        environ,
-                        subcgroup_path,
+                        cgtarget,
                        &pidref);

        /* Drop the ambient set again, so no processes other than sd-executore spawned from the manager inherit it. */
        (void) capability_ambient_set_apply(0, /* also_inherit= */ false);

-        if (r == -EUCLEAN && subcgroup_path)
+        if (r == -EUCLEAN && cgtarget)
                return log_unit_error_errno(unit, r,
                                            "Failed to spawn process into cgroup '%s', because the cgroup "
                                            "or one of its parents or siblings is in the threaded mode.",
-                                            subcgroup_path);
+                                            cgtarget);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
        /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
         * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
         * process will be killed too). */
-        if (r == 0 && subcgroup_path)
-                (void) cg_attach(subcgroup_path, pidref.pid);
+        if (r == 0 && cgtarget)
+                (void) cg_attach(cgtarget, pidref.pid);
        /* r > 0: Already in the right cgroup thanks to CLONE_INTO_CGROUP */

        log_unit_debug(unit, "Forked %s as " PID_FMT " (%s CLONE_INTO_CGROUP)",
--- a/src/core/execute.h
+++ b/src/core/execute.h
@ -585,7 +585,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free);
 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);
 void exec_runtime_clear(ExecRuntime *rt);

-int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, char **ret);
+int exec_params_get_cgroup_path(const ExecParameters *params, const CGroupContext *c, const char *prefix, char **ret);
 void exec_params_shallow_clear(ExecParameters *p);
 void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix);
 void exec_params_deep_clear(ExecParameters *p);
@ -629,12 +629,12 @@ ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_;
 bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime);
 bool exec_needs_network_namespace(const ExecContext *context);
 bool exec_needs_ipc_namespace(const ExecContext *context);
-bool exec_needs_pid_namespace(const ExecContext *context);
+bool exec_needs_pid_namespace(const ExecContext *context, const ExecParameters *params);

-ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context);
-bool exec_needs_cgroup_namespace(const ExecContext *context);
-bool exec_needs_cgroup_mount(const ExecContext *context);
-bool exec_is_cgroup_mount_read_only(const ExecContext *context);
+ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
+bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
+bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params);
+bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params);

 const char* exec_get_private_notify_socket_path(const ExecContext *context, const ExecParameters *params, bool needs_sandboxing);

--- a/src/core/service.c
+++ b/src/core/service.c
@ -711,7 +711,7 @@ static int service_verify(Service *s) {
        if (s->type == SERVICE_DBUS && !s->bus_name)
                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing.");

-        if (s->type == SERVICE_FORKING && exec_needs_pid_namespace(&s->exec_context))
+        if (s->type == SERVICE_FORKING && exec_needs_pid_namespace(&s->exec_context, /* params= */ NULL))
                return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service of Type=forking does not support PrivatePIDs=yes. Refusing.");

        if (s->usb_function_descriptors && !s->usb_function_strings)
--- a/src/core/unit.c
+++ b/src/core/unit.c
@ -4181,7 +4181,7 @@ static int unit_verify_contexts(const Unit *u) {
            exec_needs_mount_namespace(ec, /* params = */ NULL, /* runtime = */ NULL))
                return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "WorkingDirectory= may not be below /proc/, /sys/ or /dev/ when using mount namespacing. Refusing.");

-        if (exec_needs_pid_namespace(ec) && !UNIT_VTABLE(u)->notify_pidref)
+        if (exec_needs_pid_namespace(ec, /* params= */ NULL) && !UNIT_VTABLE(u)->notify_pidref)
                return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "PrivatePIDs= setting is only supported for service units. Refusing.");

        const KillContext *kc = unit_get_kill_context(u);
--- a/test/units/TEST-07-PID1.protect-control-groups.sh
+++ b/test/units/TEST-07-PID1.protect-control-groups.sh
@ -104,4 +104,32 @@ testcase_basic_strict() {
    test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG"
 }

+testcase_delegate_subgroup() {
+    # With DelegateSubgroup= in use, the cgroup namespace must still be rooted at the service cgroup,
+    # so the "supervisor" subgroup has to be visible directly below /sys/fs/cgroup.
+    systemd-run --wait --pipe \
+        --property=ProtectControlGroupsEx=private \
+        --property=PrivateMounts=yes \
+        --property=Delegate=yes \
+        --property=DelegateSubgroup=supervisor \
+        ls /sys/fs/cgroup/supervisor
+}
+
+
+testcase_delegate_subgroup_control() {
+    # Control processes (here: ExecStartPre=) must not be placed in a cgroup namespace and must
+    # still end up in the unit's .control cgroup, so they report the full host-side cgroup path.
+    local unit=delegate-subgroup-control
+    assert_eq "$(
+        systemd-run --wait --pipe \
+            --unit="$unit" \
+            --property=ProtectControlGroupsEx=private \
+            --property=PrivateMounts=yes \
+            --property=Delegate=yes \
+            --property=DelegateSubgroup=supervisor \
+            --property=ExecStartPre="cat /proc/self/cgroup" \
+            true
+    )" "0::/system.slice/$unit.service/.control"
+}
+
 run_testcases