Compare commits

...

2 Commits

Author SHA1 Message Date
Ryan Wilson efe460b0d1
Merge bf05e30acc into 5bed97dd57 2024-11-26 13:51:29 +00:00
Ryan Wilson bf05e30acc core: Add PrivateUsers=full
Recently, PrivateUsers=identity was added to support mapping the first
65536 UIDs/GIDs from parent to the child namespace and mapping the other
UID/GIDs to the nobody user.

However, there are use cases where users have UIDs/GIDs > 65536 and need
to do a similar identity mapping. Moreover, in some of those cases, users
want a full identity mapping from 0 -> UID_MAX.

To support this, we add PrivateUsers=full that does identity mapping for
all available UID/GIDs.

Note that to differentiate ourselves from the init user namespace, we need
to set up the uid_map/gid_map like:
```
0 0 1
1 1 UINT32_MAX - 1
```

as the init user namespace uses `0 0 UINT32_MAX` and some applications -
like systemd itself - determine whether they are running in a non-init user
namespace based on the contents of the uid_map/gid_map files.

Fixes: #35168
2024-11-15 12:30:51 -08:00
5 changed files with 32 additions and 2 deletions

View File

@ -2009,8 +2009,8 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
<varlistentry>
<term><varname>PrivateUsers=</varname></term>
<listitem><para>Takes a boolean argument or one of <literal>self</literal> or
<literal>identity</literal>. Defaults to false. If enabled, sets up a new user namespace for the
<listitem><para>Takes a boolean argument or one of <literal>self</literal>, <literal>identity</literal>,
or <literal>full</literal>. Defaults to false. If enabled, sets up a new user namespace for the
executed processes and configures a user and group mapping. If set to a true value or
<literal>self</literal>, a minimal user and group mapping is configured that maps the
<literal>root</literal> user and group as well as the unit's own user and group to themselves and
@ -2026,6 +2026,10 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
since all UIDs/GIDs are chosen identically it does provide process capability isolation, and hence is
often a good choice if proper user namespacing with distinct UID maps is not appropriate.</para>
<para>If the parameter is <literal>full</literal>, user namespacing is set up with an identity
mapping for all UIDs/GIDs. Similar to <literal>identity</literal>, this does not provide UID/GID
isolation, but it does provide process capability isolation.</para>
<para>If this mode is enabled, all unit processes are run without privileges in the host user
namespace (regardless of whether the unit's own user/group is <literal>root</literal> or not). Specifically
this means that the process will have zero process capabilities on the host's user namespace, but

View File

@ -2103,6 +2103,23 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
uid_map = strdup("0 0 65536\n");
if (!uid_map)
return -ENOMEM;
} else if (private_users == PRIVATE_USERS_FULL) {
/* Map all UID/GID from original to new user namespace. We can't use `0 0 UINT32_MAX` because
* this is the same UID/GID map as the init user namespace and there are various applications
* (e.g. systemd's running_in_userns()) that check whether they are in a user namespace by
* comparing uid_map/gid_map to `0 0 UINT32_MAX`. Thus, we still map all UIDs/GIDs but do it
* using two extents to differentiate the new user namespace from the init namespace:
* 0 0 1
* 1 1 UINT32_MAX - 1
*
* Note the kernel defines the UID range between 0 and UINT32_MAX so we map all UIDs even though
* the UID range beyond INT32_MAX (i.e. the range above the signed 32-bit range) is
* icky. For example, setfsuid() returns the old UID as a signed integer. But units can decide to
* use these UIDs/GIDs so we need to map them. */
r = asprintf(&uid_map, "0 0 1\n"
"1 1 " UID_FMT "\n", UINT32_MAX - 1);
if (r < 0)
return -ENOMEM;
/* Can only set up multiple mappings with CAP_SETUID. */
} else if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid)) {
r = asprintf(&uid_map,
@ -2123,6 +2140,11 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
gid_map = strdup("0 0 65536\n");
if (!gid_map)
return -ENOMEM;
} else if (private_users == PRIVATE_USERS_FULL) {
r = asprintf(&gid_map, "0 0 1\n"
"1 1 " UID_FMT "\n", UINT32_MAX - 1);
if (r < 0)
return -ENOMEM;
/* Can only set up multiple mappings with CAP_SETGID. */
} else if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid)) {
r = asprintf(&gid_map,

View File

@ -3364,6 +3364,7 @@ static const char* const private_users_table[_PRIVATE_USERS_MAX] = {
[PRIVATE_USERS_NO] = "no",
[PRIVATE_USERS_SELF] = "self",
[PRIVATE_USERS_IDENTITY] = "identity",
[PRIVATE_USERS_FULL] = "full",
};
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF);

View File

@ -65,6 +65,7 @@ typedef enum PrivateUsers {
PRIVATE_USERS_NO,
PRIVATE_USERS_SELF,
PRIVATE_USERS_IDENTITY,
PRIVATE_USERS_FULL,
_PRIVATE_USERS_MAX,
_PRIVATE_USERS_INVALID = -EINVAL,
} PrivateUsers;

View File

@ -10,3 +10,5 @@ systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/uid_ma
systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"'
systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"'
systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 65536"'
systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/uid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"'
systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/gid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"'