1
0
mirror of https://github.com/systemd/systemd synced 2025-11-17 15:54:45 +01:00

Compare commits

..

No commits in common. "404d9bfb62a2ce14939b194e1f9f03e9dc537ae5" and "96d03f8e41dc7b59cc48811da65e2dc88b60e8f5" have entirely different histories.

3 changed files with 58 additions and 179 deletions

5
NEWS
View File

@ -2,11 +2,6 @@ systemd System and Service Manager
CHANGES WITH 259 in spe: CHANGES WITH 259 in spe:
* The cgroup2 file system is now mounted with the
"memory_hugetlb_accounting" mount option, supported since kernel 6.6.
This means that HugeTLB memory usage is now counted towards the
cgroups overall memory usage for the memory controller.
* homectl's --recovery-key= option may now be used with the "update" * homectl's --recovery-key= option may now be used with the "update"
command to add recovery keys to existing user accounts. Previously, command to add recovery keys to existing user accounts. Previously,
recovery keys could only be configured during initial user creation. recovery keys could only be configured during initial user creation.

3
README
View File

@ -68,8 +68,7 @@ REQUIREMENTS:
≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option ≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option
≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD, ≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD,
and MOVE_MOUNT_BENEATH and MOVE_MOUNT_BENEATH
≥ 6.6 for quota support on tmpfs and cgroup2fs memory_hugetlb_accounting ≥ 6.6 for quota support on tmpfs
option
≥ 6.9 for pidfs ≥ 6.9 for pidfs
≥ 6.10 for fcntl(F_DUPFD_QUERY), unprivileged linkat(AT_EMPTY_PATH), ≥ 6.10 for fcntl(F_DUPFD_QUERY), unprivileged linkat(AT_EMPTY_PATH),
and block device 'partscan' sysfs attribute and block device 'partscan' sysfs attribute

View File

@ -25,10 +25,12 @@
#include "virt.h" #include "virt.h"
typedef enum MountMode { typedef enum MountMode {
MNT_NONE = 0,
MNT_FATAL = 1 << 0, MNT_FATAL = 1 << 0,
MNT_IN_CONTAINER = 1 << 1, MNT_IN_CONTAINER = 1 << 1,
MNT_CHECK_WRITABLE = 1 << 2, MNT_CHECK_WRITABLE = 1 << 2,
MNT_FOLLOW_SYMLINK = 1 << 3, MNT_FOLLOW_SYMLINK = 1 << 3,
MNT_USRQUOTA_GRACEFUL = 1 << 4,
} MountMode; } MountMode;
typedef struct MountPoint { typedef struct MountPoint {
@ -36,198 +38,79 @@ typedef struct MountPoint {
const char *where; const char *where;
const char *type; const char *type;
const char *options; const char *options;
int (*options_fn)(int priority, const char *type, char **ret);
unsigned long flags; unsigned long flags;
MountMode mode; MountMode mode;
bool (*condition_fn)(void); bool (*condition_fn)(void);
} MountPoint; } MountPoint;
static int cgroupfs_mount_options(int priority, const char *type, char **ret) { static bool cgroupfs_recursiveprot_supported(void) {
int r; int r;
assert(type); /* Added in kernel 5.7 */
assert(streq(type, "cgroup2"));
assert(ret);
_cleanup_free_ char *opts = NULL; r = mount_option_supported("cgroup2", "memory_recursiveprot", /* value = */ NULL);
FOREACH_STRING(o, "memory_recursiveprot", "memory_hugetlb_accounting") {
r = mount_option_supported("cgroup2", o, /* value = */ NULL);
if (r < 0) if (r < 0)
log_full_errno(priority, r, "Failed to determine whether cgroupfs supports '%s' mount option, assuming not: %m", o); log_debug_errno(r, "Failed to determine whether cgroupfs supports 'memory_recursiveprot' mount option, assuming not: %m");
else if (r == 0) else if (r == 0)
log_debug("'%s' not supported by cgroupfs, not using mount option.", o); log_debug("'memory_recursiveprot' not supported by cgroupfs, not using mount option.");
else if (!strextend_with_separator(&opts, ",", o))
return log_oom_full(priority);
}
*ret = TAKE_PTR(opts); return r > 0;
return 0;
} }
int mount_cgroupfs(const char *path) { int mount_cgroupfs(const char *path) {
int r;
assert(path); assert(path);
/* Mount a separate cgroupfs instance, taking all options we initial set into account. This is /* Mount a separate cgroupfs instance, taking all options we initial set into account. This is
* especially useful when cgroup namespace is *not* employed, since the kernel overrides all * especially useful when cgroup namespace is *not* employed, since the kernel overrides all
* previous options if a new mount is established in initial cgns (c.f. * previous options if a new mount is established in initial cgns (c.f.
* https://github.com/torvalds/linux/blob/b69bb476dee99d564d65d418e9a20acca6f32c3f/kernel/cgroup/cgroup.c#L1984) * https://github.com/torvalds/linux/blob/b69bb476dee99d564d65d418e9a20acca6f32c3f/kernel/cgroup/cgroup.c#L1984)
*/ *
* The options shall be kept in sync with those in mount_table below. */
_cleanup_free_ char *opts = NULL; return mount_nofollow_verbose(LOG_ERR, "cgroup2", path, "cgroup2",
r = cgroupfs_mount_options(LOG_WARNING, "cgroup2", &opts); MS_NOSUID|MS_NOEXEC|MS_NODEV,
if (r < 0) cgroupfs_recursiveprot_supported() ? "nsdelegate,memory_recursiveprot" : "nsdelegate");
return r;
/* These options shall be kept in sync with those in mount_table below. */
if (!strprepend_with_separator(&opts, ",", "nsdelegate"))
return log_oom();
return mount_nofollow_verbose(LOG_ERR, "cgroup2", path, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
}
static int usrquota_mount_option(int priority, const char *type, char **ret) {
_cleanup_free_ char *o = NULL;
int r;
assert(type);
assert(ret);
r = mount_option_supported(type, "usrquota", /* value= */ NULL);
if (r < 0)
log_full_errno(priority, r, "Unable to determine whether %s supports 'usrquota' mount option, assuming not: %m", type);
else if (r == 0)
log_debug("Not enabling 'usrquota' for '%s' as kernel lacks support for it.", type);
else {
o = strdup("usrquota");
if (!o)
return log_oom_full(priority);
}
*ret = TAKE_PTR(o);
return 0;
} }
static const MountPoint mount_table[] = { static const MountPoint mount_table[] = {
{ { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
.what = "proc", MNT_FATAL|MNT_IN_CONTAINER|MNT_FOLLOW_SYMLINK },
.where = "/proc", { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
.type = "proc", MNT_FATAL|MNT_IN_CONTAINER },
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV, { "devtmpfs", "/dev", "devtmpfs", "mode=0755" TMPFS_LIMITS_DEV, MS_NOSUID|MS_STRICTATIME,
.mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_FOLLOW_SYMLINK, MNT_FATAL|MNT_IN_CONTAINER },
}, { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ MNT_NONE },
.what = "sysfs",
.where = "/sys",
.type = "sysfs",
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
.mode = MNT_FATAL|MNT_IN_CONTAINER,
},
{
.what = "devtmpfs",
.where = "/dev",
.type = "devtmpfs",
.options = "mode=0755" TMPFS_LIMITS_DEV,
.flags = MS_NOSUID|MS_STRICTATIME,
.mode = MNT_FATAL|MNT_IN_CONTAINER,
},
{
.what = "securityfs",
.where = "/sys/kernel/security",
.type = "securityfs",
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
},
#if ENABLE_SMACK #if ENABLE_SMACK
{ { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
.what = "smackfs", MNT_FATAL, mac_smack_use },
.where = "/sys/fs/smackfs", { "tmpfs", "/dev/shm", "tmpfs", "mode=01777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
.type = "smackfs", MNT_FATAL|MNT_USRQUOTA_GRACEFUL, mac_smack_use },
.options = "smackfsdef=*",
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
.mode = MNT_FATAL,
.condition_fn = mac_smack_use,
},
{
.what = "tmpfs",
.where = "/dev/shm",
.type = "tmpfs",
.options = "mode=01777,smackfsroot=*",
.options_fn = usrquota_mount_option,
.flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME,
.mode = MNT_FATAL,
.condition_fn = mac_smack_use,
},
#endif #endif
{ { "tmpfs", "/dev/shm", "tmpfs", "mode=01777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
.what = "tmpfs", MNT_FATAL|MNT_IN_CONTAINER|MNT_USRQUOTA_GRACEFUL },
.where = "/dev/shm", { "devpts", "/dev/pts", "devpts", "mode=" STRINGIFY(TTY_MODE) ",gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
.type = "tmpfs", MNT_IN_CONTAINER },
.options = "mode=01777",
.options_fn = usrquota_mount_option,
.flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME,
.mode = MNT_FATAL|MNT_IN_CONTAINER,
},
{
.what = "devpts",
.where = "/dev/pts",
.type = "devpts",
.options = "mode=" STRINGIFY(TTY_MODE) ",gid=" STRINGIFY(TTY_GID),
.flags = MS_NOSUID|MS_NOEXEC,
.mode = MNT_IN_CONTAINER,
},
#if ENABLE_SMACK #if ENABLE_SMACK
{ { "tmpfs", "/run", "tmpfs", "mode=0755,smackfsroot=*" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
.what = "tmpfs", MNT_FATAL, mac_smack_use },
.where = "/run",
.type = "tmpfs",
.options = "mode=0755,smackfsroot=*" TMPFS_LIMITS_RUN,
.flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME,
.mode = MNT_FATAL,
.condition_fn = mac_smack_use,
},
#endif #endif
{ { "tmpfs", "/run", "tmpfs", "mode=0755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
.what = "tmpfs", MNT_FATAL|MNT_IN_CONTAINER },
.where = "/run", { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate,memory_recursiveprot", MS_NOSUID|MS_NOEXEC|MS_NODEV,
.type = "tmpfs", MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE, cgroupfs_recursiveprot_supported },
.options = "mode=0755" TMPFS_LIMITS_RUN, { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
.flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME, MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
.mode = MNT_FATAL|MNT_IN_CONTAINER,
},
{
.what = "cgroup2",
.where = "/sys/fs/cgroup",
.type = "cgroup2",
.options = "nsdelegate",
.options_fn = cgroupfs_mount_options,
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
.mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE,
},
#if ENABLE_PSTORE #if ENABLE_PSTORE
{ { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
.what = "pstore", MNT_NONE },
.where = "/sys/fs/pstore",
.type = "pstore",
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
},
#endif #endif
#if ENABLE_EFI #if ENABLE_EFI
{ { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
.what = "efivarfs", MNT_NONE, is_efi_boot },
.where = "/sys/firmware/efi/efivars",
.type = "efivarfs",
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
.condition_fn = is_efi_boot,
},
#endif #endif
{ { "bpf", "/sys/fs/bpf", "bpf", "mode=0700", MS_NOSUID|MS_NOEXEC|MS_NODEV,
.what = "bpf", MNT_NONE },
.where = "/sys/fs/bpf",
.type = "bpf",
.options = "mode=0700",
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
},
}; };
/* The first three entries we might need before SELinux is up. The /* The first three entries we might need before SELinux is up. The
@ -310,18 +193,20 @@ static int mount_one(const MountPoint *p, bool relabel) {
(void) mkdir_p(p->where, 0755); (void) mkdir_p(p->where, 0755);
_cleanup_free_ char *extend_options = NULL; _cleanup_free_ char *extend_options = NULL;
const char *o; const char *o = p->options;
if (p->options_fn) { if (FLAGS_SET(p->mode, MNT_USRQUOTA_GRACEFUL)) {
r = p->options_fn(priority, p->type, &extend_options); r = mount_option_supported(p->type, "usrquota", /* value= */ NULL);
if (r < 0) if (r < 0)
return r; log_full_errno(priority, r, "Unable to determine whether %s supports 'usrquota' mount option, assuming not: %m", p->type);
else if (r == 0)
if (!strprepend_with_separator(&extend_options, ",", p->options)) log_debug("Not enabling 'usrquota' on '%s' as kernel lacks support for it.", p->where);
else {
if (!strextend_with_separator(&extend_options, ",", p->options ?: POINTER_MAX, "usrquota"))
return log_oom(); return log_oom();
o = extend_options; o = extend_options;
} else }
o = p->options; }
r = mount_verbose_full(priority, p->what, p->where, p->type, p->flags, o, FLAGS_SET(p->mode, MNT_FOLLOW_SYMLINK)); r = mount_verbose_full(priority, p->what, p->where, p->type, p->flags, o, FLAGS_SET(p->mode, MNT_FOLLOW_SYMLINK));
if (r < 0) if (r < 0)