1
0
mirror of https://github.com/systemd/systemd synced 2025-11-17 15:54:45 +01:00

Compare commits

..

4 Commits

Author SHA1 Message Date
Yu Watanabe
404d9bfb62
mount-setup: Add memory_hugetlb_accounting to cgroupfs mount (#39486)
This mount option will count HugeTLB memory usage towards the cgroup’s
overall memory usage for the memory controller.

See
https://lore.kernel.org/all/20231006184629.155543-4-nphamcs@gmail.com/T/#u
for the patch introducing the new mount option.
2025-10-31 15:20:10 +09:00
Daan De Meyer
886c078702 mount-setup: Add memory_hugetlb_accounting to cgroupfs mount
This mount option will count HugeTLB memory usage towards the cgroup’s
overall memory usage for the memory controller.

See https://lore.kernel.org/all/20231006184629.155543-4-nphamcs@gmail.com/T/#u
for the patch introducing the new mount option.
2025-10-30 22:28:41 +01:00
Daan De Meyer
5ce388aec8 mount-setup: Add optional function which provides extra mount options 2025-10-30 22:28:39 +01:00
Daan De Meyer
c46344d597 mount-setup: Reformat table
Preparation for the next commit.
2025-10-30 19:59:29 +01:00
3 changed files with 179 additions and 58 deletions

5
NEWS
View File

@ -2,6 +2,11 @@ systemd System and Service Manager
CHANGES WITH 259 in spe: CHANGES WITH 259 in spe:
* The cgroup2 file system is now mounted with the
"memory_hugetlb_accounting" mount option, supported since kernel 6.6.
This means that HugeTLB memory usage is now counted towards the
cgroup's overall memory usage for the memory controller.
* homectl's --recovery-key= option may now be used with the "update" * homectl's --recovery-key= option may now be used with the "update"
command to add recovery keys to existing user accounts. Previously, command to add recovery keys to existing user accounts. Previously,
recovery keys could only be configured during initial user creation. recovery keys could only be configured during initial user creation.

3
README
View File

@ -68,7 +68,8 @@ REQUIREMENTS:
≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option ≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option
≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD, ≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD,
and MOVE_MOUNT_BENEATH and MOVE_MOUNT_BENEATH
≥ 6.6 for quota support on tmpfs ≥ 6.6 for quota support on tmpfs and cgroup2fs memory_hugetlb_accounting
option
≥ 6.9 for pidfs ≥ 6.9 for pidfs
≥ 6.10 for fcntl(F_DUPFD_QUERY), unprivileged linkat(AT_EMPTY_PATH), ≥ 6.10 for fcntl(F_DUPFD_QUERY), unprivileged linkat(AT_EMPTY_PATH),
and block device 'partscan' sysfs attribute and block device 'partscan' sysfs attribute

View File

@ -25,12 +25,10 @@
#include "virt.h" #include "virt.h"
typedef enum MountMode { typedef enum MountMode {
MNT_NONE = 0,
MNT_FATAL = 1 << 0, MNT_FATAL = 1 << 0,
MNT_IN_CONTAINER = 1 << 1, MNT_IN_CONTAINER = 1 << 1,
MNT_CHECK_WRITABLE = 1 << 2, MNT_CHECK_WRITABLE = 1 << 2,
MNT_FOLLOW_SYMLINK = 1 << 3, MNT_FOLLOW_SYMLINK = 1 << 3,
MNT_USRQUOTA_GRACEFUL = 1 << 4,
} MountMode; } MountMode;
typedef struct MountPoint { typedef struct MountPoint {
@ -38,79 +36,198 @@ typedef struct MountPoint {
const char *where; const char *where;
const char *type; const char *type;
const char *options; const char *options;
int (*options_fn)(int priority, const char *type, char **ret);
unsigned long flags; unsigned long flags;
MountMode mode; MountMode mode;
bool (*condition_fn)(void); bool (*condition_fn)(void);
} MountPoint; } MountPoint;
static bool cgroupfs_recursiveprot_supported(void) { static int cgroupfs_mount_options(int priority, const char *type, char **ret) {
int r; int r;
/* Added in kernel 5.7 */ assert(type);
assert(streq(type, "cgroup2"));
assert(ret);
r = mount_option_supported("cgroup2", "memory_recursiveprot", /* value = */ NULL); _cleanup_free_ char *opts = NULL;
FOREACH_STRING(o, "memory_recursiveprot", "memory_hugetlb_accounting") {
r = mount_option_supported("cgroup2", o, /* value = */ NULL);
if (r < 0) if (r < 0)
log_debug_errno(r, "Failed to determine whether cgroupfs supports 'memory_recursiveprot' mount option, assuming not: %m"); log_full_errno(priority, r, "Failed to determine whether cgroupfs supports '%s' mount option, assuming not: %m", o);
else if (r == 0) else if (r == 0)
log_debug("'memory_recursiveprot' not supported by cgroupfs, not using mount option."); log_debug("'%s' not supported by cgroupfs, not using mount option.", o);
else if (!strextend_with_separator(&opts, ",", o))
return log_oom_full(priority);
}
return r > 0; *ret = TAKE_PTR(opts);
return 0;
} }
int mount_cgroupfs(const char *path) { int mount_cgroupfs(const char *path) {
int r;
assert(path); assert(path);
/* Mount a separate cgroupfs instance, taking all options we initially set into account. This is /* Mount a separate cgroupfs instance, taking all options we initially set into account. This is
* especially useful when cgroup namespace is *not* employed, since the kernel overrides all * especially useful when cgroup namespace is *not* employed, since the kernel overrides all
* previous options if a new mount is established in initial cgns (c.f. * previous options if a new mount is established in initial cgns (c.f.
* https://github.com/torvalds/linux/blob/b69bb476dee99d564d65d418e9a20acca6f32c3f/kernel/cgroup/cgroup.c#L1984) * https://github.com/torvalds/linux/blob/b69bb476dee99d564d65d418e9a20acca6f32c3f/kernel/cgroup/cgroup.c#L1984)
* */
* The options shall be kept in sync with those in mount_table below. */
return mount_nofollow_verbose(LOG_ERR, "cgroup2", path, "cgroup2", _cleanup_free_ char *opts = NULL;
MS_NOSUID|MS_NOEXEC|MS_NODEV, r = cgroupfs_mount_options(LOG_WARNING, "cgroup2", &opts);
cgroupfs_recursiveprot_supported() ? "nsdelegate,memory_recursiveprot" : "nsdelegate"); if (r < 0)
return r;
/* These options shall be kept in sync with those in mount_table below. */
if (!strprepend_with_separator(&opts, ",", "nsdelegate"))
return log_oom();
return mount_nofollow_verbose(LOG_ERR, "cgroup2", path, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
}
/* Options callback for mount table entries that want user quota enabled when it is available.
 *
 * Asks mount_option_supported() whether file system 'type' accepts the "usrquota" option:
 *   - positive answer: *ret receives a string duplicated from "usrquota" (caller frees it);
 *   - zero:            *ret is set to NULL, a debug message is logged;
 *   - negative answer: *ret is set to NULL, the error is logged at 'priority'.
 *
 * Returns 0 in all three cases. The only early exit is an allocation failure, where whatever
 * log_oom_full() yields is returned and *ret is left untouched. */
static int usrquota_mount_option(int priority, const char *type, char **ret) {
        char *option = NULL;
        int supported;

        assert(type);
        assert(ret);

        supported = mount_option_supported(type, "usrquota", /* value= */ NULL);
        if (supported > 0) {
                option = strdup("usrquota");
                if (!option)
                        return log_oom_full(priority);
        } else if (supported == 0) {
                log_debug("Not enabling 'usrquota' for '%s' as kernel lacks support for it.", type);
        } else {
                /* Deliberately not fatal: a failed probe is handled like missing support. */
                log_full_errno(priority, supported, "Unable to determine whether %s supports 'usrquota' mount option, assuming not: %m", type);
        }

        *ret = option;
        return 0;
}
static const MountPoint mount_table[] = { static const MountPoint mount_table[] = {
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, {
MNT_FATAL|MNT_IN_CONTAINER|MNT_FOLLOW_SYMLINK }, .what = "proc",
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, .where = "/proc",
MNT_FATAL|MNT_IN_CONTAINER }, .type = "proc",
{ "devtmpfs", "/dev", "devtmpfs", "mode=0755" TMPFS_LIMITS_DEV, MS_NOSUID|MS_STRICTATIME, .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
MNT_FATAL|MNT_IN_CONTAINER }, .mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_FOLLOW_SYMLINK,
{ "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, },
MNT_NONE }, {
.what = "sysfs",
.where = "/sys",
.type = "sysfs",
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
.mode = MNT_FATAL|MNT_IN_CONTAINER,
},
{
.what = "devtmpfs",
.where = "/dev",
.type = "devtmpfs",
.options = "mode=0755" TMPFS_LIMITS_DEV,
.flags = MS_NOSUID|MS_STRICTATIME,
.mode = MNT_FATAL|MNT_IN_CONTAINER,
},
{
.what = "securityfs",
.where = "/sys/kernel/security",
.type = "securityfs",
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
},
#if ENABLE_SMACK #if ENABLE_SMACK
{ "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV, {
MNT_FATAL, mac_smack_use }, .what = "smackfs",
{ "tmpfs", "/dev/shm", "tmpfs", "mode=01777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME, .where = "/sys/fs/smackfs",
MNT_FATAL|MNT_USRQUOTA_GRACEFUL, mac_smack_use }, .type = "smackfs",
.options = "smackfsdef=*",
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
.mode = MNT_FATAL,
.condition_fn = mac_smack_use,
},
{
.what = "tmpfs",
.where = "/dev/shm",
.type = "tmpfs",
.options = "mode=01777,smackfsroot=*",
.options_fn = usrquota_mount_option,
.flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME,
.mode = MNT_FATAL,
.condition_fn = mac_smack_use,
},
#endif #endif
{ "tmpfs", "/dev/shm", "tmpfs", "mode=01777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, {
MNT_FATAL|MNT_IN_CONTAINER|MNT_USRQUOTA_GRACEFUL }, .what = "tmpfs",
{ "devpts", "/dev/pts", "devpts", "mode=" STRINGIFY(TTY_MODE) ",gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, .where = "/dev/shm",
MNT_IN_CONTAINER }, .type = "tmpfs",
.options = "mode=01777",
.options_fn = usrquota_mount_option,
.flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME,
.mode = MNT_FATAL|MNT_IN_CONTAINER,
},
{
.what = "devpts",
.where = "/dev/pts",
.type = "devpts",
.options = "mode=" STRINGIFY(TTY_MODE) ",gid=" STRINGIFY(TTY_GID),
.flags = MS_NOSUID|MS_NOEXEC,
.mode = MNT_IN_CONTAINER,
},
#if ENABLE_SMACK #if ENABLE_SMACK
{ "tmpfs", "/run", "tmpfs", "mode=0755,smackfsroot=*" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME, {
MNT_FATAL, mac_smack_use }, .what = "tmpfs",
.where = "/run",
.type = "tmpfs",
.options = "mode=0755,smackfsroot=*" TMPFS_LIMITS_RUN,
.flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME,
.mode = MNT_FATAL,
.condition_fn = mac_smack_use,
},
#endif #endif
{ "tmpfs", "/run", "tmpfs", "mode=0755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME, {
MNT_FATAL|MNT_IN_CONTAINER }, .what = "tmpfs",
{ "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate,memory_recursiveprot", MS_NOSUID|MS_NOEXEC|MS_NODEV, .where = "/run",
MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE, cgroupfs_recursiveprot_supported }, .type = "tmpfs",
{ "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV, .options = "mode=0755" TMPFS_LIMITS_RUN,
MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME,
.mode = MNT_FATAL|MNT_IN_CONTAINER,
},
{
.what = "cgroup2",
.where = "/sys/fs/cgroup",
.type = "cgroup2",
.options = "nsdelegate",
.options_fn = cgroupfs_mount_options,
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
.mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE,
},
#if ENABLE_PSTORE #if ENABLE_PSTORE
{ "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, {
MNT_NONE }, .what = "pstore",
.where = "/sys/fs/pstore",
.type = "pstore",
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
},
#endif #endif
#if ENABLE_EFI #if ENABLE_EFI
{ "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, {
MNT_NONE, is_efi_boot }, .what = "efivarfs",
.where = "/sys/firmware/efi/efivars",
.type = "efivarfs",
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
.condition_fn = is_efi_boot,
},
#endif #endif
{ "bpf", "/sys/fs/bpf", "bpf", "mode=0700", MS_NOSUID|MS_NOEXEC|MS_NODEV, {
MNT_NONE }, .what = "bpf",
.where = "/sys/fs/bpf",
.type = "bpf",
.options = "mode=0700",
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
},
}; };
/* The first three entries we might need before SELinux is up. The /* The first three entries we might need before SELinux is up. The
@ -193,20 +310,18 @@ static int mount_one(const MountPoint *p, bool relabel) {
(void) mkdir_p(p->where, 0755); (void) mkdir_p(p->where, 0755);
_cleanup_free_ char *extend_options = NULL; _cleanup_free_ char *extend_options = NULL;
const char *o = p->options; const char *o;
if (FLAGS_SET(p->mode, MNT_USRQUOTA_GRACEFUL)) { if (p->options_fn) {
r = mount_option_supported(p->type, "usrquota", /* value= */ NULL); r = p->options_fn(priority, p->type, &extend_options);
if (r < 0) if (r < 0)
log_full_errno(priority, r, "Unable to determine whether %s supports 'usrquota' mount option, assuming not: %m", p->type); return r;
else if (r == 0)
log_debug("Not enabling 'usrquota' on '%s' as kernel lacks support for it.", p->where); if (!strprepend_with_separator(&extend_options, ",", p->options))
else {
if (!strextend_with_separator(&extend_options, ",", p->options ?: POINTER_MAX, "usrquota"))
return log_oom(); return log_oom();
o = extend_options; o = extend_options;
} } else
} o = p->options;
r = mount_verbose_full(priority, p->what, p->where, p->type, p->flags, o, FLAGS_SET(p->mode, MNT_FOLLOW_SYMLINK)); r = mount_verbose_full(priority, p->what, p->where, p->type, p->flags, o, FLAGS_SET(p->mode, MNT_FOLLOW_SYMLINK));
if (r < 0) if (r < 0)