1
0
mirror of https://github.com/systemd/systemd synced 2026-03-18 11:04:46 +01:00

Compare commits

...

12 Commits

Author SHA1 Message Date
Yu Watanabe
49eb2d50b4
Bump kernel requirements to >= 5.10, and recommend >= 5.14 (#38977)
This then drops several pieces of code that were only needed for older kernels.
2026-01-13 12:18:15 +09:00
calm329
ac2b5f6cbf basic: move BPF_JMP_A to override/linux/bpf_insn.h
Move the BPF_JMP_A macro from override/linux/bpf.h to
override/linux/bpf_insn.h. The bpf.h override conflicts with libbpf's
-I/usr/include/bpf/uapi include path. Since bpf_insn.h is not typically
installed at /usr/include/linux/ or /usr/include/bpf/uapi/linux/, the
override works without conflicts.

Fixes #40331
2026-01-13 12:14:11 +09:00
Yu Watanabe
fce7ad481b process-util: drop unnecessary E2BIG error handling
E2BIG is returned when the kernel does not support CLONE_INTO_CGROUP,
but the flag is supported since kernel v5.7.
2026-01-13 10:21:36 +09:00
Yu Watanabe
80c5b63db5 mount-setup: memory_recursiveprot is supported since kernel v5.7
Our baseline on kernel is 5.10. Hence we can unconditionally use it.
2026-01-13 10:21:36 +09:00
Yu Watanabe
d58c826c54 fd-util: drop close_all_fds_by_proc()
After the previous commit, the function is only used on OOM.
In that case, let's do the same as close_all_fds_without_malloc().
2026-01-13 10:21:36 +09:00
Yu Watanabe
c471cca093 fd-util: close_range() is available since kernel 5.9
Our baseline on kernel is 5.10, hence we can always use it.
2026-01-13 10:21:36 +09:00
Yu Watanabe
d377cc5244 stat-util: STATX_TYPE and STATX_INO should be always set 2026-01-13 10:21:05 +09:00
Yu Watanabe
7ecfa87c56 fd-util,mountpoint-util: STATX_MNT_ID is supported since kernel 5.10
Our baseline on kernel is 5.10, hence we can assume it works.
2026-01-13 10:21:05 +09:00
Yu Watanabe
7f6cf19513 mountpoint-util: make is_mount_point_at() take usual dir_fd + path style arguments 2026-01-13 10:21:05 +09:00
Yu Watanabe
a98a6eb95c tree-wide: statx() supports STATX_ATTR_MOUNT_ROOT since kernel 5.8
Our baseline on kernel is 5.10, hence we can unconditionally use it.
2026-01-13 10:21:04 +09:00
Yu Watanabe
ec21e5c58e random-util: assume getrandom(GRND_INSECURE) works
GRND_INSECURE was added in kernel 5.6, and our baseline on kernel is
5.10. Let's assume it always works. Even if it does not work, we have
further fallback logic. So, this should be safe.
2026-01-13 10:21:04 +09:00
Yu Watanabe
1aeba33d8f Bump required minimum kernel version to 5.10 and the recommended baseline to 5.14.
The previous minimum required version, 5.4, reaches EOL in 2025-12.
Let's bump the required minimum kernel version to the next LTS release 5.10
(released on 2020-12-13, EOL on 2026-12, CIP support until 2031-01).

The new recommended baseline 5.14 is the version that CentOS 9 uses.
CentOS 9 will reach EOL in 2027-05.

See also #38608.
2026-01-13 10:21:04 +09:00
20 changed files with 160 additions and 466 deletions

24
README
View File

@ -47,25 +47,25 @@ REQUIREMENTS:
≥ 5.3 for bounded loops in BPF program, keyring namespacing,
and nexthop support
≥ 5.4 for pidfd and signed Verity images
⛔ Kernel versions below 5.4 ("minimum baseline") are not supported at all,
and are missing required functionality as listed above.
Linux kernel ≥ 5.6 for getrandom() GRND_INSECURE
≥ 5.6 for getrandom() GRND_INSECURE
≥ 5.7 for CLONE_INTO_CGROUP, cgroup2fs memory_recursiveprot option,
BPF links and the BPF LSM hook
≥ 5.8 for LOOP_CONFIGURE and STATX_ATTR_MOUNT_ROOT
≥ 5.9 for close_range()
≥ 5.10 for STATX_MNT_ID
⚠️ Kernel versions below 5.7 ("recommended baseline") have significant gaps
⛔ Kernel versions below 5.10 ("minimum baseline") are not supported at all,
and are missing required functionality as listed above.
Linux kernel ≥ 5.12 for idmapped mount
≥ 5.14 for cgroup.kill and quotactl_fd()
⚠️ Kernel versions below 5.14 ("recommended baseline") have significant gaps
in functionality and are not recommended for use with this version
of systemd. Taint flag 'old-kernel' will be set. systemd will most likely
still function, but upstream support and testing are limited.
Linux kernel ≥ 5.8 for LOOP_CONFIGURE and STATX_ATTR_MOUNT_ROOT
≥ 5.9 for close_range()
≥ 5.12 for idmapped mount
≥ 5.14 for cgroup.kill
≥ 5.14 for quotactl_fd()
≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option
Linux kernel ≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option
≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD,
and MOVE_MOUNT_BENEATH
≥ 6.6 for quota support on tmpfs

View File

@ -68,4 +68,4 @@
#define VARLINK_PATH_MACHINED_RESOLVE_HOOK "/run/systemd/resolve.hook/io.systemd.Machine"
/* Recommended baseline - see README for details */
#define KERNEL_BASELINE_VERSION "5.7"
#define KERNEL_BASELINE_VERSION "5.14"

View File

@ -16,7 +16,6 @@
#include "format-util.h"
#include "fs-util.h"
#include "log.h"
#include "mountpoint-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
@ -252,10 +251,9 @@ int close_all_fds_frugal(const int except[], size_t n_except) {
assert(except || n_except == 0);
/* This is the inner fallback core of close_all_fds(). This never calls malloc() or opendir() or so
* and hence is safe to be called in signal handler context. Most users should call close_all_fds(),
* but when we assume we are called from signal handler context, then use this simpler call
* instead. */
/* This is the inner fallback core of close_all_fds(). This never calls malloc() or so and hence is
* safe to be called in signal handler context. Most users should call close_all_fds(), but when we
* assume we are called from signal handler context, then use this simpler call instead. */
max_fd = get_max_fd();
if (max_fd < 0)
@ -281,44 +279,6 @@ int close_all_fds_frugal(const int except[], size_t n_except) {
return r;
}
/* Closes every open file descriptor >= 3 that is listed in /proc/self/fd, except for those contained in
 * 'except' (an array of 'n_except' entries) and the descriptor backing the directory iteration itself.
 *
 * If /proc/self/fd cannot be opened, the work is handed over to close_all_fds_frugal(). Otherwise
 * returns the errors collected from the individual close operations via RET_GATHER() (0 if none), or
 * -errno if reading the directory fails. */
int close_all_fds_by_proc(const int except[], size_t n_except) {
        _cleanup_closedir_ DIR *d = NULL;
        int r = 0;

        d = opendir("/proc/self/fd");
        if (!d)
                /* ultimate fallback if /proc/ is not available */
                return close_all_fds_frugal(except, n_except);

        FOREACH_DIRENT(de, d, return -errno) {
                int fd, q;

                /* Only symlink entries (or entries of unknown type) are candidates. */
                if (!IN_SET(de->d_type, DT_LNK, DT_UNKNOWN))
                        continue;

                fd = parse_fd(de->d_name);

                /* A negative value means the name did not parse as an fd; let's better ignore this,
                 * just in case. Values 0…2 are stdin/stdout/stderr and are never closed here. */
                if (fd < 3)
                        continue;

                /* Leave alone the fd we are iterating with, and everything the caller asked to keep. */
                if (fd == dirfd(d) || fd_in_set(fd, except, n_except))
                        continue;

                q = close_nointr(fd);
                if (q != -EBADF) /* Valgrind has its own FD and doesn't want to have it closed */
                        RET_GATHER(r, q);
        }

        return r;
}
static bool have_close_range = true; /* Assume we live in the future */
static int close_all_fds_special_case(const int except[], size_t n_except) {
assert(n_except == 0 || except);
@ -326,9 +286,6 @@ static int close_all_fds_special_case(const int except[], size_t n_except) {
* nicely, since we won't need sorting for them. Returns > 0 if the special casing worked, 0
* otherwise. */
if (!have_close_range)
return 0;
if (n_except == 1 && except[0] < 0) /* Minor optimization: if we only got one fd, and it's invalid,
* we got none */
n_except = 0;
@ -337,32 +294,23 @@ static int close_all_fds_special_case(const int except[], size_t n_except) {
case 0:
/* Close everything. Yay! */
if (close_range(3, INT_MAX, 0) >= 0)
return 1;
if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)) {
have_close_range = false;
return 0;
}
if (close_range(3, INT_MAX, 0) < 0)
return -errno;
return 1;
case 1:
/* Close all but exactly one, then we don't need no sorting. This is a pretty common
* case, hence let's handle it specially. */
if ((except[0] <= 3 || close_range(3, except[0]-1, 0) >= 0) &&
(except[0] >= INT_MAX || close_range(MAX(3, except[0]+1), -1, 0) >= 0))
return 1;
if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)) {
have_close_range = false;
return 0;
}
if (except[0] > 3 && close_range(3, except[0] - 1, 0) < 0)
return -errno;
if (except[0] < INT_MAX && close_range(MAX(3, except[0] + 1), -1, 0) < 0)
return -errno;
return 1;
default:
return 0;
}
@ -393,9 +341,6 @@ int close_all_fds(const int except[], size_t n_except) {
if (r > 0) /* special case worked! */
return 0;
if (!have_close_range)
return close_all_fds_by_proc(except, n_except);
_cleanup_free_ int *sorted_malloc = NULL;
size_t n_sorted;
int *sorted;
@ -415,7 +360,7 @@ int close_all_fds(const int except[], size_t n_except) {
sorted = newa(int, n_sorted);
if (!sorted) /* Fallback on OOM. */
return close_all_fds_by_proc(except, n_except);
return close_all_fds_frugal(except, n_except);
memcpy(sorted, except, n_except * sizeof(int));
@ -437,13 +382,8 @@ int close_all_fds(const int except[], size_t n_except) {
continue;
/* Close everything between the start and end fds (both of which shall stay open) */
if (close_range(start + 1, end - 1, 0) < 0) {
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
if (close_range(start + 1, end - 1, 0) < 0)
return -errno;
have_close_range = false;
return close_all_fds_by_proc(except, n_except);
}
}
/* The loop succeeded. Let's now close everything beyond the end */
@ -451,14 +391,9 @@ int close_all_fds(const int except[], size_t n_except) {
if (sorted[n_sorted-1] >= INT_MAX) /* Dont let the addition below overflow */
return 0;
if (close_range(sorted[n_sorted-1] + 1, INT_MAX, 0) < 0) {
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
if (close_range(sorted[n_sorted-1] + 1, INT_MAX, 0) < 0)
return -errno;
have_close_range = false;
return close_all_fds_by_proc(except, n_except);
}
return 0;
}
@ -1119,7 +1054,6 @@ int path_is_root_at(int dir_fd, const char *path) {
int fds_are_same_mount(int fd1, int fd2) {
struct statx sx1 = {}, sx2 = {}; /* explicitly initialize the struct to make msan silent. */
int r;
assert(fd1 >= 0);
assert(fd2 >= 0);
@ -1130,39 +1064,7 @@ int fds_are_same_mount(int fd1, int fd2) {
if (statx(fd2, "", AT_EMPTY_PATH, STATX_TYPE|STATX_INO|STATX_MNT_ID, &sx2) < 0)
return -errno;
/* First, compare inode. If these are different, the fd does not point to the root directory "/". */
if (!statx_inode_same(&sx1, &sx2))
return false;
/* Note, statx() does not provide the mount ID and path_get_mnt_id_at() does not work when an old
* kernel is used. In that case, let's assume that we do not have such spurious mount points in an
* early boot stage, and silently skip the following check. */
if (!FLAGS_SET(sx1.stx_mask, STATX_MNT_ID)) {
int mntid;
r = path_get_mnt_id_at_fallback(fd1, "", &mntid);
if (r < 0)
return r;
assert(mntid >= 0);
sx1.stx_mnt_id = mntid;
sx1.stx_mask |= STATX_MNT_ID;
}
if (!FLAGS_SET(sx2.stx_mask, STATX_MNT_ID)) {
int mntid;
r = path_get_mnt_id_at_fallback(fd2, "", &mntid);
if (r < 0)
return r;
assert(mntid >= 0);
sx2.stx_mnt_id = mntid;
sx2.stx_mask |= STATX_MNT_ID;
}
return statx_mount_same(&sx1, &sx2);
return statx_inode_same(&sx1, &sx2) && statx_mount_same(&sx1, &sx2);
}
char* format_proc_fd_path(char buf[static PROC_FD_PATH_MAX], int fd) {

View File

@ -112,7 +112,6 @@ int get_max_fd(void);
int close_all_fds(const int except[], size_t n_except);
int close_all_fds_without_malloc(const int except[], size_t n_except);
int close_all_fds_by_proc(const int except[], size_t n_except);
int close_all_fds_frugal(const int except[], size_t n_except);
int pack_fds(int fds[], size_t n);

View File

@ -13,10 +13,8 @@
#include "log.h"
#include "mountpoint-util.h"
#include "nulstr-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "string-util.h"
#include "strv.h"
@ -135,57 +133,6 @@ int name_to_handle_at_try_fid(
return name_to_handle_at_loop(fd, path, ret_handle, ret_mnt_id, flags & ~AT_HANDLE_FID);
}
/* Looks up the "mnt_id" field in /proc/self/fdinfo/<N> and stores it, parsed as an integer, in
 * *ret_mnt_id.
 *
 * If AT_EMPTY_PATH is set and 'filename' is empty, <N> is 'fd' itself. Otherwise 'filename' is opened
 * relative to 'fd' with O_PATH (adding O_NOFOLLOW unless AT_SYMLINK_FOLLOW is set) and <N> is that
 * temporary descriptor, which is closed again on return.
 *
 * Only AT_SYMLINK_FOLLOW and AT_EMPTY_PATH are accepted in 'flags'. Returns the result of safe_atoi() on
 * success, -errno if the open fails, -EBADF if get_proc_field() reports -ENOENT, and any other
 * get_proc_field() error unchanged. */
static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *ret_mnt_id) {
        char path[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
        _cleanup_close_ int subfd = -EBADF;
        _cleanup_free_ char *p = NULL;
        int r, inspect_fd;

        assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0);
        assert(ret_mnt_id);

        if (!(flags & AT_EMPTY_PATH) || !isempty(filename)) {
                /* We need an fd of our own that refers to the inode in question. */
                subfd = openat(fd, filename, O_CLOEXEC|O_PATH|(flags & AT_SYMLINK_FOLLOW ? 0 : O_NOFOLLOW));
                if (subfd < 0)
                        return -errno;

                inspect_fd = subfd;
        } else
                /* The caller's fd already refers to the inode in question. */
                inspect_fd = fd;

        xsprintf(path, "/proc/self/fdinfo/%i", inspect_fd);

        r = get_proc_field(path, "mnt_id", &p);
        if (r < 0)
                return r == -ENOENT ? -EBADF : r;

        return safe_atoi(p, ret_mnt_id);
}
/* Tells whether 's' is a plain file name, optionally followed by nothing but slashes.
 *
 * accepted: foo, bar, foo/, bar/, foo//, bar///
 * rejected: "", "/", "/foo", "foo/bar", ".", ".."
 *
 * The part before the first slash is validated with filename_is_valid(). */
static bool filename_possibly_with_slash_suffix(const char *s) {
        const char *first_slash, *stem;

        first_slash = strchr(s, '/');

        /* No slash at all: the whole string has to be a valid file name. */
        if (!first_slash)
                return filename_is_valid(s);

        /* The prefix gets duplicated on the stack below, hence refuse anything oversized up front. */
        if (first_slash - s > PATH_MAX)
                return false;

        /* Skip over the run of slashes; anything following it means this is a path, not a file name. */
        if (first_slash[strspn(first_slash, "/")] != 0)
                return false;

        stem = strndupa_safe(s, first_slash - s);
        return filename_is_valid(stem);
}
bool file_handle_equal(const struct file_handle *a, const struct file_handle *b) {
if (a == b)
return true;
@ -197,191 +144,73 @@ bool file_handle_equal(const struct file_handle *a, const struct file_handle *b)
return memcmp_nn(a->f_handle, a->handle_bytes, b->f_handle, b->handle_bytes) == 0;
}
int is_mount_point_at(int fd, const char *filename, int flags) {
bool fd_is_self;
int is_mount_point_at(int dir_fd, const char *path, int flags) {
int r;
assert(fd >= 0 || fd == AT_FDCWD);
assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
assert((flags & ~AT_SYMLINK_FOLLOW) == 0);
if (isempty(filename)) {
if (fd == AT_FDCWD)
filename = ".";
if (path_equal(path, "/"))
return true;
if (isempty(path)) {
if (dir_fd == AT_FDCWD)
path = ".";
else {
/* If the file name is empty we'll see if the specified 'fd' is a mount point.
* That's only supported by statx(), or if the inode specified via 'fd' refers to a
* directory. Otherwise, we'll have to fail (ENOTDIR), because we have no kernel API
* to query the information we need. */
flags |= AT_EMPTY_PATH;
filename = "";
path = "";
}
fd_is_self = true;
} else if (STR_IN_SET(filename, ".", "./"))
fd_is_self = true;
else {
/* Insist that the specified filename is actually a filename, and not a path, i.e. some inode
* further up or down the tree then immediately below the specified directory fd. */
if (!filename_possibly_with_slash_suffix(filename))
return -EINVAL;
fd_is_self = false;
}
/* First we will try statx()' STATX_ATTR_MOUNT_ROOT attribute, which is our ideal API, available
* since kernel 5.8.
*
* If that fails, our second try is the name_to_handle_at() syscall, which tells us the mount id and
* an opaque file "handle". It is not supported everywhere though (kernel compile-time option, not
* all file systems are hooked up). If it works the mount id is usually good enough to tell us
* whether something is a mount point.
*
* If that didn't work we will try to read the mount id from /proc/self/fdinfo/<fd>. This is almost
* as good as name_to_handle_at(), however, does not return the opaque file handle. The opaque file
* handle is pretty useful to detect the root directory, which we should always consider a mount
* point. Hence we use this only as fallback.
*
* Note that traditionally the check is done via fstat()-based st_dev comparisons. However, various
* file systems don't guarantee same st_dev across single fs anymore, e.g. unionfs exposes file systems
* with a variety of st_dev reported. Also, btrfs subvolumes have different st_dev, even though
* they aren't real mounts of their own. */
struct statx sx = {}; /* explicitly initialize the struct to make msan silent. */
if (statx(fd, filename,
if (statx(dir_fd, path,
at_flags_normalize_nofollow(flags) |
AT_NO_AUTOMOUNT | /* don't trigger automounts mounts are a local concept, hence no need to trigger automounts to determine STATX_ATTR_MOUNT_ROOT */
AT_STATX_DONT_SYNC, /* don't go to the network for this for similar reasons */
STATX_TYPE,
STATX_TYPE|STATX_INO,
&sx) < 0)
return -errno;
if (FLAGS_SET(sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) /* yay! */
return FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT);
_cleanup_free_ struct file_handle *h = NULL, *h_parent = NULL;
int mount_id = -1, mount_id_parent = -1;
bool nosupp = false;
r = name_to_handle_at_try_fid(fd, filename, &h, &mount_id, flags);
if (r < 0) {
if (is_name_to_handle_at_fatal_error(r))
return r;
if (!ERRNO_IS_NOT_SUPPORTED(r))
goto fallback_fdinfo;
/* This file system does not support name_to_handle_at(), hence let's see if the upper fs
* supports it (in which case it is a mount point), otherwise fall back to the fdinfo logic. */
nosupp = true;
}
if (fd_is_self)
r = name_to_handle_at_try_fid(fd, "..", &h_parent, &mount_id_parent, 0); /* can't work for non-directories 😢 */
else
r = name_to_handle_at_try_fid(fd, "", &h_parent, &mount_id_parent, AT_EMPTY_PATH);
if (r < 0) {
if (is_name_to_handle_at_fatal_error(r))
return r;
if (!ERRNO_IS_NOT_SUPPORTED(r))
goto fallback_fdinfo;
if (nosupp)
/* Both the parent and the directory can't do name_to_handle_at() */
goto fallback_fdinfo;
/* The parent can't do name_to_handle_at() but the directory we are
* interested in can? If so, it must be a mount point. */
return 1;
}
/* The parent can do name_to_handle_at() but the directory we are interested in can't? If
* so, it must be a mount point. */
if (nosupp)
return 1;
/* If the file handle for the directory we are interested in and its parent are identical,
* we assume this is the root directory, which is a mount point. */
if (file_handle_equal(h_parent, h))
return 1;
return mount_id != mount_id_parent;
fallback_fdinfo:
r = fd_fdinfo_mnt_id(fd, filename, flags, &mount_id);
r = statx_warn_mount_root(&sx, LOG_DEBUG);
if (r < 0)
return r;
if (fd_is_self)
r = fd_fdinfo_mnt_id(fd, "..", 0, &mount_id_parent); /* can't work for non-directories 😢 */
else
r = fd_fdinfo_mnt_id(fd, "", AT_EMPTY_PATH, &mount_id_parent);
if (r < 0)
return r;
if (FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT))
return true;
if (mount_id != mount_id_parent)
return 1;
/* Hmm, so, the mount ids are the same. This leaves one special case though for the root file
* system. For that, let's see if the parent directory has the same inode as we are interested
* in. */
struct stat a, b;
/* yay for fstatat() taking a different set of flags than the other _at() above */
if (fstatat(fd, filename, &a, at_flags_normalize_nofollow(flags)) < 0)
/* When running on chroot environment, the root may not be a mount point, but we unconditionally
* return true when the input is "/" in the above, but the shortcut may not work e.g. when the path
* is relative. */
struct statx sx2 = {}; /* explicitly initialize the struct to make msan silent. */
if (statx(AT_FDCWD, "/", AT_STATX_DONT_SYNC, STATX_TYPE|STATX_INO, &sx2) < 0)
return -errno;
if (fd_is_self)
r = fstatat(fd, "..", &b, 0);
else
r = fstatat(fd, "", &b, AT_EMPTY_PATH);
if (r < 0)
return -errno;
/* A directory with same device and inode as its parent must be the root directory. Otherwise
* not a mount point.
*
* NB: we avoid inode_same_at() here because it internally attempts name_to_handle_at_try_fid() first,
* which is redundant. */
return stat_inode_same(&a, &b);
return statx_inode_same(&sx, &sx2);
}
/* flags can be AT_SYMLINK_FOLLOW or 0 */
int path_is_mount_point_full(const char *path, const char *root, int flags) {
_cleanup_close_ int dfd = -EBADF;
_cleanup_free_ char *fn = NULL;
_cleanup_close_ int dir_fd = -EBADF;
int r;
assert(path);
assert((flags & ~AT_SYMLINK_FOLLOW) == 0);
if (path_equal(path, "/"))
return 1;
if (empty_or_root(root))
return is_mount_point_at(AT_FDCWD, path, flags);
/* we need to resolve symlinks manually, we can't just rely on is_mount_point_at() to do that for us;
* if we have a structure like /bin -> /usr/bin/ and /usr is a mount point, then the parent that we
* look at needs to be /usr, not /. */
dfd = chase_and_open_parent(path, root,
CHASE_TRAIL_SLASH|(FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 0 : CHASE_NOFOLLOW),
&fn);
if (dfd < 0)
return dfd;
return is_mount_point_at(dfd, fn, flags);
}
int path_get_mnt_id_at_fallback(int dir_fd, const char *path, int *ret) {
int r;
assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
assert(ret);
r = name_to_handle_at_loop(dir_fd, path, NULL, ret, isempty(path) ? AT_EMPTY_PATH : 0);
if (r >= 0 || is_name_to_handle_at_fatal_error(r))
r = chase(path, root,
FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 0 : CHASE_NOFOLLOW,
/* ret_path= */ NULL, &dir_fd);
if (r < 0)
return r;
return fd_fdinfo_mnt_id(dir_fd, path, isempty(path) ? AT_EMPTY_PATH : 0, ret);
return is_mount_point_at(dir_fd, /* path= */ NULL, flags);
}
int path_get_mnt_id_at(int dir_fd, const char *path, int *ret) {
struct statx sx;
int r;
assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
assert(ret);
@ -395,14 +224,14 @@ int path_get_mnt_id_at(int dir_fd, const char *path, int *ret) {
&sx) < 0)
return -errno;
if (FLAGS_SET(sx.stx_mask, STATX_MNT_ID)) {
r = statx_warn_mount_id(&sx, LOG_DEBUG);
if (r < 0)
return r;
*ret = sx.stx_mnt_id;
return 0;
}
return path_get_mnt_id_at_fallback(dir_fd, path, ret);
}
bool fstype_is_network(const char *fstype) {
const char *x;

View File

@ -39,13 +39,12 @@ int name_to_handle_at_try_fid(int fd, const char *path, struct file_handle **ret
bool file_handle_equal(const struct file_handle *a, const struct file_handle *b);
int path_get_mnt_id_at_fallback(int dir_fd, const char *path, int *ret);
int path_get_mnt_id_at(int dir_fd, const char *path, int *ret);
/* Convenience wrapper around path_get_mnt_id_at() that passes AT_FDCWD as the directory fd. */
static inline int path_get_mnt_id(const char *path, int *ret) {
        int dir_fd = AT_FDCWD;

        return path_get_mnt_id_at(dir_fd, path, ret);
}
int is_mount_point_at(int fd, const char *filename, int flags);
int is_mount_point_at(int dir_fd, const char *path, int flags);
int path_is_mount_point_full(const char *path, const char *root, int flags);
static inline int path_is_mount_point(const char *path) {
return path_is_mount_point_full(path, NULL, 0);

View File

@ -2119,15 +2119,14 @@ int posix_spawn_wrapper(
if (ERRNO_IS_NOT_SUPPORTED(r) && FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP) && cg_is_threaded(cgroup) > 0)
return -EUCLEAN; /* clone3() could also return EOPNOTSUPP if the target cgroup is in threaded mode,
turn that into something recognizable */
if ((ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || r == E2BIG) &&
if ((ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r)) &&
FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP)) {
/* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but
* need to disable POSIX_SPAWN_SETCGROUP, which is what redirects to clone3().
* Note that we might get E2BIG here since some kernels (e.g. 5.4) support clone3()
* but not CLONE_INTO_CGROUP. */
/* CLONE_INTO_CGROUP definitely won't work, hence remember the fact so that we don't
* retry every time. */
* CLONE_INTO_CGROUP definitely won't work, hence remember the fact so that we don't
* retry every time.
* Note, CLONE_INTO_CGROUP is supported since kernel v5.7, but some architectures still
* do not support clone3(). Hence, we need to keep the fallback logic for a while. */
have_clone_into_cgroup = false;
flags &= ~POSIX_SPAWN_SETCGROUP;

View File

@ -68,8 +68,6 @@ static void fallback_random_bytes(void *p, size_t n) {
}
void random_bytes(void *p, size_t n) {
static bool have_grndinsecure = true;
assert(p || n == 0);
if (n == 0)
@ -78,15 +76,9 @@ void random_bytes(void *p, size_t n) {
for (;;) {
ssize_t l;
l = getrandom(p, n, have_grndinsecure ? GRND_INSECURE : GRND_NONBLOCK);
if (l < 0 && errno == EINVAL && have_grndinsecure) {
/* No GRND_INSECURE; fallback to GRND_NONBLOCK. */
have_grndinsecure = false;
continue;
}
l = getrandom(p, n, GRND_INSECURE);
if (l <= 0)
break; /* Will block (with GRND_NONBLOCK), or unexpected error. Give up and fallback
to /dev/urandom. */
break; /* Unexpected error. Give up and fallback to /dev/urandom. */
if ((size_t) l == n)
return; /* Done reading, success. */

View File

@ -463,8 +463,13 @@ bool statx_inode_same(const struct statx *a, const struct statx *b) {
/* Same as stat_inode_same() but for struct statx */
return statx_is_set(a) && statx_is_set(b) &&
FLAGS_SET(a->stx_mask, STATX_TYPE|STATX_INO) && FLAGS_SET(b->stx_mask, STATX_TYPE|STATX_INO) &&
if (!statx_is_set(a) || !statx_is_set(b))
return false;
assert(FLAGS_SET(a->stx_mask, STATX_TYPE|STATX_INO));
assert(FLAGS_SET(b->stx_mask, STATX_TYPE|STATX_INO));
return
((a->stx_mode ^ b->stx_mode) & S_IFMT) == 0 &&
a->stx_dev_major == b->stx_dev_major &&
a->stx_dev_minor == b->stx_dev_minor &&
@ -475,13 +480,10 @@ bool statx_mount_same(const struct statx *a, const struct statx *b) {
if (!statx_is_set(a) || !statx_is_set(b))
return false;
/* if we have the mount ID, that's all we need */
if (FLAGS_SET(a->stx_mask, STATX_MNT_ID) && FLAGS_SET(b->stx_mask, STATX_MNT_ID))
return a->stx_mnt_id == b->stx_mnt_id;
assert(FLAGS_SET(a->stx_mask, STATX_MNT_ID));
assert(FLAGS_SET(b->stx_mask, STATX_MNT_ID));
/* Otherwise, major/minor of backing device must match */
return a->stx_dev_major == b->stx_dev_major &&
a->stx_dev_minor == b->stx_dev_minor;
return a->stx_mnt_id == b->stx_mnt_id;
}
int xstatfsat(int dir_fd, const char *path, struct statfs *ret) {
@ -572,3 +574,25 @@ mode_t inode_type_from_string(const char *s) {
return MODE_INVALID;
}
/* Checks whether the given statx() result lists STATX_ATTR_MOUNT_ROOT in stx_attributes_mask.
 *
 * Returns 0 if it does. Otherwise a message is logged at 'log_level' with a synthetic ENOSYS and the
 * value produced by log_full_errno() is returned (callers in this file treat < 0 as failure). */
int statx_warn_mount_root(const struct statx *sx, int log_level) {
        assert(sx);

        /* The STATX_ATTR_MOUNT_ROOT flag is supported since kernel v5.8. */
        if (FLAGS_SET(sx->stx_attributes_mask, STATX_ATTR_MOUNT_ROOT))
                return 0;

        return log_full_errno(log_level, SYNTHETIC_ERRNO(ENOSYS),
                              "statx() did not set STATX_ATTR_MOUNT_ROOT, running on an old kernel?");
}
/* Checks whether the given statx() result has STATX_MNT_ID set in stx_mask, i.e. whether stx_mnt_id
 * was filled in.
 *
 * Returns 0 if it was. Otherwise a message is logged at 'log_level' with a synthetic ENOSYS and the
 * value produced by log_full_errno() is returned (callers in this file treat < 0 as failure). */
int statx_warn_mount_id(const struct statx *sx, int log_level) {
        assert(sx);

        /* The STATX_MNT_ID flag is supported since kernel v5.10. */
        if (FLAGS_SET(sx->stx_mask, STATX_MNT_ID))
                return 0;

        return log_full_errno(log_level, SYNTHETIC_ERRNO(ENOSYS),
                              "statx() does not support STATX_MNT_ID, running on an old kernel?");
}

View File

@ -117,3 +117,6 @@ static inline bool inode_type_can_hardlink(mode_t m) {
* type). */
return IN_SET(m & S_IFMT, S_IFSOCK, S_IFLNK, S_IFREG, S_IFBLK, S_IFCHR, S_IFIFO);
}
int statx_warn_mount_root(const struct statx *sx, int log_level);
int statx_warn_mount_id(const struct statx *sx, int log_level);

View File

@ -3467,7 +3467,7 @@ static int is_extension_overlay(const char *path, int fd) {
fd = dfd;
}
r = is_mount_point_at(fd, /* filename= */ NULL, /* flags= */ 0);
r = is_mount_point_at(fd, /* path= */ NULL, /* flags= */ 0);
if (r < 0)
return log_debug_errno(r, "Unable to determine whether '%s' is a mount point: %m", path);
if (r == 0)

View File

@ -1,7 +1,7 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
#include_next <linux/bpf.h> /* IWYU pragma: export */
#include_next <linux/bpf_insn.h> /* IWYU pragma: export */
/* defined in linux/filter.h */
/* Unconditional jumps, goto pc + off16 */

View File

@ -1113,7 +1113,7 @@ static int action_umount(sd_bus *bus, int argc, char **argv) {
if (fstat(fd, &st) < 0)
return log_error_errno(errno, "Can't stat '%s' (from %s): %m", p, argv[i]);
r = is_mount_point_at(fd, /* filename= */ NULL, /* flags= */ 0);
r = is_mount_point_at(fd, /* path= */ NULL, /* flags= */ 0);
fd = safe_close(fd); /* before continuing make sure the dir is not keeping anything busy */
if (r > 0)
RET_GATHER(ret, stop_mounts(bus, p));

View File

@ -147,7 +147,7 @@ int mount_cgroups(const char *dest, bool accept_existing) {
if (r < 0)
return log_error_errno(r, "Failed to chase %s/sys/fs/cgroup: %m", strempty(dest));
r = is_mount_point_at(fd, /* filename= */ NULL, /* flags= */ 0);
r = is_mount_point_at(fd, /* path= */ NULL, /* flags= */ 0);
if (r < 0)
return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
if (r > 0) {

View File

@ -268,7 +268,7 @@ static int verify_fsroot_dir(
bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING),
unprivileged_mode = FLAGS_SET(flags, VERIFY_ESP_UNPRIVILEGED_MODE);
_cleanup_free_ char *f = NULL;
struct statx sxa, sxb;
struct statx sx;
int r;
/* Checks if the specified directory is at the root of its file system, and returns device
@ -287,49 +287,30 @@ static int verify_fsroot_dir(
if (statx(dir_fd, strempty(f),
AT_SYMLINK_NOFOLLOW|(isempty(f) ? AT_EMPTY_PATH : 0),
STATX_TYPE|STATX_INO|STATX_MNT_ID, &sxa) < 0)
STATX_TYPE|STATX_INO|STATX_MNT_ID, &sx) < 0)
return log_full_errno((searching && errno == ENOENT) ||
(unprivileged_mode && ERRNO_IS_PRIVILEGE(errno)) ? LOG_DEBUG : LOG_ERR, errno,
"Failed to determine block device node of \"%s\": %m", path);
if (!S_ISDIR(sxa.stx_mode))
if (!S_ISDIR(sx.stx_mode))
return log_error_errno(SYNTHETIC_ERRNO(ENOTDIR), "Path \"%s\" is not a directory", path);
if (FLAGS_SET(sxa.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) {
r = statx_warn_mount_root(&sx, LOG_ERR);
if (r < 0)
return r;
/* If we have STATX_ATTR_MOUNT_ROOT, we are happy, that's all we need. We operate under the
* assumption that a top of a mount point is also the top of the file system. (Which of
* course is strictly speaking not always true...) */
if (!FLAGS_SET(sxa.stx_attributes, STATX_ATTR_MOUNT_ROOT))
if (!FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT))
return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
"Directory \"%s\" is not the root of the file system.", path);
goto success;
}
/* Now let's look at the parent */
if (statx(dir_fd, "", AT_EMPTY_PATH, STATX_TYPE|STATX_INO|STATX_MNT_ID, &sxb) < 0)
return log_full_errno(unprivileged_mode && ERRNO_IS_PRIVILEGE(errno) ? LOG_DEBUG : LOG_ERR, errno,
"Failed to determine block device node of parent of \"%s\": %m", path);
if (statx_inode_same(&sxa, &sxb)) /* for the root dir inode nr for both inodes will be the same */
goto success;
if (statx_mount_same(&sxa, &sxb))
return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
"Directory \"%s\" is not the root of the file system.", path);
success:
if (!ret_dev)
return 0;
if (sxa.stx_dev_major == 0) /* Hmm, maybe a btrfs device, and the caller asked for the backing device? Then let's try to get it. */
if (sx.stx_dev_major == 0) /* Hmm, maybe a btrfs device, and the caller asked for the backing device? Then let's try to get it. */
return btrfs_get_block_device_at(dir_fd, strempty(f), ret_dev);
*ret_dev = makedev(sxa.stx_dev_major, sxa.stx_dev_minor);
*ret_dev = makedev(sx.stx_dev_major, sx.stx_dev_minor);
return 0;
}

View File

@ -49,19 +49,20 @@ static int cgroupfs_mount_options(int priority, const char *type, char **ret) {
assert(streq(type, "cgroup2"));
assert(ret);
_cleanup_free_ char *opts = NULL;
FOREACH_STRING(o, "memory_recursiveprot", "memory_hugetlb_accounting") {
r = mount_option_supported("cgroup2", o, /* value= */ NULL);
/* memory_hugetlb_accounting mount option is since kernel v6.7 (8cba9576df601c384abd334a503c3f6e1e29eefb). */
r = mount_option_supported("cgroup2", "memory_hugetlb_accounting", /* value= */ NULL);
if (r <= 0) {
if (r < 0)
log_full_errno(priority, r, "Failed to determine whether cgroupfs supports '%s' mount option, assuming not: %m", o);
else if (r == 0)
log_debug("'%s' not supported by cgroupfs, not using mount option.", o);
else if (!strextend_with_separator(&opts, ",", o))
return log_oom_full(priority);
log_full_errno(priority, r, "Failed to determine whether cgroupfs supports 'memory_hugetlb_accounting' mount option, assuming not: %m");
else
log_debug("'memory_hugetlb_accounting' not supported by cgroupfs, not using mount option.");
*ret = NULL;
return 0;
}
*ret = TAKE_PTR(opts);
return 0;
return strdup_to(ret, "memory_hugetlb_accounting");
}
int mount_cgroupfs(const char *path) {
@ -81,7 +82,7 @@ int mount_cgroupfs(const char *path) {
return r;
/* These options shall be kept in sync with those in mount_table below. */
if (!strprepend_with_separator(&opts, ",", "nsdelegate"))
if (!strprepend_with_separator(&opts, ",", "nsdelegate,memory_recursiveprot"))
return log_oom();
return mount_nofollow_verbose(LOG_ERR, "cgroup2", path, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
@ -199,7 +200,7 @@ static const MountPoint mount_table[] = {
.what = "cgroup2",
.where = "/sys/fs/cgroup",
.type = "cgroup2",
.options = "nsdelegate",
.options = "nsdelegate,memory_recursiveprot",
.options_fn = cgroupfs_mount_options,
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
.mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE,

View File

@ -343,19 +343,13 @@ TEST(close_all_fds) {
_exit(EXIT_SUCCESS);
}
ASSERT_OK(r = pidref_safe_fork("(caf-nomalloc)", flags, NULL));
r = ASSERT_OK(pidref_safe_fork("(caf-nomalloc)", flags, NULL));
if (r == 0) {
test_close_all_fds_inner(close_all_fds_without_malloc);
_exit(EXIT_SUCCESS);
}
ASSERT_OK(r = pidref_safe_fork("(caf-proc)", flags, NULL));
if (r == 0) {
test_close_all_fds_inner(close_all_fds_by_proc);
_exit(EXIT_SUCCESS);
}
ASSERT_OK(r = pidref_safe_fork("(caf-frugal)", flags, NULL));
r = ASSERT_OK(pidref_safe_fork("(caf-frugal)", flags, NULL));
if (r == 0) {
test_close_all_fds_inner(close_all_fds_frugal);
_exit(EXIT_SUCCESS);

View File

@ -283,18 +283,16 @@ TEST(is_mount_point_at) {
fd = open("/", O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY);
assert_se(fd >= 0);
/* Not allowed, since "/" is a path, not a plain filename */
assert_se(is_mount_point_at(fd, "/", 0) == -EINVAL);
assert_se(is_mount_point_at(fd, "..", 0) == -EINVAL);
assert_se(is_mount_point_at(fd, "../", 0) == -EINVAL);
assert_se(is_mount_point_at(fd, "/proc", 0) == -EINVAL);
assert_se(is_mount_point_at(fd, "/proc/", 0) == -EINVAL);
assert_se(is_mount_point_at(fd, "proc/sys", 0) == -EINVAL);
assert_se(is_mount_point_at(fd, "proc/sys/", 0) == -EINVAL);
/* This one definitely is a mount point */
assert_se(is_mount_point_at(fd, "proc", 0) > 0);
assert_se(is_mount_point_at(fd, "proc/", 0) > 0);
ASSERT_OK_POSITIVE(is_mount_point_at(fd, "/", /* flags= */ 0));
ASSERT_OK_POSITIVE(is_mount_point_at(fd, "..", /* flags= */ 0));
ASSERT_OK_POSITIVE(is_mount_point_at(fd, "../", /* flags= */ 0));
r = ASSERT_OK(proc_mounted());
ASSERT_OK_EQ(is_mount_point_at(fd, "/proc", /* flags= */ 0), r);
ASSERT_OK_EQ(is_mount_point_at(fd, "/proc/", /* flags= */ 0), r);
ASSERT_OK_EQ(is_mount_point_at(fd, "proc", /* flags= */ 0), r);
ASSERT_OK_EQ(is_mount_point_at(fd, "proc/", /* flags= */ 0), r);
ASSERT_OK_ZERO(is_mount_point_at(fd, "usr/lib", /* flags= */ 0));
ASSERT_OK_ZERO(is_mount_point_at(fd, "usr/lib", /* flags= */ 0));
safe_close(fd);
fd = open("/tmp", O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY);

View File

@ -559,7 +559,7 @@ static int opendir_and_stat(
bool *ret_mountpoint) {
_cleanup_closedir_ DIR *d = NULL;
struct statx sx1;
struct statx sx;
int r;
assert(path);
@ -586,21 +586,16 @@ static int opendir_and_stat(
return 0;
}
if (statx(dirfd(d), "", AT_EMPTY_PATH, STATX_MODE|STATX_INO|STATX_ATIME|STATX_MTIME, &sx1) < 0)
if (statx(dirfd(d), "", AT_EMPTY_PATH, STATX_MODE|STATX_INO|STATX_ATIME|STATX_MTIME, &sx) < 0)
return log_error_errno(errno, "statx(%s) failed: %m", path);
if (FLAGS_SET(sx1.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT))
*ret_mountpoint = FLAGS_SET(sx1.stx_attributes, STATX_ATTR_MOUNT_ROOT);
else {
struct statx sx2;
if (statx(dirfd(d), "..", 0, STATX_INO, &sx2) < 0)
return log_error_errno(errno, "statx(%s/..) failed: %m", path);
*ret_mountpoint = !statx_mount_same(&sx1, &sx2);
}
r = statx_warn_mount_root(&sx, LOG_ERR);
if (r < 0)
return r;
*ret_mountpoint = FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT);
*ret = TAKE_PTR(d);
*ret_sx = sx1;
*ret_sx = sx;
return 1;
}
@ -713,36 +708,14 @@ static int dir_cleanup(
continue;
}
if (FLAGS_SET(sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) {
/* Yay, we have the mount point API, use it */
r = statx_warn_mount_root(&sx, LOG_ERR);
if (r < 0)
return r;
if (FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT)) {
log_debug("Ignoring \"%s/%s\": different mount points.", p, de->d_name);
continue;
}
} else {
/* So we might have statx() but the STATX_ATTR_MOUNT_ROOT flag is not supported, fall
* back to traditional stx_dev checking. */
if (sx.stx_dev_major != rootdev_major ||
sx.stx_dev_minor != rootdev_minor) {
log_debug("Ignoring \"%s/%s\": different filesystem.", p, de->d_name);
continue;
}
/* Try to detect bind mounts of the same filesystem instance; they do not differ in
* device major/minors. This type of query is not supported on all kernels or
* filesystem types though. */
if (S_ISDIR(sx.stx_mode)) {
int q;
q = is_mount_point_at(dirfd(d), de->d_name, 0);
if (q < 0)
log_debug_errno(q, "Failed to determine whether \"%s/%s\" is a mount point, ignoring: %m", p, de->d_name);
else if (q > 0) {
log_debug("Ignoring \"%s/%s\": different mount of the same filesystem.", p, de->d_name);
continue;
}
}
}
atime_nsec = FLAGS_SET(sx.stx_mask, STATX_ATIME) ? statx_timestamp_load_nsec(&sx.stx_atime) : 0;
mtime_nsec = FLAGS_SET(sx.stx_mask, STATX_MTIME) ? statx_timestamp_load_nsec(&sx.stx_mtime) : 0;

View File

@ -426,7 +426,7 @@ static int run(int argc, char *argv[]) {
if (target_fd < 0)
return log_error_errno(target_fd, "Failed to open directory '%s': %m", arg_target);
r = is_mount_point_at(target_fd, /* filename= */ NULL, /* flags= */ 0);
r = is_mount_point_at(target_fd, /* path= */ NULL, /* flags= */ 0);
if (r < 0)
return log_error_errno(r, "Failed to determine whether '%s' is a mount point: %m", resolved);
if (!r)