2026-03-18 19:14:46 +01:00
20 changed files with 465 additions and 159 deletions
--- a/22
+++ b/22
@ -47,25 +47,25 @@ REQUIREMENTS:
                     ≥ 5.3 for bounded loops in BPF program, keyring namespacing,
                               and nexthop support
                     ≥ 5.4 for pidfd and signed Verity images
-                     ≥ 5.6 for getrandom() GRND_INSECURE
-                     ≥ 5.7 for CLONE_INTO_CGROUP, cgroup2fs memory_recursiveprot option,
-                               BPF links and the BPF LSM hook
-                     ≥ 5.8 for LOOP_CONFIGURE and STATX_ATTR_MOUNT_ROOT
-                     ≥ 5.9 for close_range()
-                     ≥ 5.10 for STATX_MNT_ID

-        ⛔ Kernel versions below 5.10 ("minimum baseline") are not supported at all,
+        ⛔ Kernel versions below 5.4 ("minimum baseline") are not supported at all,
           and are missing required functionality as listed above.

-        Linux kernel ≥ 5.12 for idmapped mount
-                     ≥ 5.14 for cgroup.kill and quotactl_fd()
+        Linux kernel ≥ 5.6 for getrandom() GRND_INSECURE
+                     ≥ 5.7 for CLONE_INTO_CGROUP, cgroup2fs memory_recursiveprot option,
+                               BPF links and the BPF LSM hook

-        ⚠️ Kernel versions below 5.14 ("recommended baseline") have significant gaps
+        ⚠️ Kernel versions below 5.7 ("recommended baseline") have significant gaps
           in functionality and are not recommended for use with this version
           of systemd. Taint flag 'old-kernel' will be set. systemd will most likely
           still function, but upstream support and testing are limited.

-        Linux kernel ≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option
+        Linux kernel ≥ 5.8 for LOOP_CONFIGURE and STATX_ATTR_MOUNT_ROOT
+                     ≥ 5.9 for close_range()
+                     ≥ 5.12 for idmapped mount
+                     ≥ 5.14 for cgroup.kill
+                     ≥ 5.14 for quotactl_fd()
+                     ≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option
                     ≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD,
                               and MOVE_MOUNT_BENEATH
                     ≥ 6.6 for quota support on tmpfs
--- a/src/basic/constants.h
+++ b/src/basic/constants.h
@ -68,4 +68,4 @@
 #define VARLINK_PATH_MACHINED_RESOLVE_HOOK "/run/systemd/resolve.hook/io.systemd.Machine"

 /* Recommended baseline - see README for details */
-#define KERNEL_BASELINE_VERSION "5.14"
+#define KERNEL_BASELINE_VERSION "5.7"
--- a/src/basic/fd-util.c
+++ b/src/basic/fd-util.c
@ -16,6 +16,7 @@
 #include "format-util.h"
 #include "fs-util.h"
 #include "log.h"
+#include "mountpoint-util.h"
 #include "parse-util.h"
 #include "path-util.h"
 #include "process-util.h"
@ -251,9 +252,10 @@ int close_all_fds_frugal(const int except[], size_t n_except) {

        assert(except || n_except == 0);

-        /* This is the inner fallback core of close_all_fds(). This never calls malloc() or so and hence is
-         * safe to be called in signal handler context. Most users should call close_all_fds(), but when we
-         * assume we are called from signal handler context, then use this simpler call instead. */
+        /* This is the inner fallback core of close_all_fds(). This never calls malloc() or opendir() or so
+         * and hence is safe to be called in signal handler context. Most users should call close_all_fds(),
+         * but when we assume we are called from signal handler context, then use this simpler call
+         * instead. */

        max_fd = get_max_fd();
        if (max_fd < 0)
@ -279,6 +281,44 @@ int close_all_fds_frugal(const int except[], size_t n_except) {
        return r;
 }

+int close_all_fds_by_proc(const int except[], size_t n_except) {
+        _cleanup_closedir_ DIR *d = NULL;
+        int r = 0;
+
+        d = opendir("/proc/self/fd");
+        if (!d)
+                return close_all_fds_frugal(except, n_except); /* ultimate fallback if /proc/ is not available */
+
+        FOREACH_DIRENT(de, d, return -errno) {
+                int fd = -EBADF, q;
+
+                if (!IN_SET(de->d_type, DT_LNK, DT_UNKNOWN))
+                        continue;
+
+                fd = parse_fd(de->d_name);
+                if (fd < 0)
+                        /* Let's better ignore this, just in case */
+                        continue;
+
+                if (fd < 3)
+                        continue;
+
+                if (fd == dirfd(d))
+                        continue;
+
+                if (fd_in_set(fd, except, n_except))
+                        continue;
+
+                q = close_nointr(fd);
+                if (q != -EBADF) /* Valgrind has its own FD and doesn't want to have it closed */
+                        RET_GATHER(r, q);
+        }
+
+        return r;
+}
+
+static bool have_close_range = true; /* Assume we live in the future */
+
 static int close_all_fds_special_case(const int except[], size_t n_except) {
        assert(n_except == 0 || except);

@ -286,6 +326,9 @@ static int close_all_fds_special_case(const int except[], size_t n_except) {
         * nicely, since we won't need sorting for them. Returns > 0 if the special casing worked, 0
         * otherwise. */

+        if (!have_close_range)
+                return 0;
+
        if (n_except == 1 && except[0] < 0) /* Minor optimization: if we only got one fd, and it's invalid,
                                             * we got none */
                n_except = 0;
@ -294,22 +337,31 @@ static int close_all_fds_special_case(const int except[], size_t n_except) {

        case 0:
                /* Close everything. Yay! */
-                if (close_range(3, INT_MAX, 0) < 0)
-                        return -errno;

-                return 1;
+                if (close_range(3, INT_MAX, 0) >= 0)
+                        return 1;
+
+                if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)) {
+                        have_close_range = false;
+                        return 0;
+                }
+
+                return -errno;

        case 1:
                /* Close all but exactly one, then we don't need no sorting. This is a pretty common
                 * case, hence let's handle it specially. */

-                if (except[0] > 3 && close_range(3, except[0] - 1, 0) < 0)
-                        return -errno;
+                if ((except[0] <= 3 || close_range(3, except[0]-1, 0) >= 0) &&
+                    (except[0] >= INT_MAX || close_range(MAX(3, except[0]+1), -1, 0) >= 0))
+                        return 1;

-                if (except[0] < INT_MAX && close_range(MAX(3, except[0] + 1), -1, 0) < 0)
-                        return -errno;
+                if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)) {
+                        have_close_range = false;
+                        return 0;
+                }

-                return 1;
+                return -errno;

        default:
                return 0;
@ -341,6 +393,9 @@ int close_all_fds(const int except[], size_t n_except) {
        if (r > 0) /* special case worked! */
                return 0;

+        if (!have_close_range)
+                return close_all_fds_by_proc(except, n_except);
+
        _cleanup_free_ int *sorted_malloc = NULL;
        size_t n_sorted;
        int *sorted;
@ -360,7 +415,7 @@ int close_all_fds(const int except[], size_t n_except) {
                sorted = newa(int, n_sorted);

        if (!sorted) /* Fallback on OOM. */
-                return close_all_fds_frugal(except, n_except);
+                return close_all_fds_by_proc(except, n_except);

        memcpy(sorted, except, n_except * sizeof(int));

@ -382,8 +437,13 @@ int close_all_fds(const int except[], size_t n_except) {
                        continue;

                /* Close everything between the start and end fds (both of which shall stay open) */
-                if (close_range(start + 1, end - 1, 0) < 0)
-                        return -errno;
+                if (close_range(start + 1, end - 1, 0) < 0) {
+                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+                                return -errno;
+
+                        have_close_range = false;
+                        return close_all_fds_by_proc(except, n_except);
+                }
        }

        /* The loop succeeded. Let's now close everything beyond the end */
@ -391,8 +451,13 @@ int close_all_fds(const int except[], size_t n_except) {
        if (sorted[n_sorted-1] >= INT_MAX) /* Dont let the addition below overflow */
                return 0;

-        if (close_range(sorted[n_sorted-1] + 1, INT_MAX, 0) < 0)
-                return -errno;
+        if (close_range(sorted[n_sorted-1] + 1, INT_MAX, 0) < 0) {
+                if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+                        return -errno;
+
+                have_close_range = false;
+                return close_all_fds_by_proc(except, n_except);
+        }

        return 0;
 }
@ -1054,6 +1119,7 @@ int path_is_root_at(int dir_fd, const char *path) {

 int fds_are_same_mount(int fd1, int fd2) {
        struct statx sx1 = {}, sx2 = {}; /* explicitly initialize the struct to make msan silent. */
+        int r;

        assert(fd1 >= 0);
        assert(fd2 >= 0);
@ -1064,7 +1130,39 @@ int fds_are_same_mount(int fd1, int fd2) {
        if (statx(fd2, "", AT_EMPTY_PATH, STATX_TYPE|STATX_INO|STATX_MNT_ID, &sx2) < 0)
                return -errno;

-        return statx_inode_same(&sx1, &sx2) && statx_mount_same(&sx1, &sx2);
+        /* First, compare inode. If these are different, the two fds do not refer to the same inode. */
+        if (!statx_inode_same(&sx1, &sx2))
+                return false;
+
+        /* Note, statx() does not provide the mount ID when an old kernel is used. In that case, query it
+         * via path_get_mnt_id_at_fallback() instead (name_to_handle_at(), then /proc/self/fdinfo/), and
+         * propagate any failure of that fallback to the caller. */
+
+        if (!FLAGS_SET(sx1.stx_mask, STATX_MNT_ID)) {
+                int mntid;
+
+                r = path_get_mnt_id_at_fallback(fd1, "", &mntid);
+                if (r < 0)
+                        return r;
+                assert(mntid >= 0);
+
+                sx1.stx_mnt_id = mntid;
+                sx1.stx_mask |= STATX_MNT_ID;
+        }
+
+        if (!FLAGS_SET(sx2.stx_mask, STATX_MNT_ID)) {
+                int mntid;
+
+                r = path_get_mnt_id_at_fallback(fd2, "", &mntid);
+                if (r < 0)
+                        return r;
+                assert(mntid >= 0);
+
+                sx2.stx_mnt_id = mntid;
+                sx2.stx_mask |= STATX_MNT_ID;
+        }
+
+        return statx_mount_same(&sx1, &sx2);
 }

 char* format_proc_fd_path(char buf[static PROC_FD_PATH_MAX], int fd) {
--- a/src/basic/fd-util.h
+++ b/src/basic/fd-util.h
@ -112,6 +112,7 @@ int get_max_fd(void);

 int close_all_fds(const int except[], size_t n_except);
 int close_all_fds_without_malloc(const int except[], size_t n_except);
+int close_all_fds_by_proc(const int except[], size_t n_except);
 int close_all_fds_frugal(const int except[], size_t n_except);

 int pack_fds(int fds[], size_t n);
--- a/src/basic/mountpoint-util.c
+++ b/src/basic/mountpoint-util.c
@ -13,8 +13,10 @@
 #include "log.h"
 #include "mountpoint-util.h"
 #include "nulstr-util.h"
+#include "parse-util.h"
 #include "path-util.h"
 #include "stat-util.h"
+#include "stdio-util.h"
 #include "string-util.h"
 #include "strv.h"

@ -133,6 +135,57 @@ int name_to_handle_at_try_fid(
        return name_to_handle_at_loop(fd, path, ret_handle, ret_mnt_id, flags & ~AT_HANDLE_FID);
 }

+static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *ret_mnt_id) {
+        char path[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
+        _cleanup_close_ int subfd = -EBADF;
+        int r;
+
+        assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0);
+        assert(ret_mnt_id);
+
+        if ((flags & AT_EMPTY_PATH) && isempty(filename))
+                xsprintf(path, "/proc/self/fdinfo/%i", fd);
+        else {
+                subfd = openat(fd, filename, O_CLOEXEC|O_PATH|(flags & AT_SYMLINK_FOLLOW ? 0 : O_NOFOLLOW));
+                if (subfd < 0)
+                        return -errno;
+
+                xsprintf(path, "/proc/self/fdinfo/%i", subfd);
+        }
+
+        _cleanup_free_ char *p = NULL;
+        r = get_proc_field(path, "mnt_id", &p);
+        if (r == -ENOENT)
+                return -EBADF;
+        if (r < 0)
+                return r;
+
+        return safe_atoi(p, ret_mnt_id);
+}
+
+static bool filename_possibly_with_slash_suffix(const char *s) {
+        const char *slash, *copied;
+
+        /* Checks whether the specified string is either a filename, or a filename with a suffix of
+         * slashes. But nothing else.
+         *
+         * this is OK: foo, bar, foo/, bar/, foo//, bar///
+         * this is not OK: "", "/", "/foo", "foo/bar", ".", ".." … */
+
+        slash = strchr(s, '/');
+        if (!slash)
+                return filename_is_valid(s);
+
+        if (slash - s > PATH_MAX) /* We want to allocate on the stack below, hence do a size check first */
+                return false;
+
+        if (slash[strspn(slash, "/")] != 0) /* Check that the suffix consists only of one or more slashes */
+                return false;
+
+        copied = strndupa_safe(s, slash - s);
+        return filename_is_valid(copied);
+}
+
 bool file_handle_equal(const struct file_handle *a, const struct file_handle *b) {
        if (a == b)
                return true;
@ -144,73 +197,191 @@ bool file_handle_equal(const struct file_handle *a, const struct file_handle *b)
        return memcmp_nn(a->f_handle, a->handle_bytes, b->f_handle, b->handle_bytes) == 0;
 }

-int is_mount_point_at(int dir_fd, const char *path, int flags) {
+int is_mount_point_at(int fd, const char *filename, int flags) {
+        bool fd_is_self;
        int r;

-        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+        assert(fd >= 0 || fd == AT_FDCWD);
        assert((flags & ~AT_SYMLINK_FOLLOW) == 0);

-        if (path_equal(path, "/"))
-                return true;
-
-        if (isempty(path)) {
-                if (dir_fd == AT_FDCWD)
-                        path = ".";
+        if (isempty(filename)) {
+                if (fd == AT_FDCWD)
+                        filename = ".";
                else {
+                        /* If the file name is empty we'll see if the specified 'fd' is a mount point.
+                         * That's only supported by statx(), or if the inode specified via 'fd' refers to a
+                         * directory. Otherwise, we'll have to fail (ENOTDIR), because we have no kernel API
+                         * to query the information we need. */
                        flags |= AT_EMPTY_PATH;
-                        path = "";
+                        filename = "";
                }
+
+                fd_is_self = true;
+        } else if (STR_IN_SET(filename, ".", "./"))
+                fd_is_self = true;
+        else {
+                /* Insist that the specified filename is actually a filename, and not a path, i.e. some inode
+                 * further up or down the tree than immediately below the specified directory fd. */
+                if (!filename_possibly_with_slash_suffix(filename))
+                        return -EINVAL;
+
+                fd_is_self = false;
        }

+        /* First we will try statx()'s STATX_ATTR_MOUNT_ROOT attribute, which is our ideal API, available
+         * since kernel 5.8.
+         *
+         * If that fails, our second try is the name_to_handle_at() syscall, which tells us the mount id and
+         * an opaque file "handle". It is not supported everywhere though (kernel compile-time option, not
+         * all file systems are hooked up). If it works the mount id is usually good enough to tell us
+         * whether something is a mount point.
+         *
+         * If that didn't work we will try to read the mount id from /proc/self/fdinfo/<fd>. This is almost
+         * as good as name_to_handle_at(), however, does not return the opaque file handle. The opaque file
+         * handle is pretty useful to detect the root directory, which we should always consider a mount
+         * point. Hence we use this only as fallback.
+         *
+         * Note that traditionally the check is done via fstat()-based st_dev comparisons. However, various
+         * file systems don't guarantee same st_dev across single fs anymore, e.g. unionfs exposes file systems
+         * with a variety of st_dev reported. Also, btrfs subvolumes have different st_dev, even though
+         * they aren't real mounts of their own. */
+
        struct statx sx = {}; /* explicitly initialize the struct to make msan silent. */
-        if (statx(dir_fd, path,
+        if (statx(fd, filename,
                  at_flags_normalize_nofollow(flags) |
                  AT_NO_AUTOMOUNT |            /* don't trigger automounts – mounts are a local concept, hence no need to trigger automounts to determine STATX_ATTR_MOUNT_ROOT */
                  AT_STATX_DONT_SYNC,          /* don't go to the network for this – for similar reasons */
-                  STATX_TYPE|STATX_INO,
+                  STATX_TYPE,
                  &sx) < 0)
                return -errno;

-        r = statx_warn_mount_root(&sx, LOG_DEBUG);
+        if (FLAGS_SET(sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) /* yay! */
+                return FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT);
+
+        _cleanup_free_ struct file_handle *h = NULL, *h_parent = NULL;
+        int mount_id = -1, mount_id_parent = -1;
+        bool nosupp = false;
+
+        r = name_to_handle_at_try_fid(fd, filename, &h, &mount_id, flags);
+        if (r < 0) {
+                if (is_name_to_handle_at_fatal_error(r))
+                        return r;
+                if (!ERRNO_IS_NOT_SUPPORTED(r))
+                        goto fallback_fdinfo;
+
+                /* This file system does not support name_to_handle_at(), hence let's see if the upper fs
+                 * supports it (in which case it is a mount point), otherwise fall back to the fdinfo logic. */
+                nosupp = true;
+        }
+
+        if (fd_is_self)
+                r = name_to_handle_at_try_fid(fd, "..", &h_parent, &mount_id_parent, 0); /* can't work for non-directories 😢 */
+        else
+                r = name_to_handle_at_try_fid(fd, "", &h_parent, &mount_id_parent, AT_EMPTY_PATH);
+        if (r < 0) {
+                if (is_name_to_handle_at_fatal_error(r))
+                        return r;
+                if (!ERRNO_IS_NOT_SUPPORTED(r))
+                        goto fallback_fdinfo;
+                if (nosupp)
+                        /* Both the parent and the directory can't do name_to_handle_at() */
+                        goto fallback_fdinfo;
+
+                /* The parent can't do name_to_handle_at() but the directory we are
+                 * interested in can?  If so, it must be a mount point. */
+                return 1;
+        }
+
+        /* The parent can do name_to_handle_at() but the directory we are interested in can't? If
+         * so, it must be a mount point. */
+        if (nosupp)
+                return 1;
+
+        /* If the file handle for the directory we are interested in and its parent are identical,
+         * we assume this is the root directory, which is a mount point. */
+        if (file_handle_equal(h_parent, h))
+                return 1;
+
+        return mount_id != mount_id_parent;
+
+fallback_fdinfo:
+        r = fd_fdinfo_mnt_id(fd, filename, flags, &mount_id);
        if (r < 0)
                return r;

-        if (FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT))
-                return true;
+        if (fd_is_self)
+                r = fd_fdinfo_mnt_id(fd, "..", 0, &mount_id_parent); /* can't work for non-directories 😢 */
+        else
+                r = fd_fdinfo_mnt_id(fd, "", AT_EMPTY_PATH, &mount_id_parent);
+        if (r < 0)
+                return r;

-        /* When running on chroot environment, the root may not be a mount point, but we unconditionally
-         * return true when the input is "/" in the above, but the shortcut may not work e.g. when the path
-         * is relative. */
-        struct statx sx2 = {}; /* explicitly initialize the struct to make msan silent. */
-        if (statx(AT_FDCWD, "/", AT_STATX_DONT_SYNC, STATX_TYPE|STATX_INO, &sx2) < 0)
+        if (mount_id != mount_id_parent)
+                return 1;
+
+        /* Hmm, so, the mount ids are the same. This leaves one special case though for the root file
+         * system. For that, let's see if the parent directory has the same inode as we are interested
+         * in. */
+
+        struct stat a, b;
+
+        /* yay for fstatat() taking a different set of flags than the other _at() above */
+        if (fstatat(fd, filename, &a, at_flags_normalize_nofollow(flags)) < 0)
                return -errno;

-        return statx_inode_same(&sx, &sx2);
+        if (fd_is_self)
+                r = fstatat(fd, "..", &b, 0);
+        else
+                r = fstatat(fd, "", &b, AT_EMPTY_PATH);
+        if (r < 0)
+                return -errno;
+
+        /* A directory with same device and inode as its parent must be the root directory. Otherwise
+         * not a mount point.
+         *
+         * NB: we avoid inode_same_at() here because it internally attempts name_to_handle_at_try_fid() first,
+         * which is redundant. */
+        return stat_inode_same(&a, &b);
 }

 /* flags can be AT_SYMLINK_FOLLOW or 0 */
 int path_is_mount_point_full(const char *path, const char *root, int flags) {
-        _cleanup_close_ int dir_fd = -EBADF;
-        int r;
+        _cleanup_close_ int dfd = -EBADF;
+        _cleanup_free_ char *fn = NULL;

        assert(path);
        assert((flags & ~AT_SYMLINK_FOLLOW) == 0);

-        if (empty_or_root(root))
-                return is_mount_point_at(AT_FDCWD, path, flags);
+        if (path_equal(path, "/"))
+                return 1;

-        r = chase(path, root,
-                  FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 0 : CHASE_NOFOLLOW,
-                  /* ret_path= */ NULL, &dir_fd);
-        if (r < 0)
+        /* we need to resolve symlinks manually, we can't just rely on is_mount_point_at() to do that for us;
+         * if we have a structure like /bin -> /usr/bin/ and /usr is a mount point, then the parent that we
+         * look at needs to be /usr, not /. */
+        dfd = chase_and_open_parent(path, root,
+                                    CHASE_TRAIL_SLASH|(FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 0 : CHASE_NOFOLLOW),
+                                    &fn);
+        if (dfd < 0)
+                return dfd;
+
+        return is_mount_point_at(dfd, fn, flags);
+}
+
+int path_get_mnt_id_at_fallback(int dir_fd, const char *path, int *ret) {
+        int r;
+
+        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+        assert(ret);
+
+        r = name_to_handle_at_loop(dir_fd, path, NULL, ret, isempty(path) ? AT_EMPTY_PATH : 0);
+        if (r >= 0 || is_name_to_handle_at_fatal_error(r))
                return r;

-        return is_mount_point_at(dir_fd, /* path= */ NULL, flags);
+        return fd_fdinfo_mnt_id(dir_fd, path, isempty(path) ? AT_EMPTY_PATH : 0, ret);
 }

 int path_get_mnt_id_at(int dir_fd, const char *path, int *ret) {
        struct statx sx;
-        int r;

        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
        assert(ret);
@ -224,12 +395,12 @@ int path_get_mnt_id_at(int dir_fd, const char *path, int *ret) {
                  &sx) < 0)
                return -errno;

-        r = statx_warn_mount_id(&sx, LOG_DEBUG);
-        if (r < 0)
-                return r;
+        if (FLAGS_SET(sx.stx_mask, STATX_MNT_ID)) {
+                *ret = sx.stx_mnt_id;
+                return 0;
+        }

-        *ret = sx.stx_mnt_id;
-        return 0;
+        return path_get_mnt_id_at_fallback(dir_fd, path, ret);
 }

 bool fstype_is_network(const char *fstype) {
--- a/src/basic/mountpoint-util.h
+++ b/src/basic/mountpoint-util.h
@ -39,12 +39,13 @@ int name_to_handle_at_try_fid(int fd, const char *path, struct file_handle **ret

 bool file_handle_equal(const struct file_handle *a, const struct file_handle *b);

+int path_get_mnt_id_at_fallback(int dir_fd, const char *path, int *ret);
 int path_get_mnt_id_at(int dir_fd, const char *path, int *ret);
 static inline int path_get_mnt_id(const char *path, int *ret) {
        return path_get_mnt_id_at(AT_FDCWD, path, ret);
 }

-int is_mount_point_at(int dir_fd, const char *path, int flags);
+int is_mount_point_at(int fd, const char *filename, int flags);
 int path_is_mount_point_full(const char *path, const char *root, int flags);
 static inline int path_is_mount_point(const char *path) {
        return path_is_mount_point_full(path, NULL, 0);
--- a/src/basic/process-util.c
+++ b/src/basic/process-util.c
@ -2119,14 +2119,15 @@ int posix_spawn_wrapper(
        if (ERRNO_IS_NOT_SUPPORTED(r) && FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP) && cg_is_threaded(cgroup) > 0)
                return -EUCLEAN; /* clone3() could also return EOPNOTSUPP if the target cgroup is in threaded mode,
                                    turn that into something recognizable */
-        if ((ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r)) &&
+        if ((ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || r == E2BIG) &&
            FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP)) {
                /* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but
                 * need to disable POSIX_SPAWN_SETCGROUP, which is what redirects to clone3().
-                 * CLONE_INTO_CGROUP definitely won't work, hence remember the fact so that we don't
-                 * retry every time.
-                 * Note, CLONE_INTO_CGROUP is supported since kernel v5.7, but some architectures still
-                 * do not support clone3(). Hence, we need to keep the fallback logic for a while. */
+                 * Note that we might get E2BIG here since some kernels (e.g. 5.4) support clone3()
+                 * but not CLONE_INTO_CGROUP. */
+
+                /* CLONE_INTO_CGROUP definitely won't work, hence remember the fact so that we don't
+                 * retry every time. */
                have_clone_into_cgroup = false;

                flags &= ~POSIX_SPAWN_SETCGROUP;
--- a/src/basic/random-util.c
+++ b/src/basic/random-util.c
@ -68,6 +68,8 @@ static void fallback_random_bytes(void *p, size_t n) {
 }

 void random_bytes(void *p, size_t n) {
+        static bool have_grndinsecure = true;
+
        assert(p || n == 0);

        if (n == 0)
@ -76,9 +78,15 @@ void random_bytes(void *p, size_t n) {
        for (;;) {
                ssize_t l;

-                l = getrandom(p, n, GRND_INSECURE);
+                l = getrandom(p, n, have_grndinsecure ? GRND_INSECURE : GRND_NONBLOCK);
+                if (l < 0 && errno == EINVAL && have_grndinsecure) {
+                        /* No GRND_INSECURE; fallback to GRND_NONBLOCK. */
+                        have_grndinsecure = false;
+                        continue;
+                }
                if (l <= 0)
-                        break; /* Unexpected error. Give up and fallback to /dev/urandom. */
+                        break; /* Will block (with GRND_NONBLOCK), or unexpected error. Give up and fallback
+                                  to /dev/urandom. */

                if ((size_t) l == n)
                        return; /* Done reading, success. */
--- a/src/basic/stat-util.c
+++ b/src/basic/stat-util.c
@ -463,13 +463,8 @@ bool statx_inode_same(const struct statx *a, const struct statx *b) {

        /* Same as stat_inode_same() but for struct statx */

-        if (!statx_is_set(a) || !statx_is_set(b))
-                return false;
-
-        assert(FLAGS_SET(a->stx_mask, STATX_TYPE|STATX_INO));
-        assert(FLAGS_SET(b->stx_mask, STATX_TYPE|STATX_INO));
-
-        return
+        return statx_is_set(a) && statx_is_set(b) &&
+                FLAGS_SET(a->stx_mask, STATX_TYPE|STATX_INO) && FLAGS_SET(b->stx_mask, STATX_TYPE|STATX_INO) &&
                ((a->stx_mode ^ b->stx_mode) & S_IFMT) == 0 &&
                a->stx_dev_major == b->stx_dev_major &&
                a->stx_dev_minor == b->stx_dev_minor &&
@ -480,10 +475,13 @@ bool statx_mount_same(const struct statx *a, const struct statx *b) {
        if (!statx_is_set(a) || !statx_is_set(b))
                return false;

-        assert(FLAGS_SET(a->stx_mask, STATX_MNT_ID));
-        assert(FLAGS_SET(b->stx_mask, STATX_MNT_ID));
+        /* if we have the mount ID, that's all we need */
+        if (FLAGS_SET(a->stx_mask, STATX_MNT_ID) && FLAGS_SET(b->stx_mask, STATX_MNT_ID))
+                return a->stx_mnt_id == b->stx_mnt_id;

-        return a->stx_mnt_id == b->stx_mnt_id;
+        /* Otherwise, major/minor of backing device must match */
+        return a->stx_dev_major == b->stx_dev_major &&
+                a->stx_dev_minor == b->stx_dev_minor;
 }

 int xstatfsat(int dir_fd, const char *path, struct statfs *ret) {
@ -574,25 +572,3 @@ mode_t inode_type_from_string(const char *s) {

        return MODE_INVALID;
 }
-
-int statx_warn_mount_root(const struct statx *sx, int log_level) {
-        assert(sx);
-
-        /* The STATX_ATTR_MOUNT_ROOT flag is supported since kernel v5.8. */
-        if (!FLAGS_SET(sx->stx_attributes_mask, STATX_ATTR_MOUNT_ROOT))
-                return log_full_errno(log_level, SYNTHETIC_ERRNO(ENOSYS),
-                                      "statx() did not set STATX_ATTR_MOUNT_ROOT, running on an old kernel?");
-
-        return 0;
-}
-
-int statx_warn_mount_id(const struct statx *sx, int log_level) {
-        assert(sx);
-
-        /* The STATX_MNT_ID flag is supported since kernel v5.10. */
-        if (!FLAGS_SET(sx->stx_mask, STATX_MNT_ID))
-                return log_full_errno(log_level, SYNTHETIC_ERRNO(ENOSYS),
-                                      "statx() does not support STATX_MNT_ID, running on an old kernel?");
-
-        return 0;
-}
--- a/src/basic/stat-util.h
+++ b/src/basic/stat-util.h
@ -117,6 +117,3 @@ static inline bool inode_type_can_hardlink(mode_t m) {
         * type). */
        return IN_SET(m & S_IFMT, S_IFSOCK, S_IFLNK, S_IFREG, S_IFBLK, S_IFCHR, S_IFIFO);
 }
-
-int statx_warn_mount_root(const struct statx *sx, int log_level);
-int statx_warn_mount_id(const struct statx *sx, int log_level);
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@ -3467,7 +3467,7 @@ static int is_extension_overlay(const char *path, int fd) {
                fd = dfd;
        }

-        r = is_mount_point_at(fd, /* path= */ NULL, /* flags= */ 0);
+        r = is_mount_point_at(fd, /* filename= */ NULL, /* flags= */ 0);
        if (r < 0)
                return log_debug_errno(r, "Unable to determine whether '%s' is a mount point: %m", path);
        if (r == 0)
--- a/src/include/override/linux/bpf_insn.h
+++ b/src/include/override/linux/bpf_insn.h
@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */
 #pragma once

-#include_next <linux/bpf_insn.h>     /* IWYU pragma: export */
+#include_next <linux/bpf.h>     /* IWYU pragma: export */

 /* defined in linux/filter.h */
 /* Unconditional jumps, goto pc + off16 */
--- a/src/mount/mount-tool.c
+++ b/src/mount/mount-tool.c
@ -1113,7 +1113,7 @@ static int action_umount(sd_bus *bus, int argc, char **argv) {
                if (fstat(fd, &st) < 0)
                        return log_error_errno(errno, "Can't stat '%s' (from %s): %m", p, argv[i]);

-                r = is_mount_point_at(fd, /* path= */ NULL, /* flags= */ 0);
+                r = is_mount_point_at(fd, /* filename= */ NULL, /* flags= */ 0);
                fd = safe_close(fd); /* before continuing make sure the dir is not keeping anything busy */
                if (r > 0)
                        RET_GATHER(ret, stop_mounts(bus, p));
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@ -147,7 +147,7 @@ int mount_cgroups(const char *dest, bool accept_existing) {
        if (r < 0)
                return log_error_errno(r, "Failed to chase %s/sys/fs/cgroup: %m", strempty(dest));

-        r = is_mount_point_at(fd, /* path= */ NULL, /* flags= */ 0);
+        r = is_mount_point_at(fd, /* filename= */ NULL, /* flags= */ 0);
        if (r < 0)
                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
        if (r > 0) {
--- a/src/shared/find-esp.c
+++ b/src/shared/find-esp.c
@ -268,7 +268,7 @@ static int verify_fsroot_dir(
        bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING),
                unprivileged_mode = FLAGS_SET(flags, VERIFY_ESP_UNPRIVILEGED_MODE);
        _cleanup_free_ char *f = NULL;
-        struct statx sx;
+        struct statx sxa, sxb;
        int r;

        /* Checks if the specified directory is at the root of its file system, and returns device
@ -287,30 +287,49 @@ static int verify_fsroot_dir(

        if (statx(dir_fd, strempty(f),
                  AT_SYMLINK_NOFOLLOW|(isempty(f) ? AT_EMPTY_PATH : 0),
-                  STATX_TYPE|STATX_INO|STATX_MNT_ID, &sx) < 0)
+                  STATX_TYPE|STATX_INO|STATX_MNT_ID, &sxa) < 0)
                return log_full_errno((searching && errno == ENOENT) ||
                                      (unprivileged_mode && ERRNO_IS_PRIVILEGE(errno)) ? LOG_DEBUG : LOG_ERR, errno,
                                      "Failed to determine block device node of \"%s\": %m", path);

-        if (!S_ISDIR(sx.stx_mode))
+        if (!S_ISDIR(sxa.stx_mode))
                return log_error_errno(SYNTHETIC_ERRNO(ENOTDIR), "Path \"%s\" is not a directory", path);

-        r = statx_warn_mount_root(&sx, LOG_ERR);
-        if (r < 0)
-                return r;
+        if (FLAGS_SET(sxa.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) {

-        if (!FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT))
+                /* If we have STATX_ATTR_MOUNT_ROOT, we are happy: that's all we need. We operate under the
+                 * assumption that the top of a mount point is also the top of the file system. (Which of
+                 * course is strictly speaking not always true...) */
+
+                if (!FLAGS_SET(sxa.stx_attributes, STATX_ATTR_MOUNT_ROOT))
+                        return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+                                              SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+                                              "Directory \"%s\" is not the root of the file system.", path);
+
+                goto success;
+        }
+
+        /* Now let's look at the parent */
+        if (statx(dir_fd, "", AT_EMPTY_PATH, STATX_TYPE|STATX_INO|STATX_MNT_ID, &sxb) < 0)
+                return log_full_errno(unprivileged_mode && ERRNO_IS_PRIVILEGE(errno) ? LOG_DEBUG : LOG_ERR, errno,
+                                      "Failed to determine block device node of parent of \"%s\": %m", path);
+
+        if (statx_inode_same(&sxa, &sxb)) /* for the root dir inode nr for both inodes will be the same */
+                goto success;
+
+        if (statx_mount_same(&sxa, &sxb))
                return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
                                      SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
                                      "Directory \"%s\" is not the root of the file system.", path);

+success:
        if (!ret_dev)
                return 0;

-        if (sx.stx_dev_major == 0) /* Hmm, maybe a btrfs device, and the caller asked for the backing device? Then let's try to get it. */
+        if (sxa.stx_dev_major == 0) /* Hmm, maybe a btrfs device, and the caller asked for the backing device? Then let's try to get it. */
                return btrfs_get_block_device_at(dir_fd, strempty(f), ret_dev);

-        *ret_dev = makedev(sx.stx_dev_major, sx.stx_dev_minor);
+        *ret_dev = makedev(sxa.stx_dev_major, sxa.stx_dev_minor);
        return 0;
 }

--- a/src/shared/mount-setup.c
+++ b/src/shared/mount-setup.c
@ -49,20 +49,19 @@ static int cgroupfs_mount_options(int priority, const char *type, char **ret) {
        assert(streq(type, "cgroup2"));
        assert(ret);

-        /* memory_hugetlb_accounting mount option is since kernel v6.7 (8cba9576df601c384abd334a503c3f6e1e29eefb). */
-
-        r = mount_option_supported("cgroup2", "memory_hugetlb_accounting", /* value= */ NULL);
-        if (r <= 0) {
+        _cleanup_free_ char *opts = NULL;
+        FOREACH_STRING(o, "memory_recursiveprot", "memory_hugetlb_accounting") {
+                r = mount_option_supported("cgroup2", o, /* value= */ NULL);
                if (r < 0)
-                        log_full_errno(priority, r, "Failed to determine whether cgroupfs supports 'memory_hugetlb_accounting' mount option, assuming not: %m");
-                else
-                        log_debug("'memory_hugetlb_accounting' not supported by cgroupfs, not using mount option.");
-
-                *ret = NULL;
-                return 0;
+                        log_full_errno(priority, r, "Failed to determine whether cgroupfs supports '%s' mount option, assuming not: %m", o);
+                else if (r == 0)
+                        log_debug("'%s' not supported by cgroupfs, not using mount option.", o);
+                else if (!strextend_with_separator(&opts, ",", o))
+                        return log_oom_full(priority);
        }

-        return strdup_to(ret, "memory_hugetlb_accounting");
+        *ret = TAKE_PTR(opts);
+        return 0;
 }

 int mount_cgroupfs(const char *path) {
@ -82,7 +81,7 @@ int mount_cgroupfs(const char *path) {
                return r;

        /* These options shall be kept in sync with those in mount_table below. */
-        if (!strprepend_with_separator(&opts, ",", "nsdelegate,memory_recursiveprot"))
+        if (!strprepend_with_separator(&opts, ",", "nsdelegate"))
                return log_oom();

        return mount_nofollow_verbose(LOG_ERR, "cgroup2", path, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
@ -200,7 +199,7 @@ static const MountPoint mount_table[] = {
                .what = "cgroup2",
                .where = "/sys/fs/cgroup",
                .type = "cgroup2",
-                .options = "nsdelegate,memory_recursiveprot",
+                .options = "nsdelegate",
                .options_fn = cgroupfs_mount_options,
                .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
                .mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE,
--- a/src/test/test-fd-util.c
+++ b/src/test/test-fd-util.c
@ -343,13 +343,19 @@ TEST(close_all_fds) {
                _exit(EXIT_SUCCESS);
        }

-        r = ASSERT_OK(pidref_safe_fork("(caf-nomalloc)", flags, NULL));
+        ASSERT_OK(r = pidref_safe_fork("(caf-nomalloc)", flags, NULL));
        if (r == 0) {
                test_close_all_fds_inner(close_all_fds_without_malloc);
                _exit(EXIT_SUCCESS);
        }

-        r = ASSERT_OK(pidref_safe_fork("(caf-frugal)", flags, NULL));
+        ASSERT_OK(r = pidref_safe_fork("(caf-proc)", flags, NULL));
+        if (r == 0) {
+                test_close_all_fds_inner(close_all_fds_by_proc);
+                _exit(EXIT_SUCCESS);
+        }
+
+        ASSERT_OK(r = pidref_safe_fork("(caf-frugal)", flags, NULL));
        if (r == 0) {
                test_close_all_fds_inner(close_all_fds_frugal);
                _exit(EXIT_SUCCESS);
--- a/src/test/test-mountpoint-util.c
+++ b/src/test/test-mountpoint-util.c
@ -283,16 +283,18 @@ TEST(is_mount_point_at) {
        fd = open("/", O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY);
        assert_se(fd >= 0);

-        ASSERT_OK_POSITIVE(is_mount_point_at(fd, "/", /* flags= */ 0));
-        ASSERT_OK_POSITIVE(is_mount_point_at(fd, "..", /* flags= */ 0));
-        ASSERT_OK_POSITIVE(is_mount_point_at(fd, "../", /* flags= */ 0));
-        r = ASSERT_OK(proc_mounted());
-        ASSERT_OK_EQ(is_mount_point_at(fd, "/proc", /* flags= */ 0), r);
-        ASSERT_OK_EQ(is_mount_point_at(fd, "/proc/", /* flags= */ 0), r);
-        ASSERT_OK_EQ(is_mount_point_at(fd, "proc", /* flags= */ 0), r);
-        ASSERT_OK_EQ(is_mount_point_at(fd, "proc/", /* flags= */ 0), r);
-        ASSERT_OK_ZERO(is_mount_point_at(fd, "usr/lib", /* flags= */ 0));
-        ASSERT_OK_ZERO(is_mount_point_at(fd, "usr/lib", /* flags= */ 0));
+        /* Not allowed, since "/" is a path, not a plain filename */
+        assert_se(is_mount_point_at(fd, "/", 0) == -EINVAL);
+        assert_se(is_mount_point_at(fd, "..", 0) == -EINVAL);
+        assert_se(is_mount_point_at(fd, "../", 0) == -EINVAL);
+        assert_se(is_mount_point_at(fd, "/proc", 0) == -EINVAL);
+        assert_se(is_mount_point_at(fd, "/proc/", 0) == -EINVAL);
+        assert_se(is_mount_point_at(fd, "proc/sys", 0) == -EINVAL);
+        assert_se(is_mount_point_at(fd, "proc/sys/", 0) == -EINVAL);
+
+        /* This one definitely is a mount point */
+        assert_se(is_mount_point_at(fd, "proc", 0) > 0);
+        assert_se(is_mount_point_at(fd, "proc/", 0) > 0);

        safe_close(fd);
        fd = open("/tmp", O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY);
--- a/src/tmpfiles/tmpfiles.c
+++ b/src/tmpfiles/tmpfiles.c
@ -559,7 +559,7 @@ static int opendir_and_stat(
                bool *ret_mountpoint) {

        _cleanup_closedir_ DIR *d = NULL;
-        struct statx sx;
+        struct statx sx1;
        int r;

        assert(path);
@ -586,16 +586,21 @@ static int opendir_and_stat(
                return 0;
        }

-        if (statx(dirfd(d), "", AT_EMPTY_PATH, STATX_MODE|STATX_INO|STATX_ATIME|STATX_MTIME, &sx) < 0)
+        if (statx(dirfd(d), "", AT_EMPTY_PATH, STATX_MODE|STATX_INO|STATX_ATIME|STATX_MTIME, &sx1) < 0)
                return log_error_errno(errno, "statx(%s) failed: %m", path);

-        r = statx_warn_mount_root(&sx, LOG_ERR);
-        if (r < 0)
-                return r;
+        if (FLAGS_SET(sx1.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT))
+                *ret_mountpoint = FLAGS_SET(sx1.stx_attributes, STATX_ATTR_MOUNT_ROOT);
+        else {
+                struct statx sx2;
+                if (statx(dirfd(d), "..", 0, STATX_INO, &sx2) < 0)
+                        return log_error_errno(errno, "statx(%s/..) failed: %m", path);
+
+                *ret_mountpoint = !statx_mount_same(&sx1, &sx2);
+        }

-        *ret_mountpoint = FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT);
        *ret = TAKE_PTR(d);
-        *ret_sx = sx;
+        *ret_sx = sx1;
        return 1;
 }

@ -708,13 +713,35 @@ static int dir_cleanup(
                        continue;
                }

-                r = statx_warn_mount_root(&sx, LOG_ERR);
-                if (r < 0)
-                        return r;
+                if (FLAGS_SET(sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) {
+                        /* Yay, we have the mount point API, use it */
+                        if (FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT)) {
+                                log_debug("Ignoring \"%s/%s\": different mount points.", p, de->d_name);
+                                continue;
+                        }
+                } else {
+                        /* statx() worked, but STATX_ATTR_MOUNT_ROOT is not set in stx_attributes_mask, i.e.
+                         * the flag is not supported here: fall back to traditional stx_dev checking. */
+                        if (sx.stx_dev_major != rootdev_major ||
+                            sx.stx_dev_minor != rootdev_minor) {
+                                log_debug("Ignoring \"%s/%s\": different filesystem.", p, de->d_name);
+                                continue;
+                        }

-                if (FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT)) {
-                        log_debug("Ignoring \"%s/%s\": different mount points.", p, de->d_name);
-                        continue;
+                        /* Try to detect bind mounts of the same filesystem instance; they do not differ in
+                         * device major/minors. This type of query is not supported on all kernels or
+                         * filesystem types though. */
+                        if (S_ISDIR(sx.stx_mode)) {
+                                int q;
+
+                                q = is_mount_point_at(dirfd(d), de->d_name, 0);
+                                if (q < 0)
+                                        log_debug_errno(q, "Failed to determine whether \"%s/%s\" is a mount point, ignoring: %m", p, de->d_name);
+                                else if (q > 0) {
+                                        log_debug("Ignoring \"%s/%s\": different mount of the same filesystem.", p, de->d_name);
+                                        continue;
+                                }
+                        }
                }

                atime_nsec = FLAGS_SET(sx.stx_mask, STATX_ATIME) ? statx_timestamp_load_nsec(&sx.stx_atime) : 0;
--- a/src/validatefs/validatefs.c
+++ b/src/validatefs/validatefs.c
@ -426,7 +426,7 @@ static int run(int argc, char *argv[]) {
        if (target_fd < 0)
                return log_error_errno(target_fd, "Failed to open directory '%s': %m", arg_target);

-        r = is_mount_point_at(target_fd, /* path= */ NULL, /* flags= */ 0);
+        r = is_mount_point_at(target_fd, /* filename= */ NULL, /* flags= */ 0);
        if (r < 0)
                return log_error_errno(r, "Failed to determine whether '%s' is a mount point: %m", resolved);
        if (!r)